]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/ucol_bld.cpp
ICU-491.11.1.tar.gz
[apple/icu.git] / icuSources / i18n / ucol_bld.cpp
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2001-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucol_bld.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created 02/22/2001
14 * created by: Vladimir Weinstein
15 *
16 * This module builds a collator based on the rule set.
17 *
18 */
19
20 #include "unicode/utypes.h"
21
22 #if !UCONFIG_NO_COLLATION
23
24 #include "unicode/ucoleitr.h"
25 #include "unicode/udata.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uniset.h"
28 #include "unicode/uscript.h"
29 #include "unicode/ustring.h"
30 #include "unicode/utf16.h"
31 #include "normalizer2impl.h"
32 #include "ucol_bld.h"
33 #include "ucol_elm.h"
34 #include "ucol_cnt.h"
35 #include "ucln_in.h"
36 #include "umutex.h"
37 #include "cmemory.h"
38 #include "cstring.h"
39
40 static const InverseUCATableHeader* _staticInvUCA = NULL;
41 static UDataMemory* invUCA_DATA_MEM = NULL;
42
43 U_CDECL_BEGIN
44 static UBool U_CALLCONV
45 isAcceptableInvUCA(void * /*context*/,
46 const char * /*type*/, const char * /*name*/,
47 const UDataInfo *pInfo)
48 {
49 /* context, type & name are intentionally not used */
50 if( pInfo->size>=20 &&
51 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
52 pInfo->charsetFamily==U_CHARSET_FAMILY &&
53 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */
54 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
55 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
56 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
57 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
58 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
59 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
60 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
61 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
62 )
63 {
64 UVersionInfo UCDVersion;
65 u_getUnicodeVersion(UCDVersion);
66 return (pInfo->dataVersion[0]==UCDVersion[0] &&
67 pInfo->dataVersion[1]==UCDVersion[1]);
68 //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
69 //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
70 //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
71 } else {
72 return FALSE;
73 }
74 }
75 U_CDECL_END
76
77 /*
78 * Takes two CEs (lead and continuation) and
79 * compares them as CEs should be compared:
80 * primary vs. primary, secondary vs. secondary
81 * tertiary vs. tertiary
82 */
83 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
84 uint32_t s1 = source0, s2, t1 = target0, t2;
85 if(isContinuation(source1)) {
86 s2 = source1;
87 } else {
88 s2 = 0;
89 }
90 if(isContinuation(target1)) {
91 t2 = target1;
92 } else {
93 t2 = 0;
94 }
95
96 uint32_t s = 0, t = 0;
97 if(s1 == t1 && s2 == t2) {
98 return 0;
99 }
100 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
101 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
102 if(s < t) {
103 return -1;
104 } else if(s > t) {
105 return 1;
106 } else {
107 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
108 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
109 if(s < t) {
110 return -1;
111 } else if(s > t) {
112 return 1;
113 } else {
114 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
115 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
116 if(s < t) {
117 return -1;
118 } else {
119 return 1;
120 }
121 }
122 }
123 }
124
125 static
126 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
127 uint32_t bottom = 0, top = src->invUCA->tableSize;
128 uint32_t i = 0;
129 uint32_t first = 0, second = 0;
130 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
131 int32_t res = 0;
132
133 while(bottom < top-1) {
134 i = (top+bottom)/2;
135 first = *(CETable+3*i);
136 second = *(CETable+3*i+1);
137 res = compareCEs(first, second, CE, SecondCE);
138 if(res > 0) {
139 top = i;
140 } else if(res < 0) {
141 bottom = i;
142 } else {
143 break;
144 }
145 }
146
147 /* weiv: */
148 /* in searching for elements, I have removed the failure */
149 /* The reason for this is that the builder does not rely */
150 /* on search mechanism telling it that it didn't find an */
151 /* element. However, indirect positioning relies on being */
152 /* able to find the elements around any CE, even if it is */
153 /* not defined in the UCA. */
154 return i;
155 /*
156 if((first == CE && second == SecondCE)) {
157 return i;
158 } else {
159 return -1;
160 }
161 */
162 }
163
164 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
165 0xFFFF0000,
166 0xFFFFFF00,
167 0xFFFFFFFF
168 };
169
170 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
171 uint32_t CE, uint32_t contCE,
172 uint32_t *nextCE, uint32_t *nextContCE,
173 uint32_t strength)
174 {
175 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
176 int32_t iCE;
177
178 iCE = ucol_inv_findCE(src, CE, contCE);
179
180 if(iCE<0) {
181 *nextCE = UCOL_NOT_FOUND;
182 return -1;
183 }
184
185 CE &= strengthMask[strength];
186 contCE &= strengthMask[strength];
187
188 *nextCE = CE;
189 *nextContCE = contCE;
190
191 while((*nextCE & strengthMask[strength]) == CE
192 && (*nextContCE & strengthMask[strength]) == contCE)
193 {
194 *nextCE = (*(CETable+3*(++iCE)));
195 *nextContCE = (*(CETable+3*(iCE)+1));
196 }
197
198 return iCE;
199 }
200
201 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
202 uint32_t CE, uint32_t contCE,
203 uint32_t *prevCE, uint32_t *prevContCE,
204 uint32_t strength)
205 {
206 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
207 int32_t iCE;
208
209 iCE = ucol_inv_findCE(src, CE, contCE);
210
211 if(iCE<0) {
212 *prevCE = UCOL_NOT_FOUND;
213 return -1;
214 }
215
216 CE &= strengthMask[strength];
217 contCE &= strengthMask[strength];
218
219 *prevCE = CE;
220 *prevContCE = contCE;
221
222 while((*prevCE & strengthMask[strength]) == CE
223 && (*prevContCE & strengthMask[strength])== contCE
224 && iCE > 0) /* this condition should prevent falling off the edge of the world */
225 {
226 /* here, we end up in a singularity - zero */
227 *prevCE = (*(CETable+3*(--iCE)));
228 *prevContCE = (*(CETable+3*(iCE)+1));
229 }
230
231 return iCE;
232 }
233
234 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
235 uint32_t prevCE, uint32_t prevContCE)
236 {
237 if(prevCE == CE && prevContCE == contCE) {
238 return UCOL_IDENTICAL;
239 }
240 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
241 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
242 {
243 return UCOL_PRIMARY;
244 }
245 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
246 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
247 {
248 return UCOL_SECONDARY;
249 }
250 return UCOL_TERTIARY;
251 }
252
253
254 /*static
255 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
256
257 uint32_t CE = lh->baseCE;
258 uint32_t SecondCE = lh->baseContCE;
259
260 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
261 uint32_t previousCE, previousContCE;
262 int32_t iCE;
263
264 iCE = ucol_inv_findCE(src, CE, SecondCE);
265
266 if(iCE<0) {
267 return -1;
268 }
269
270 CE &= strengthMask[strength];
271 SecondCE &= strengthMask[strength];
272
273 previousCE = CE;
274 previousContCE = SecondCE;
275
276 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) {
277 previousCE = (*(CETable+3*(--iCE)));
278 previousContCE = (*(CETable+3*(iCE)+1));
279 }
280 lh->previousCE = previousCE;
281 lh->previousContCE = previousContCE;
282
283 return iCE;
284 }*/
285
286 static
287 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
288 uint32_t CE = lh->baseCE;
289 uint32_t SecondCE = lh->baseContCE;
290
291 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
292 uint32_t nextCE, nextContCE;
293 int32_t iCE;
294
295 iCE = ucol_inv_findCE(src, CE, SecondCE);
296
297 if(iCE<0) {
298 return -1;
299 }
300
301 CE &= strengthMask[strength];
302 SecondCE &= strengthMask[strength];
303
304 nextCE = CE;
305 nextContCE = SecondCE;
306
307 while((nextCE & strengthMask[strength]) == CE
308 && (nextContCE & strengthMask[strength]) == SecondCE)
309 {
310 nextCE = (*(CETable+3*(++iCE)));
311 nextContCE = (*(CETable+3*(iCE)+1));
312 }
313
314 lh->nextCE = nextCE;
315 lh->nextContCE = nextContCE;
316
317 return iCE;
318 }
319
320 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
321 /* reset all the gaps */
322 int32_t i = 0;
323 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
324 uint32_t st = 0;
325 uint32_t t1, t2;
326 int32_t pos;
327
328 UColToken *tok = lh->first;
329 uint32_t tokStrength = tok->strength;
330
331 for(i = 0; i<3; i++) {
332 lh->gapsHi[3*i] = 0;
333 lh->gapsHi[3*i+1] = 0;
334 lh->gapsHi[3*i+2] = 0;
335 lh->gapsLo[3*i] = 0;
336 lh->gapsLo[3*i+1] = 0;
337 lh->gapsLo[3*i+2] = 0;
338 lh->numStr[i] = 0;
339 lh->fStrToken[i] = NULL;
340 lh->lStrToken[i] = NULL;
341 lh->pos[i] = -1;
342 }
343
344 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
345
346 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
347 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
348 lh->pos[0] = 0;
349 t1 = lh->baseCE;
350 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
351 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
352 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
353 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
354 uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
355 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
356
357 t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
358 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
359
360 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
361 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
362 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
363 } else if(lh->indirect == TRUE && lh->nextCE != 0) {
364 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
365 lh->pos[0] = 0;
366 t1 = lh->baseCE;
367 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
368 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
369 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
370 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
371 t1 = lh->nextCE;
372 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
373 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
374 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
375 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
376 } else {
377 for(;;) {
378 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
379 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
380 lh->fStrToken[tokStrength] = tok;
381 } else { /* The CE must be implicit, since it's not in the table */
382 /* Error */
383 *status = U_INTERNAL_PROGRAM_ERROR;
384 }
385 }
386
387 while(tok != NULL && tok->strength >= tokStrength) {
388 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
389 lh->lStrToken[tokStrength] = tok;
390 }
391 tok = tok->next;
392 }
393 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
394 /* check if previous interval is the same and merge the intervals if it is so */
395 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
396 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
397 lh->fStrToken[tokStrength+1] = NULL;
398 lh->lStrToken[tokStrength+1] = NULL;
399 lh->pos[tokStrength+1] = -1;
400 }
401 }
402 if(tok != NULL) {
403 tokStrength = tok->strength;
404 } else {
405 break;
406 }
407 }
408 for(st = 0; st < 3; st++) {
409 if((pos = lh->pos[st]) >= 0) {
410 t1 = *(CETable+3*(pos));
411 t2 = *(CETable+3*(pos)+1);
412 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
413 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
414 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
415 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
416 //pos--;
417 //t1 = *(CETable+3*(pos));
418 //t2 = *(CETable+3*(pos)+1);
419 t1 = lh->baseCE;
420 t2 = lh->baseContCE;
421 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
422 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
423 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
424 }
425 }
426 }
427 }
428
429
/*
 * ucol_countBytes(value, noOfBytes)
 *
 * Stores in noOfBytes the number of bytes of the 32-bit `value`, counted
 * from the most significant byte down to and including the lowest non-zero
 * byte (0 for value == 0, 1 for 0x05000000, 4 for 0x00000001).
 *
 * `value` is evaluated several times, so it must not have side effects.
 * The body is wrapped in do { } while(0) so that the macro behaves as one
 * statement (safe in an unbraced if/else), and the helper variable has a
 * name that will not capture an argument that happens to be called `mask`.
 */
#define ucol_countBytes(value, noOfBytes)                     \
    do {                                                      \
        uint32_t ucolCountBytesMask_ = 0xFFFFFFFF;            \
        (noOfBytes) = 0;                                      \
        while(ucolCountBytesMask_ != 0) {                     \
            if(((value) & ucolCountBytesMask_) != 0) {        \
                (noOfBytes)++;                                \
            }                                                 \
            ucolCountBytesMask_ >>= 8;                        \
        }                                                     \
    } while(0)
441
442 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
443 if(U_SUCCESS(*status)) {
444 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
445 }
446 return g->current;
447 }
448
449 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
450 /* TODO: rename to enum names */
451 uint32_t high, low, count=1;
452 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
453
454 if(strength == UCOL_SECONDARY) {
455 low = UCOL_COMMON_TOP2<<24;
456 high = 0xFFFFFFFF;
457 count = 0xFF - UCOL_COMMON_TOP2;
458 } else {
459 low = UCOL_BYTE_COMMON << 24; //0x05000000;
460 high = 0x40000000;
461 count = 0x40 - UCOL_BYTE_COMMON;
462 }
463
464 if(tok->next != NULL && tok->next->strength == strength) {
465 count = tok->next->toInsert;
466 }
467
468 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
469 g->current = UCOL_BYTE_COMMON<<24;
470
471 if(g->noOfRanges == 0) {
472 *status = U_INTERNAL_PROGRAM_ERROR;
473 }
474 return g->current;
475 }
476
477 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
478 uint32_t strength = tok->strength;
479 uint32_t low = lows[fStrength*3+strength];
480 uint32_t high = highs[fStrength*3+strength];
481 uint32_t maxByte = 0;
482 if(strength == UCOL_TERTIARY) {
483 maxByte = 0x3F;
484 } else if(strength == UCOL_PRIMARY) {
485 maxByte = 0xFE;
486 } else {
487 maxByte = 0xFF;
488 }
489
490 uint32_t count = tok->toInsert;
491
492 if(low >= high && strength > UCOL_PRIMARY) {
493 int32_t s = strength;
494 for(;;) {
495 s--;
496 if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
497 if(strength == UCOL_SECONDARY) {
498 if (low < UCOL_COMMON_TOP2<<24 ) {
499 // Override if low range is less than UCOL_COMMON_TOP2.
500 low = UCOL_COMMON_TOP2<<24;
501 }
502 high = 0xFFFFFFFF;
503 } else {
504 // Override if low range is less than UCOL_COMMON_BOT3.
505 if ( low < UCOL_COMMON_BOT3<<24 ) {
506 low = UCOL_COMMON_BOT3<<24;
507 }
508 high = 0x40000000;
509 }
510 break;
511 }
512 if(s<0) {
513 *status = U_INTERNAL_PROGRAM_ERROR;
514 return 0;
515 }
516 }
517 }
518
519 if(low < 0x02000000) {
520 // We must not use CE weight byte 02, so we set it as the minimum lower bound.
521 // See http://site.icu-project.org/design/collation/bytes
522 low = 0x02000000;
523 }
524
525 if(strength == UCOL_SECONDARY) { /* similar as simple */
526 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
527 low = UCOL_COMMON_TOP2<<24;
528 }
529 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
530 high = UCOL_COMMON_TOP2<<24;
531 }
532 if(low < (UCOL_COMMON_BOT2<<24)) {
533 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
534 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
535 //g->current = UCOL_COMMON_BOT2<<24;
536 return g->current;
537 }
538 }
539
540 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
541 if(g->noOfRanges == 0) {
542 *status = U_INTERNAL_PROGRAM_ERROR;
543 }
544 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
545 return g->current;
546 }
547
548 static
549 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
550 uint32_t i = 0;
551 UChar c;
552
553 if(U_FAILURE(*status)) {
554 return 0;
555 }
556
557 if(sourceLen > resLen) {
558 *status = U_MEMORY_ALLOCATION_ERROR;
559 return 0;
560 }
561
562 for(i = 0; i < sourceLen; i++) {
563 c = source[i];
564 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
565 switch(c - 0x3000) {
566 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
567 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
568 c++;
569 break;
570 case 0xF5:
571 c = 0x30AB;
572 break;
573 case 0xF6:
574 c = 0x30B1;
575 break;
576 }
577 }
578 resBuf[i] = c;
579 }
580 return sourceLen;
581 }
582
583 static
584 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
585 uint32_t i = 0;
586 UChar c;
587
588 if(U_FAILURE(*status)) {
589 return 0;
590 }
591
592 if(sourceLen > resLen) {
593 *status = U_MEMORY_ALLOCATION_ERROR;
594 return 0;
595 }
596
597 for(i = 0; i < sourceLen; i++) {
598 c = source[i];
599 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
600 switch(c - 0x3000) {
601 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
602 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
603 c--;
604 break;
605 case 0xAB:
606 c = 0x30F5;
607 break;
608 case 0xB1:
609 c = 0x30F6;
610 break;
611 }
612 }
613 resBuf[i] = c;
614 }
615 return sourceLen;
616 }
617
618 U_NAMESPACE_BEGIN
619
620 static
621 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
622 uint32_t i = 0;
623 UChar n[128];
624 uint32_t nLen = 0;
625 uint32_t uCount = 0, lCount = 0;
626
627 collIterate s;
628 uint32_t order = 0;
629
630 if(U_FAILURE(*status)) {
631 return UCOL_LOWER_CASE;
632 }
633
634 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
635 if(U_SUCCESS(*status)) {
636 for(i = 0; i < nLen; i++) {
637 uprv_init_collIterate(UCA, &n[i], 1, &s, status);
638 order = ucol_getNextCE(UCA, &s, status);
639 if(isContinuation(order)) {
640 *status = U_INTERNAL_PROGRAM_ERROR;
641 return UCOL_LOWER_CASE;
642 }
643 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
644 uCount++;
645 } else {
646 if(u_islower(n[i])) {
647 lCount++;
648 } else if(U_SUCCESS(*status)) {
649 UChar sk[1], lk[1];
650 u_toSmallKana(&n[i], 1, sk, 1, status);
651 u_toLargeKana(&n[i], 1, lk, 1, status);
652 if(sk[0] == n[i] && lk[0] != n[i]) {
653 lCount++;
654 }
655 }
656 }
657 }
658 }
659
660 if(uCount != 0 && lCount != 0) {
661 return UCOL_MIXED_CASE;
662 } else if(uCount != 0) {
663 return UCOL_UPPER_CASE;
664 } else {
665 return UCOL_LOWER_CASE;
666 }
667 }
668
669
670 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
671 /* this one makes the table and stuff */
672 uint32_t noOfBytes[3];
673 uint32_t i;
674
675 for(i = 0; i<3; i++) {
676 ucol_countBytes(CEparts[i], noOfBytes[i]);
677 }
678
679 /* Here we have to pack CEs from parts */
680
681 uint32_t CEi = 0;
682 uint32_t value = 0;
683
684 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
685 if(CEi > 0) {
686 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
687 } else {
688 value = 0;
689 }
690
691 if(2*CEi<noOfBytes[0]) {
692 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
693 }
694 if(CEi<noOfBytes[1]) {
695 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
696 }
697 if(CEi<noOfBytes[2]) {
698 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
699 }
700 tok->CEs[CEi] = value;
701 CEi++;
702 }
703 if(CEi == 0) { /* totally ignorable */
704 tok->noOfCEs = 1;
705 tok->CEs[0] = 0;
706 } else { /* there is at least something */
707 tok->noOfCEs = CEi;
708 }
709
710
711 // we want to set case bits here and now, not later.
712 // Case bits handling
713 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
714 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
715 int32_t cSize = (tok->source & 0xFF000000) >> 24;
716 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
717
718 if(cSize > 1) {
719 // Do it manually
720 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
721 } else {
722 // Copy it from the UCA
723 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
724 tok->CEs[0] |= (caseCE & 0xC0);
725 }
726 }
727
728 #if UCOL_DEBUG==2
729 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
730 for(i = 0; i<tok->noOfCEs; i++) {
731 fprintf(stderr, "%08X ", tok->CEs[i]);
732 }
733 fprintf(stderr, "\n");
734 #endif
735 }
736
737 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
738 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
739 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
740
741 UColToken *tok = lh->last;
742 uint32_t t[UCOL_STRENGTH_LIMIT];
743
744 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
745
746 tok->toInsert = 1;
747 t[tok->strength] = 1;
748
749 while(tok->previous != NULL) {
750 if(tok->previous->strength < tok->strength) { /* going up */
751 t[tok->strength] = 0;
752 t[tok->previous->strength]++;
753 } else if(tok->previous->strength > tok->strength) { /* going down */
754 t[tok->previous->strength] = 1;
755 } else {
756 t[tok->strength]++;
757 }
758 tok=tok->previous;
759 tok->toInsert = t[tok->strength];
760 }
761
762 tok->toInsert = t[tok->strength];
763 ucol_inv_getGapPositions(src, lh, status);
764
765 #if UCOL_DEBUG
766 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
767 int32_t j = 2;
768 for(j = 2; j >= 0; j--) {
769 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
770 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
771 }
772 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
773
774 do {
775 fprintf(stderr,"%i", tok->strength);
776 tok = tok->next;
777 } while(tok != NULL);
778 fprintf(stderr, "\n");
779
780 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
781
782 do {
783 fprintf(stderr,"%i", tok->toInsert);
784 tok = tok->next;
785 } while(tok != NULL);
786 #endif
787
788 tok = lh->first;
789 uint32_t fStrength = UCOL_IDENTICAL;
790 uint32_t initStrength = UCOL_IDENTICAL;
791
792
793 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
794 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
795 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
796
797 while (tok != NULL && U_SUCCESS(*status)) {
798 fStrength = tok->strength;
799 if(fStrength < initStrength) {
800 initStrength = fStrength;
801 if(lh->pos[fStrength] == -1) {
802 while(lh->pos[fStrength] == -1 && fStrength > 0) {
803 fStrength--;
804 }
805 if(lh->pos[fStrength] == -1) {
806 *status = U_INTERNAL_PROGRAM_ERROR;
807 return;
808 }
809 }
810 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
811 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
812 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
813 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
814 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
815 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
816 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
817 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
818 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
819 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
820 } else { /* primaries */
821 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
822 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
823 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
824 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
825 }
826 } else {
827 if(tok->strength == UCOL_TERTIARY) {
828 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
829 } else if(tok->strength == UCOL_SECONDARY) {
830 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
831 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
832 } else if(tok->strength == UCOL_PRIMARY) {
833 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
834 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
835 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
836 }
837 }
838 ucol_doCE(src, CEparts, tok, status);
839 tok = tok->next;
840 }
841 }
842
843 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
844 UCAElements el;
845 UColToken *tok = lh->first;
846 UColToken *expt = NULL;
847 uint32_t i = 0, j = 0;
848 const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
849
850 while(tok != NULL && U_SUCCESS(*status)) {
851 /* first, check if there are any expansions */
852 /* if there are expansions, we need to do a little bit more processing */
853 /* since parts of expansion can be tailored, while others are not */
854 if(tok->expansion != 0) {
855 uint32_t len = tok->expansion >> 24;
856 uint32_t currentSequenceLen = len;
857 uint32_t expOffset = tok->expansion & 0x00FFFFFF;
858 //uint32_t exp = currentSequenceLen | expOffset;
859 UColToken exp;
860 exp.source = currentSequenceLen | expOffset;
861 exp.rulesToParseHdl = &(src->source);
862
863 while(len > 0) {
864 currentSequenceLen = len;
865 while(currentSequenceLen > 0) {
866 exp.source = (currentSequenceLen << 24) | expOffset;
867 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
868 uint32_t noOfCEsToCopy = expt->noOfCEs;
869 for(j = 0; j<noOfCEsToCopy; j++) {
870 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
871 }
872 tok->noOfExpCEs += noOfCEsToCopy;
873 // Smart people never try to add codepoints and CEs.
874 // For some odd reason, it won't work.
875 expOffset += currentSequenceLen; //noOfCEsToCopy;
876 len -= currentSequenceLen; //noOfCEsToCopy;
877 break;
878 } else {
879 currentSequenceLen--;
880 }
881 }
882 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
883 /* will have to get one from UCA */
884 /* first, get the UChars from the rules */
885 /* then pick CEs out until there is no more and stuff them into expansion */
886 collIterate s;
887 uint32_t order = 0;
888 uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
889
890 for(;;) {
891 order = ucol_getNextCE(src->UCA, &s, status);
892 if(order == UCOL_NO_MORE_CES) {
893 break;
894 }
895 tok->expCEs[tok->noOfExpCEs++] = order;
896 }
897 expOffset++;
898 len--;
899 }
900 }
901 } else {
902 tok->noOfExpCEs = 0;
903 }
904
905 /* set the ucaelement with obtained values */
906 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
907 /* copy CEs */
908 for(i = 0; i<tok->noOfCEs; i++) {
909 el.CEs[i] = tok->CEs[i];
910 }
911 for(i = 0; i<tok->noOfExpCEs; i++) {
912 el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
913 }
914
915 /* copy UChars */
916 // We kept prefix and source kind of together, as it is a kind of a contraction.
917 // However, now we have to slice the prefix off the main thing -
918 el.prefix = el.prefixChars;
919 el.cPoints = el.uchars;
920 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
921 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
922 // decomposed elements to the unsaf table.
923 el.prefixSize = tok->prefix>>24;
924 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
925
926 el.cSize = (tok->source >> 24)-(tok->prefix>>24);
927 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
928 } else {
929 el.prefixSize = 0;
930 *el.prefix = 0;
931
932 el.cSize = (tok->source >> 24);
933 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
934 }
935 if(src->UCA != NULL) {
936 for(i = 0; i<el.cSize; i++) {
937 if(UCOL_ISJAMO(el.cPoints[i])) {
938 t->image->jamoSpecial = TRUE;
939 }
940 }
941 if (!src->buildCCTabFlag && el.cSize > 0) {
942 // Check the trailing canonical combining class (tccc) of the last character.
943 const UChar *s = el.cPoints + el.cSize;
944 uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s);
945 if ((fcd & 0xff) != 0) {
946 src->buildCCTabFlag = TRUE;
947 }
948 }
949 }
950
951 /* and then, add it */
952 #if UCOL_DEBUG==2
953 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
954 #endif
955 uprv_uca_addAnElement(t, &el, status);
956
957 #if UCOL_DEBUG_DUPLICATES
958 if(*status != U_ZERO_ERROR) {
959 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
960 *status = U_ZERO_ERROR;
961 }
962 #endif
963
964 tok = tok->next;
965 }
966 }
967
968 U_CDECL_BEGIN
969 static UBool U_CALLCONV
970 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
971 UErrorCode status = U_ZERO_ERROR;
972 tempUCATable *t = (tempUCATable *)context;
973 if(value == 0) {
974 while(start < limit) {
975 uint32_t CE = utrie_get32(t->mapping, start, NULL);
976 if(CE == UCOL_NOT_FOUND) {
977 UCAElements el;
978 el.isThai = FALSE;
979 el.prefixSize = 0;
980 el.prefixChars[0] = 0;
981 el.prefix = el.prefixChars;
982 el.cPoints = el.uchars;
983
984 el.cSize = 0;
985 U16_APPEND_UNSAFE(el.uchars, el.cSize, start);
986
987 el.noOfCEs = 1;
988 el.CEs[0] = 0;
989 uprv_uca_addAnElement(t, &el, &status);
990
991 }
992 start++;
993 }
994 }
995 if(U_FAILURE(status)) {
996 return FALSE;
997 } else {
998 return TRUE;
999 }
1000 }
1001 U_CDECL_END
1002
1003 static void
1004 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
1005 UChar32 start, UChar32 end,
1006 UErrorCode *status)
1007 {
1008 //UChar decomp[256];
1009 uint32_t CE = UCOL_NOT_FOUND;
1010 UChar32 u = 0;
1011 UCAElements el;
1012 el.isThai = FALSE;
1013 el.prefixSize = 0;
1014 el.prefixChars[0] = 0;
1015 collIterate colIt;
1016
1017 if(U_SUCCESS(*status)) {
1018 for(u = start; u<=end; u++) {
1019 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1020 /* this test is for contractions that are missing the starting element. */
1021 || ((isCntTableElement(CE)) &&
1022 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
1023 )
1024 {
1025 el.cSize = 0;
1026 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1027 //decomp[0] = (UChar)u;
1028 //el.uchars[0] = (UChar)u;
1029 el.cPoints = el.uchars;
1030 //el.cSize = 1;
1031 el.noOfCEs = 0;
1032 el.prefix = el.prefixChars;
1033 el.prefixSize = 0;
1034 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1035 // We actually want to check whether this element is a special
1036 // If it is an implicit element (hangul, CJK - we want to copy the
1037 // special, not the resolved CEs) - for hangul, copying resolved
1038 // would just make things the same (there is an expansion and it
1039 // takes approximately the same amount of time to resolve as
1040 // falling back to the UCA).
1041 /*
1042 UTRIE_GET32(src->UCA->mapping, u, CE);
1043 tag = getCETag(CE);
1044 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1045 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1046 || tag == LEAD_SURROGATE_TAG) {
1047 el.CEs[el.noOfCEs++] = CE;
1048 } else {
1049 */
1050 // It turns out that it does not make sense to keep implicits
1051 // unresolved. The cost of resolving them is big enough so that
1052 // it doesn't make any difference whether we have to go to the UCA
1053 // or not.
1054 {
1055 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
1056 while(CE != UCOL_NO_MORE_CES) {
1057 CE = ucol_getNextCE(src->UCA, &colIt, status);
1058 if(CE != UCOL_NO_MORE_CES) {
1059 el.CEs[el.noOfCEs++] = CE;
1060 }
1061 }
1062 }
1063 uprv_uca_addAnElement(t, &el, status);
1064 }
1065 }
1066 }
1067 }
1068
1069 U_NAMESPACE_END
1070
1071 U_CFUNC UCATableHeader *
1072 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1073 U_NAMESPACE_USE
1074
1075 uint32_t i = 0;
1076 if(U_FAILURE(*status)) {
1077 return NULL;
1078 }
1079 /*
1080 2. Eliminate the negative lists by doing the following for each non-null negative list:
1081 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1082 create new ListHeader X
1083 o reverse the list, add to the end of X's positive list. Reset the strength of the
1084 first item you add, based on the stronger strength levels of the two lists.
1085 */
1086 /*
1087 3. For each ListHeader with a non-null positive list:
1088 */
1089 /*
1090 o Find all character strings with CEs between the baseCE and the
1091 next/previous CE, at the strength of the first token. Add these to the
1092 tailoring.
1093 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
1094 tailoring has & x < z...
1095 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
1096 */
1097 /* It is possible that this part should be done even while constructing list */
1098 /* The problem is that it is unknown what is going to be the strongest weight */
1099 /* So we might as well do it here */
1100
1101 /*
1102 o Allocate CEs for each token in the list, based on the total number N of the
1103 largest level difference, and the gap G between baseCE and nextCE at that
1104 level. The relation * between the last item and nextCE is the same as the
1105 strongest strength.
1106 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1107 ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1108 Then fit b and c into the secondary gap between a and d, then fit q
1109 into the tertiary gap between b and c.
1110
1111 o Example: baseCE << b <<< q << c * nextCE(X,2)
1112 ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1113 Then fit q into the tertiary gap between b and c.
1114 o When incrementing primary values, we will not cross high byte
1115 boundaries except where there is only a single-byte primary. That is to
1116 ensure that the script reordering will continue to work.
1117 */
1118 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1119 /* test for NULL */
1120 if (image == NULL) {
1121 *status = U_MEMORY_ALLOCATION_ERROR;
1122 return NULL;
1123 }
1124 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1125
1126 for(i = 0; i<src->resultLen; i++) {
1127 /* now we need to generate the CEs */
1128 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1129 /* According to strength */
1130 if(U_SUCCESS(*status)) {
1131 if(src->lh[i].first) { // if there are any elements
1132 // due to the way parser works, subsequent tailorings
1133 // may remove all the elements from a sequence, therefore
1134 // leaving an empty tailoring sequence.
1135 ucol_initBuffers(src, &src->lh[i], status);
1136 }
1137 }
1138 if(U_FAILURE(*status)) {
1139 uprv_free(image);
1140 return NULL;
1141 }
1142 }
1143
1144 if(src->varTop != NULL) { /* stuff the variable top value */
1145 src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1146 /* remove it from the list */
1147 if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1148 src->varTop->listHeader->first = src->varTop->next;
1149 }
1150 if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1151 src->varTop->listHeader->last = src->varTop->previous;
1152 }
1153 if(src->varTop->next != NULL) {
1154 src->varTop->next->previous = src->varTop->previous;
1155 }
1156 if(src->varTop->previous != NULL) {
1157 src->varTop->previous->next = src->varTop->next;
1158 }
1159 }
1160
1161
1162 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1163 if(U_FAILURE(*status)) {
1164 uprv_free(image);
1165 return NULL;
1166 }
1167
1168
1169 /* After this, we have assigned CE values to all regular CEs */
1170 /* now we will go through list once more and resolve expansions, */
1171 /* make UCAElements structs and add them to table */
1172 for(i = 0; i<src->resultLen; i++) {
1173 /* now we need to generate the CEs */
1174 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1175 /* According to strength */
1176 if(U_SUCCESS(*status)) {
1177 ucol_createElements(src, t, &src->lh[i], status);
1178 }
1179 }
1180
1181 UCAElements el;
1182 el.isThai = FALSE;
1183 el.prefixSize = 0;
1184 el.prefixChars[0] = 0;
1185
1186 /* add latin-1 stuff */
1187 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1188
1189 /* add stuff for copying */
1190 if(src->copySet != NULL) {
1191 int32_t i = 0;
1192 UnicodeSet *set = (UnicodeSet *)src->copySet;
1193 for(i = 0; i < set->getRangeCount(); i++) {
1194 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1195 }
1196 }
1197
1198 if(U_SUCCESS(*status)) {
1199 /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1200
1201 uint32_t tailoredCE = UCOL_NOT_FOUND;
1202 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1203 int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth;
1204 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1205 // Check for null pointer
1206 if (ucaEl == NULL) {
1207 *status = U_MEMORY_ALLOCATION_ERROR;
1208 return NULL;
1209 }
1210 while(*conts != 0) {
1211 // A continuation is NUL-terminated and NUL-padded
1212 // except if it has the maximum length.
1213 int32_t contractionLength = maxUCAContractionLength;
1214 while(contractionLength > 0 && conts[contractionLength - 1] == 0) {
1215 --contractionLength;
1216 }
1217 UChar32 first;
1218 int32_t firstLength = 0;
1219 U16_NEXT(conts, firstLength, contractionLength, first);
1220 tailoredCE = utrie_get32(t->mapping, first, NULL);
1221 if(tailoredCE != UCOL_NOT_FOUND) {
1222 UBool needToAdd = TRUE;
1223 if(isCntTableElement(tailoredCE)) {
1224 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) {
1225 needToAdd = FALSE;
1226 }
1227 }
1228 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1229 UCAElements elm;
1230 elm.cPoints = el.uchars;
1231 elm.noOfCEs = 0;
1232 elm.uchars[0] = *conts;
1233 elm.uchars[1] = 0;
1234 elm.cSize = 1;
1235 elm.prefixChars[0] = *(conts+2);
1236 elm.isThai = FALSE;
1237 elm.prefix = elm.prefixChars;
1238 elm.prefixSize = 1;
1239 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
1240 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1241 needToAdd = TRUE;
1242 }
1243 }
1244 if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
1245 needToAdd = FALSE;
1246 }
1247
1248 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1249 if (*(conts+1) != 0) { // contractions
1250 el.prefix = el.prefixChars;
1251 el.prefixSize = 0;
1252 el.cPoints = el.uchars;
1253 el.noOfCEs = 0;
1254 u_memcpy(el.uchars, conts, contractionLength);
1255 el.cSize = contractionLength;
1256 ucol_setText(ucaEl, el.uchars, el.cSize, status);
1257 }
1258 else { // pre-context character
1259 UChar str[4] = { 0 };
1260 int32_t len=0;
1261 int32_t preKeyLen=0;
1262
1263 el.cPoints = el.uchars;
1264 el.noOfCEs = 0;
1265 el.uchars[0] = *conts;
1266 el.uchars[1] = 0;
1267 el.cSize = 1;
1268 el.prefixChars[0] = *(conts+2);
1269 el.prefix = el.prefixChars;
1270 el.prefixSize = 1;
1271 if (el.prefixChars[0]!=0) {
1272 // get CE of prefix character first
1273 str[0]=el.prefixChars[0];
1274 str[1]=0;
1275 ucol_setText(ucaEl, str, 1, status);
1276 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
1277 != UCOL_NULLORDER) {
1278 preKeyLen++; // count number of keys for prefix character
1279 }
1280 str[len++] = el.prefixChars[0];
1281 }
1282
1283 str[len++] = el.uchars[0];
1284 str[len]=0;
1285 ucol_setText(ucaEl, str, len, status);
1286 // Skip the keys for prefix character, then copy the rest to el.
1287 while ((preKeyLen-->0) &&
1288 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1289 continue;
1290 }
1291
1292 }
1293 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1294 el.noOfCEs++;
1295 }
1296 uprv_uca_addAnElement(t, &el, status);
1297 }
1298
1299 } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
1300 ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status);
1301 }
1302 conts+=maxUCAContractionLength;
1303 }
1304 ucol_closeElements(ucaEl);
1305 }
1306
1307 // Add completely ignorable elements
1308 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1309
1310 // add tailoring characters related canonical closures
1311 uprv_uca_canonicalClosure(t, src, NULL, status);
1312
1313 /* still need to produce compatibility closure */
1314
1315 UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1316
1317 uprv_uca_closeTempTable(t);
1318 uprv_free(image);
1319
1320 return myData;
1321 }
1322
1323 U_CDECL_BEGIN
1324 static UBool U_CALLCONV
1325 ucol_bld_cleanup(void)
1326 {
1327 udata_close(invUCA_DATA_MEM);
1328 invUCA_DATA_MEM = NULL;
1329 _staticInvUCA = NULL;
1330 return TRUE;
1331 }
1332 U_CDECL_END
1333
1334 U_CAPI const InverseUCATableHeader * U_EXPORT2
1335 ucol_initInverseUCA(UErrorCode *status)
1336 {
1337 if(U_FAILURE(*status)) return NULL;
1338
1339 UBool needsInit;
1340 UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
1341
1342 if(needsInit) {
1343 InverseUCATableHeader *newInvUCA = NULL;
1344 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
1345
1346 if(U_FAILURE(*status)) {
1347 if (result) {
1348 udata_close(result);
1349 }
1350 // This is not needed, as we are talking about
1351 // memory we got from UData
1352 //uprv_free(newInvUCA);
1353 }
1354
1355 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1356 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1357 UCollator *UCA = ucol_initUCA(status);
1358 // UCA versions of UCA and inverse UCA should match
1359 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1360 *status = U_INVALID_FORMAT_ERROR;
1361 udata_close(result);
1362 return NULL;
1363 }
1364
1365 umtx_lock(NULL);
1366 if(_staticInvUCA == NULL) {
1367 invUCA_DATA_MEM = result;
1368 _staticInvUCA = newInvUCA;
1369 result = NULL;
1370 newInvUCA = NULL;
1371 }
1372 umtx_unlock(NULL);
1373
1374 if(newInvUCA != NULL) {
1375 udata_close(result);
1376 // This is not needed, as we are talking about
1377 // memory we got from UData
1378 //uprv_free(newInvUCA);
1379 }
1380 else {
1381 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1382 }
1383 }
1384 }
1385 return _staticInvUCA;
1386 }
1387
/*
 * Names of the non-script reordering codes, terminated by a NULL sentinel.
 *
 * The position of a name is significant: ucol_findReorderingEntry() returns
 * UCOL_REORDER_CODE_FIRST plus the index of the matching entry, so the order
 * here _must_ stay in sync with the UColReorderCode enum and with the order
 * in which the codes are applied as defaults.
 */
static const char* ReorderingTokenNames[] = {
    "SPACE",    /* index 0 */
    "PUNCT",    /* index 1 */
    "SYMBOL",   /* index 2 */
    "CURRENCY", /* index 3 */
    "DIGIT",    /* index 4 */
    NULL        /* end of list */
};
1399
1400 static void toUpper(const char* src, char* dst, uint32_t length) {
1401 for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
1402 *dst = uprv_toupper(*src);
1403 }
1404 *dst = '\0';
1405 }
1406
1407 U_INTERNAL int32_t U_EXPORT2
1408 ucol_findReorderingEntry(const char* name) {
1409 char buffer[32];
1410 toUpper(name, buffer, 32);
1411 for (uint32_t entry = 0; ReorderingTokenNames[entry] != NULL; entry++) {
1412 if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
1413 return entry + UCOL_REORDER_CODE_FIRST;
1414 }
1415 }
1416 return USCRIPT_INVALID_CODE;
1417 }
1418
1419 #endif /* #if !UCONFIG_NO_COLLATION */