]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/ucol_bld.cpp
ICU-461.17.tar.gz
[apple/icu.git] / icuSources / i18n / ucol_bld.cpp
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2001-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucol_bld.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created 02/22/2001
14 * created by: Vladimir Weinstein
15 *
16 * This module builds a collator based on the rule set.
17 *
18 */
19
20 #include "unicode/utypes.h"
21
22 #if !UCONFIG_NO_COLLATION
23
24 #include "unicode/ucoleitr.h"
25 #include "unicode/udata.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uniset.h"
28 #include "unicode/uscript.h"
29 #include "unicode/ustring.h"
30 #include "normalizer2impl.h"
31 #include "ucol_bld.h"
32 #include "ucol_elm.h"
33 #include "ucol_cnt.h"
34 #include "ucln_in.h"
35 #include "umutex.h"
36 #include "cmemory.h"
37 #include "cstring.h"
38
39 static const InverseUCATableHeader* _staticInvUCA = NULL;
40 static UDataMemory* invUCA_DATA_MEM = NULL;
41
42 U_CDECL_BEGIN
43 static UBool U_CALLCONV
44 isAcceptableInvUCA(void * /*context*/,
45 const char * /*type*/, const char * /*name*/,
46 const UDataInfo *pInfo)
47 {
48 /* context, type & name are intentionally not used */
49 if( pInfo->size>=20 &&
50 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
51 pInfo->charsetFamily==U_CHARSET_FAMILY &&
52 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */
53 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
54 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
55 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
56 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
57 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
58 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
59 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
60 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
61 )
62 {
63 UVersionInfo UCDVersion;
64 u_getUnicodeVersion(UCDVersion);
65 return (pInfo->dataVersion[0]==UCDVersion[0] &&
66 pInfo->dataVersion[1]==UCDVersion[1]);
67 //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
68 //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
69 //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
70 } else {
71 return FALSE;
72 }
73 }
74 U_CDECL_END
75
76 /*
77 * Takes two CEs (lead and continuation) and
78 * compares them as CEs should be compared:
79 * primary vs. primary, secondary vs. secondary
80 * tertiary vs. tertiary
81 */
82 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
83 uint32_t s1 = source0, s2, t1 = target0, t2;
84 if(isContinuation(source1)) {
85 s2 = source1;
86 } else {
87 s2 = 0;
88 }
89 if(isContinuation(target1)) {
90 t2 = target1;
91 } else {
92 t2 = 0;
93 }
94
95 uint32_t s = 0, t = 0;
96 if(s1 == t1 && s2 == t2) {
97 return 0;
98 }
99 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
100 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
101 if(s < t) {
102 return -1;
103 } else if(s > t) {
104 return 1;
105 } else {
106 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
107 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
108 if(s < t) {
109 return -1;
110 } else if(s > t) {
111 return 1;
112 } else {
113 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
114 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
115 if(s < t) {
116 return -1;
117 } else {
118 return 1;
119 }
120 }
121 }
122 }
123
124 static
125 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
126 uint32_t bottom = 0, top = src->invUCA->tableSize;
127 uint32_t i = 0;
128 uint32_t first = 0, second = 0;
129 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
130 int32_t res = 0;
131
132 while(bottom < top-1) {
133 i = (top+bottom)/2;
134 first = *(CETable+3*i);
135 second = *(CETable+3*i+1);
136 res = compareCEs(first, second, CE, SecondCE);
137 if(res > 0) {
138 top = i;
139 } else if(res < 0) {
140 bottom = i;
141 } else {
142 break;
143 }
144 }
145
146 /* weiv: */
147 /* in searching for elements, I have removed the failure */
148 /* The reason for this is that the builder does not rely */
149 /* on search mechanism telling it that it didn't find an */
150 /* element. However, indirect positioning relies on being */
151 /* able to find the elements around any CE, even if it is */
152 /* not defined in the UCA. */
153 return i;
154 /*
155 if((first == CE && second == SecondCE)) {
156 return i;
157 } else {
158 return -1;
159 }
160 */
161 }
162
163 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
164 0xFFFF0000,
165 0xFFFFFF00,
166 0xFFFFFFFF
167 };
168
169 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
170 uint32_t CE, uint32_t contCE,
171 uint32_t *nextCE, uint32_t *nextContCE,
172 uint32_t strength)
173 {
174 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
175 int32_t iCE;
176
177 iCE = ucol_inv_findCE(src, CE, contCE);
178
179 if(iCE<0) {
180 *nextCE = UCOL_NOT_FOUND;
181 return -1;
182 }
183
184 CE &= strengthMask[strength];
185 contCE &= strengthMask[strength];
186
187 *nextCE = CE;
188 *nextContCE = contCE;
189
190 while((*nextCE & strengthMask[strength]) == CE
191 && (*nextContCE & strengthMask[strength]) == contCE)
192 {
193 *nextCE = (*(CETable+3*(++iCE)));
194 *nextContCE = (*(CETable+3*(iCE)+1));
195 }
196
197 return iCE;
198 }
199
200 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
201 uint32_t CE, uint32_t contCE,
202 uint32_t *prevCE, uint32_t *prevContCE,
203 uint32_t strength)
204 {
205 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
206 int32_t iCE;
207
208 iCE = ucol_inv_findCE(src, CE, contCE);
209
210 if(iCE<0) {
211 *prevCE = UCOL_NOT_FOUND;
212 return -1;
213 }
214
215 CE &= strengthMask[strength];
216 contCE &= strengthMask[strength];
217
218 *prevCE = CE;
219 *prevContCE = contCE;
220
221 while((*prevCE & strengthMask[strength]) == CE
222 && (*prevContCE & strengthMask[strength])== contCE
223 && iCE > 0) /* this condition should prevent falling off the edge of the world */
224 {
225 /* here, we end up in a singularity - zero */
226 *prevCE = (*(CETable+3*(--iCE)));
227 *prevContCE = (*(CETable+3*(iCE)+1));
228 }
229
230 return iCE;
231 }
232
233 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
234 uint32_t prevCE, uint32_t prevContCE)
235 {
236 if(prevCE == CE && prevContCE == contCE) {
237 return UCOL_IDENTICAL;
238 }
239 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
240 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
241 {
242 return UCOL_PRIMARY;
243 }
244 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
245 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
246 {
247 return UCOL_SECONDARY;
248 }
249 return UCOL_TERTIARY;
250 }
251
252
253 /*static
254 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
255
256 uint32_t CE = lh->baseCE;
257 uint32_t SecondCE = lh->baseContCE;
258
259 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
260 uint32_t previousCE, previousContCE;
261 int32_t iCE;
262
263 iCE = ucol_inv_findCE(src, CE, SecondCE);
264
265 if(iCE<0) {
266 return -1;
267 }
268
269 CE &= strengthMask[strength];
270 SecondCE &= strengthMask[strength];
271
272 previousCE = CE;
273 previousContCE = SecondCE;
274
275 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) {
276 previousCE = (*(CETable+3*(--iCE)));
277 previousContCE = (*(CETable+3*(iCE)+1));
278 }
279 lh->previousCE = previousCE;
280 lh->previousContCE = previousContCE;
281
282 return iCE;
283 }*/
284
285 static
286 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
287 uint32_t CE = lh->baseCE;
288 uint32_t SecondCE = lh->baseContCE;
289
290 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
291 uint32_t nextCE, nextContCE;
292 int32_t iCE;
293
294 iCE = ucol_inv_findCE(src, CE, SecondCE);
295
296 if(iCE<0) {
297 return -1;
298 }
299
300 CE &= strengthMask[strength];
301 SecondCE &= strengthMask[strength];
302
303 nextCE = CE;
304 nextContCE = SecondCE;
305
306 while((nextCE & strengthMask[strength]) == CE
307 && (nextContCE & strengthMask[strength]) == SecondCE)
308 {
309 nextCE = (*(CETable+3*(++iCE)));
310 nextContCE = (*(CETable+3*(iCE)+1));
311 }
312
313 lh->nextCE = nextCE;
314 lh->nextContCE = nextContCE;
315
316 return iCE;
317 }
318
319 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
320 /* reset all the gaps */
321 int32_t i = 0;
322 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
323 uint32_t st = 0;
324 uint32_t t1, t2;
325 int32_t pos;
326
327 UColToken *tok = lh->first;
328 uint32_t tokStrength = tok->strength;
329
330 for(i = 0; i<3; i++) {
331 lh->gapsHi[3*i] = 0;
332 lh->gapsHi[3*i+1] = 0;
333 lh->gapsHi[3*i+2] = 0;
334 lh->gapsLo[3*i] = 0;
335 lh->gapsLo[3*i+1] = 0;
336 lh->gapsLo[3*i+2] = 0;
337 lh->numStr[i] = 0;
338 lh->fStrToken[i] = NULL;
339 lh->lStrToken[i] = NULL;
340 lh->pos[i] = -1;
341 }
342
343 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
344
345 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
346 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
347 lh->pos[0] = 0;
348 t1 = lh->baseCE;
349 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
350 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
351 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
352 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
353 uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
354 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
355
356 t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
357 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
358
359 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
360 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
361 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
362 } else if(lh->indirect == TRUE && lh->nextCE != 0) {
363 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
364 lh->pos[0] = 0;
365 t1 = lh->baseCE;
366 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
367 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
368 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
369 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
370 t1 = lh->nextCE;
371 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
372 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
373 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
374 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
375 } else {
376 for(;;) {
377 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
378 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
379 lh->fStrToken[tokStrength] = tok;
380 } else { /* The CE must be implicit, since it's not in the table */
381 /* Error */
382 *status = U_INTERNAL_PROGRAM_ERROR;
383 }
384 }
385
386 while(tok != NULL && tok->strength >= tokStrength) {
387 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
388 lh->lStrToken[tokStrength] = tok;
389 }
390 tok = tok->next;
391 }
392 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
393 /* check if previous interval is the same and merge the intervals if it is so */
394 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
395 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
396 lh->fStrToken[tokStrength+1] = NULL;
397 lh->lStrToken[tokStrength+1] = NULL;
398 lh->pos[tokStrength+1] = -1;
399 }
400 }
401 if(tok != NULL) {
402 tokStrength = tok->strength;
403 } else {
404 break;
405 }
406 }
407 for(st = 0; st < 3; st++) {
408 if((pos = lh->pos[st]) >= 0) {
409 t1 = *(CETable+3*(pos));
410 t2 = *(CETable+3*(pos)+1);
411 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
412 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
413 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
414 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
415 //pos--;
416 //t1 = *(CETable+3*(pos));
417 //t2 = *(CETable+3*(pos)+1);
418 t1 = lh->baseCE;
419 t2 = lh->baseContCE;
420 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
421 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
422 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
423 }
424 }
425 }
426 }
427
428
/*
 * Stores in noOfBytes how many bytes of the left-aligned 32-bit weight in
 * value are in use: the position, counted from the most significant byte,
 * of the last non-zero byte (0 for a zero value, 4 when the lowest byte is
 * set).  Both arguments are evaluated more than once.
 */
#define ucol_countBytes(value, noOfBytes) \
{ \
    uint32_t ucolByteMask_; \
    (noOfBytes) = 0; \
    for(ucolByteMask_ = 0xFFFFFFFF; ucolByteMask_ != 0; ucolByteMask_ >>= 8) { \
        if(((value) & ucolByteMask_) != 0) { \
            (noOfBytes)++; \
        } \
    } \
}
440
441 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
442 if(U_SUCCESS(*status)) {
443 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
444 }
445 return g->current;
446 }
447
448 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
449 /* TODO: rename to enum names */
450 uint32_t high, low, count=1;
451 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
452
453 if(strength == UCOL_SECONDARY) {
454 low = UCOL_COMMON_TOP2<<24;
455 high = 0xFFFFFFFF;
456 count = 0xFF - UCOL_COMMON_TOP2;
457 } else {
458 low = UCOL_BYTE_COMMON << 24; //0x05000000;
459 high = 0x40000000;
460 count = 0x40 - UCOL_BYTE_COMMON;
461 }
462
463 if(tok->next != NULL && tok->next->strength == strength) {
464 count = tok->next->toInsert;
465 }
466
467 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
468 g->current = UCOL_BYTE_COMMON<<24;
469
470 if(g->noOfRanges == 0) {
471 *status = U_INTERNAL_PROGRAM_ERROR;
472 }
473 return g->current;
474 }
475
476 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
477 uint32_t strength = tok->strength;
478 uint32_t low = lows[fStrength*3+strength];
479 uint32_t high = highs[fStrength*3+strength];
480 uint32_t maxByte = 0;
481 if(strength == UCOL_TERTIARY) {
482 maxByte = 0x3F;
483 } else if(strength == UCOL_PRIMARY) {
484 maxByte = 0xFE;
485 } else {
486 maxByte = 0xFF;
487 }
488
489 uint32_t count = tok->toInsert;
490
491 if(low >= high && strength > UCOL_PRIMARY) {
492 int32_t s = strength;
493 for(;;) {
494 s--;
495 if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
496 if(strength == UCOL_SECONDARY) {
497 if (low < UCOL_COMMON_TOP2<<24 ) {
498 // Override if low range is less than UCOL_COMMON_TOP2.
499 low = UCOL_COMMON_TOP2<<24;
500 }
501 high = 0xFFFFFFFF;
502 } else {
503 // Override if low range is less than UCOL_COMMON_BOT3.
504 if ( low < UCOL_COMMON_BOT3<<24 ) {
505 low = UCOL_COMMON_BOT3<<24;
506 }
507 high = 0x40000000;
508 }
509 break;
510 }
511 if(s<0) {
512 *status = U_INTERNAL_PROGRAM_ERROR;
513 return 0;
514 }
515 }
516 }
517
518 if(low < 0x02000000) {
519 // We must not use CE weight byte 02, so we set it as the minimum lower bound.
520 // See http://site.icu-project.org/design/collation/bytes
521 low = 0x02000000;
522 }
523
524 if(strength == UCOL_SECONDARY) { /* similar as simple */
525 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
526 low = UCOL_COMMON_TOP2<<24;
527 }
528 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
529 high = UCOL_COMMON_TOP2<<24;
530 }
531 if(low < (UCOL_COMMON_BOT2<<24)) {
532 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
533 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
534 //g->current = UCOL_COMMON_BOT2<<24;
535 return g->current;
536 }
537 }
538
539 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
540 if(g->noOfRanges == 0) {
541 *status = U_INTERNAL_PROGRAM_ERROR;
542 }
543 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
544 return g->current;
545 }
546
547 static
548 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
549 uint32_t i = 0;
550 UChar c;
551
552 if(U_FAILURE(*status)) {
553 return 0;
554 }
555
556 if(sourceLen > resLen) {
557 *status = U_MEMORY_ALLOCATION_ERROR;
558 return 0;
559 }
560
561 for(i = 0; i < sourceLen; i++) {
562 c = source[i];
563 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
564 switch(c - 0x3000) {
565 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
566 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
567 c++;
568 break;
569 case 0xF5:
570 c = 0x30AB;
571 break;
572 case 0xF6:
573 c = 0x30B1;
574 break;
575 }
576 }
577 resBuf[i] = c;
578 }
579 return sourceLen;
580 }
581
582 static
583 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
584 uint32_t i = 0;
585 UChar c;
586
587 if(U_FAILURE(*status)) {
588 return 0;
589 }
590
591 if(sourceLen > resLen) {
592 *status = U_MEMORY_ALLOCATION_ERROR;
593 return 0;
594 }
595
596 for(i = 0; i < sourceLen; i++) {
597 c = source[i];
598 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
599 switch(c - 0x3000) {
600 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
601 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
602 c--;
603 break;
604 case 0xAB:
605 c = 0x30F5;
606 break;
607 case 0xB1:
608 c = 0x30F6;
609 break;
610 }
611 }
612 resBuf[i] = c;
613 }
614 return sourceLen;
615 }
616
617 U_NAMESPACE_BEGIN
618
619 static
620 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
621 uint32_t i = 0;
622 UChar n[128];
623 uint32_t nLen = 0;
624 uint32_t uCount = 0, lCount = 0;
625
626 collIterate s;
627 uint32_t order = 0;
628
629 if(U_FAILURE(*status)) {
630 return UCOL_LOWER_CASE;
631 }
632
633 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
634 if(U_SUCCESS(*status)) {
635 for(i = 0; i < nLen; i++) {
636 uprv_init_collIterate(UCA, &n[i], 1, &s, status);
637 order = ucol_getNextCE(UCA, &s, status);
638 if(isContinuation(order)) {
639 *status = U_INTERNAL_PROGRAM_ERROR;
640 return UCOL_LOWER_CASE;
641 }
642 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
643 uCount++;
644 } else {
645 if(u_islower(n[i])) {
646 lCount++;
647 } else if(U_SUCCESS(*status)) {
648 UChar sk[1], lk[1];
649 u_toSmallKana(&n[i], 1, sk, 1, status);
650 u_toLargeKana(&n[i], 1, lk, 1, status);
651 if(sk[0] == n[i] && lk[0] != n[i]) {
652 lCount++;
653 }
654 }
655 }
656 }
657 }
658
659 if(uCount != 0 && lCount != 0) {
660 return UCOL_MIXED_CASE;
661 } else if(uCount != 0) {
662 return UCOL_UPPER_CASE;
663 } else {
664 return UCOL_LOWER_CASE;
665 }
666 }
667
668
669 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
670 /* this one makes the table and stuff */
671 uint32_t noOfBytes[3];
672 uint32_t i;
673
674 for(i = 0; i<3; i++) {
675 ucol_countBytes(CEparts[i], noOfBytes[i]);
676 }
677
678 /* Here we have to pack CEs from parts */
679
680 uint32_t CEi = 0;
681 uint32_t value = 0;
682
683 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
684 if(CEi > 0) {
685 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
686 } else {
687 value = 0;
688 }
689
690 if(2*CEi<noOfBytes[0]) {
691 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
692 }
693 if(CEi<noOfBytes[1]) {
694 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
695 }
696 if(CEi<noOfBytes[2]) {
697 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
698 }
699 tok->CEs[CEi] = value;
700 CEi++;
701 }
702 if(CEi == 0) { /* totally ignorable */
703 tok->noOfCEs = 1;
704 tok->CEs[0] = 0;
705 } else { /* there is at least something */
706 tok->noOfCEs = CEi;
707 }
708
709
710 // we want to set case bits here and now, not later.
711 // Case bits handling
712 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
713 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
714 int32_t cSize = (tok->source & 0xFF000000) >> 24;
715 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
716
717 if(cSize > 1) {
718 // Do it manually
719 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
720 } else {
721 // Copy it from the UCA
722 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
723 tok->CEs[0] |= (caseCE & 0xC0);
724 }
725 }
726
727 #if UCOL_DEBUG==2
728 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
729 for(i = 0; i<tok->noOfCEs; i++) {
730 fprintf(stderr, "%08X ", tok->CEs[i]);
731 }
732 fprintf(stderr, "\n");
733 #endif
734 }
735
736 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
737 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
738 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
739
740 UColToken *tok = lh->last;
741 uint32_t t[UCOL_STRENGTH_LIMIT];
742
743 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
744
745 tok->toInsert = 1;
746 t[tok->strength] = 1;
747
748 while(tok->previous != NULL) {
749 if(tok->previous->strength < tok->strength) { /* going up */
750 t[tok->strength] = 0;
751 t[tok->previous->strength]++;
752 } else if(tok->previous->strength > tok->strength) { /* going down */
753 t[tok->previous->strength] = 1;
754 } else {
755 t[tok->strength]++;
756 }
757 tok=tok->previous;
758 tok->toInsert = t[tok->strength];
759 }
760
761 tok->toInsert = t[tok->strength];
762 ucol_inv_getGapPositions(src, lh, status);
763
764 #if UCOL_DEBUG
765 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
766 int32_t j = 2;
767 for(j = 2; j >= 0; j--) {
768 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
769 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
770 }
771 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
772
773 do {
774 fprintf(stderr,"%i", tok->strength);
775 tok = tok->next;
776 } while(tok != NULL);
777 fprintf(stderr, "\n");
778
779 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
780
781 do {
782 fprintf(stderr,"%i", tok->toInsert);
783 tok = tok->next;
784 } while(tok != NULL);
785 #endif
786
787 tok = lh->first;
788 uint32_t fStrength = UCOL_IDENTICAL;
789 uint32_t initStrength = UCOL_IDENTICAL;
790
791
792 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
793 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
794 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
795
796 while (tok != NULL && U_SUCCESS(*status)) {
797 fStrength = tok->strength;
798 if(fStrength < initStrength) {
799 initStrength = fStrength;
800 if(lh->pos[fStrength] == -1) {
801 while(lh->pos[fStrength] == -1 && fStrength > 0) {
802 fStrength--;
803 }
804 if(lh->pos[fStrength] == -1) {
805 *status = U_INTERNAL_PROGRAM_ERROR;
806 return;
807 }
808 }
809 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
810 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
811 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
812 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
813 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
814 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
815 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
816 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
817 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
818 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
819 } else { /* primaries */
820 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
821 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
822 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
823 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
824 }
825 } else {
826 if(tok->strength == UCOL_TERTIARY) {
827 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
828 } else if(tok->strength == UCOL_SECONDARY) {
829 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
830 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
831 } else if(tok->strength == UCOL_PRIMARY) {
832 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
833 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
834 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
835 }
836 }
837 ucol_doCE(src, CEparts, tok, status);
838 tok = tok->next;
839 }
840 }
841
842 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
843 UCAElements el;
844 UColToken *tok = lh->first;
845 UColToken *expt = NULL;
846 uint32_t i = 0, j = 0;
847 UChar32 fcdHighStart;
848 const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
849
850 while(tok != NULL && U_SUCCESS(*status)) {
851 /* first, check if there are any expansions */
852 /* if there are expansions, we need to do a little bit more processing */
853 /* since parts of expansion can be tailored, while others are not */
854 if(tok->expansion != 0) {
855 uint32_t len = tok->expansion >> 24;
856 uint32_t currentSequenceLen = len;
857 uint32_t expOffset = tok->expansion & 0x00FFFFFF;
858 //uint32_t exp = currentSequenceLen | expOffset;
859 UColToken exp;
860 exp.source = currentSequenceLen | expOffset;
861 exp.rulesToParseHdl = &(src->source);
862
863 while(len > 0) {
864 currentSequenceLen = len;
865 while(currentSequenceLen > 0) {
866 exp.source = (currentSequenceLen << 24) | expOffset;
867 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
868 uint32_t noOfCEsToCopy = expt->noOfCEs;
869 for(j = 0; j<noOfCEsToCopy; j++) {
870 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
871 }
872 tok->noOfExpCEs += noOfCEsToCopy;
873 // Smart people never try to add codepoints and CEs.
874 // For some odd reason, it won't work.
875 expOffset += currentSequenceLen; //noOfCEsToCopy;
876 len -= currentSequenceLen; //noOfCEsToCopy;
877 break;
878 } else {
879 currentSequenceLen--;
880 }
881 }
882 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
883 /* will have to get one from UCA */
884 /* first, get the UChars from the rules */
885 /* then pick CEs out until there is no more and stuff them into expansion */
886 collIterate s;
887 uint32_t order = 0;
888 uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
889
890 for(;;) {
891 order = ucol_getNextCE(src->UCA, &s, status);
892 if(order == UCOL_NO_MORE_CES) {
893 break;
894 }
895 tok->expCEs[tok->noOfExpCEs++] = order;
896 }
897 expOffset++;
898 len--;
899 }
900 }
901 } else {
902 tok->noOfExpCEs = 0;
903 }
904
905 /* set the ucaelement with obtained values */
906 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
907 /* copy CEs */
908 for(i = 0; i<tok->noOfCEs; i++) {
909 el.CEs[i] = tok->CEs[i];
910 }
911 for(i = 0; i<tok->noOfExpCEs; i++) {
912 el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
913 }
914
915 /* copy UChars */
916 // We kept prefix and source kind of together, as it is a kind of a contraction.
917 // However, now we have to slice the prefix off the main thing -
918 el.prefix = el.prefixChars;
919 el.cPoints = el.uchars;
920 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
921 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
922 // decomposed elements to the unsaf table.
923 el.prefixSize = tok->prefix>>24;
924 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
925
926 el.cSize = (tok->source >> 24)-(tok->prefix>>24);
927 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
928 } else {
929 el.prefixSize = 0;
930 *el.prefix = 0;
931
932 el.cSize = (tok->source >> 24);
933 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
934 }
935 if(src->UCA != NULL) {
936 for(i = 0; i<el.cSize; i++) {
937 if(UCOL_ISJAMO(el.cPoints[i])) {
938 t->image->jamoSpecial = TRUE;
939 }
940 }
941 if (!src->buildCCTabFlag && el.cSize > 0) {
942 // Check the trailing canonical combining class (tccc) of the last character.
943 const UChar *s = el.cPoints + el.cSize;
944 uint16_t fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, el.cPoints, s);
945 if ((fcd & 0xff) != 0) {
946 src->buildCCTabFlag = TRUE;
947 }
948 }
949 }
950
951 /* and then, add it */
952 #if UCOL_DEBUG==2
953 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
954 #endif
955 uprv_uca_addAnElement(t, &el, status);
956
957 #if UCOL_DEBUG_DUPLICATES
958 if(*status != U_ZERO_ERROR) {
959 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
960 *status = U_ZERO_ERROR;
961 }
962 #endif
963
964 tok = tok->next;
965 }
966 }
967
968 U_CDECL_BEGIN
969 static UBool U_CALLCONV
970 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
971 UErrorCode status = U_ZERO_ERROR;
972 tempUCATable *t = (tempUCATable *)context;
973 if(value == 0) {
974 while(start < limit) {
975 uint32_t CE = utrie_get32(t->mapping, start, NULL);
976 if(CE == UCOL_NOT_FOUND) {
977 UCAElements el;
978 el.isThai = FALSE;
979 el.prefixSize = 0;
980 el.prefixChars[0] = 0;
981 el.prefix = el.prefixChars;
982 el.cPoints = el.uchars;
983
984 el.cSize = 0;
985 UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start);
986
987 el.noOfCEs = 1;
988 el.CEs[0] = 0;
989 uprv_uca_addAnElement(t, &el, &status);
990
991 }
992 start++;
993 }
994 }
995 if(U_FAILURE(status)) {
996 return FALSE;
997 } else {
998 return TRUE;
999 }
1000 }
1001 U_CDECL_END
1002
1003 static void
1004 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
1005 UChar32 start, UChar32 end,
1006 UErrorCode *status)
1007 {
1008 //UChar decomp[256];
1009 uint32_t CE = UCOL_NOT_FOUND;
1010 UChar32 u = 0;
1011 UCAElements el;
1012 el.isThai = FALSE;
1013 el.prefixSize = 0;
1014 el.prefixChars[0] = 0;
1015 collIterate colIt;
1016
1017 if(U_SUCCESS(*status)) {
1018 for(u = start; u<=end; u++) {
1019 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1020 /* this test is for contractions that are missing the starting element. */
1021 || ((isCntTableElement(CE)) &&
1022 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
1023 )
1024 {
1025 el.cSize = 0;
1026 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1027 //decomp[0] = (UChar)u;
1028 //el.uchars[0] = (UChar)u;
1029 el.cPoints = el.uchars;
1030 //el.cSize = 1;
1031 el.noOfCEs = 0;
1032 el.prefix = el.prefixChars;
1033 el.prefixSize = 0;
1034 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1035 // We actually want to check whether this element is a special
1036 // If it is an implicit element (hangul, CJK - we want to copy the
1037 // special, not the resolved CEs) - for hangul, copying resolved
1038 // would just make things the same (there is an expansion and it
1039 // takes approximately the same amount of time to resolve as
1040 // falling back to the UCA).
1041 /*
1042 UTRIE_GET32(src->UCA->mapping, u, CE);
1043 tag = getCETag(CE);
1044 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1045 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1046 || tag == LEAD_SURROGATE_TAG) {
1047 el.CEs[el.noOfCEs++] = CE;
1048 } else {
1049 */
1050 // It turns out that it does not make sense to keep implicits
1051 // unresolved. The cost of resolving them is big enough so that
1052 // it doesn't make any difference whether we have to go to the UCA
1053 // or not.
1054 {
1055 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
1056 while(CE != UCOL_NO_MORE_CES) {
1057 CE = ucol_getNextCE(src->UCA, &colIt, status);
1058 if(CE != UCOL_NO_MORE_CES) {
1059 el.CEs[el.noOfCEs++] = CE;
1060 }
1061 }
1062 }
1063 uprv_uca_addAnElement(t, &el, status);
1064 }
1065 }
1066 }
1067 }
1068
1069 U_NAMESPACE_END
1070
1071 U_CFUNC UCATableHeader *
1072 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1073 U_NAMESPACE_USE
1074
1075 uint32_t i = 0;
1076 if(U_FAILURE(*status)) {
1077 return NULL;
1078 }
1079 /*
1080 2. Eliminate the negative lists by doing the following for each non-null negative list:
1081 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1082 create new ListHeader X
1083 o reverse the list, add to the end of X's positive list. Reset the strength of the
1084 first item you add, based on the stronger strength levels of the two lists.
1085 */
1086 /*
1087 3. For each ListHeader with a non-null positive list:
1088 */
1089 /*
1090 o Find all character strings with CEs between the baseCE and the
1091 next/previous CE, at the strength of the first token. Add these to the
1092 tailoring.
1093 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
1094 tailoring has & x < z...
1095 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
1096 */
1097 /* It is possible that this part should be done even while constructing list */
1098 /* The problem is that it is unknown what is going to be the strongest weight */
1099 /* So we might as well do it here */
1100
1101 /*
1102 o Allocate CEs for each token in the list, based on the total number N of the
1103 largest level difference, and the gap G between baseCE and nextCE at that
1104 level. The relation * between the last item and nextCE is the same as the
1105 strongest strength.
1106 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1107 ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1108 Then fit b and c into the secondary gap between a and d, then fit q
1109 into the tertiary gap between b and c.
1110
1111 o Example: baseCE << b <<< q << c * nextCE(X,2)
1112 ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1113 Then fit q into the tertiary gap between b and c.
1114 o When incrementing primary values, we will not cross high byte
1115 boundaries except where there is only a single-byte primary. That is to
1116 ensure that the script reordering will continue to work.
1117 */
1118 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1119 /* test for NULL */
1120 if (image == NULL) {
1121 *status = U_MEMORY_ALLOCATION_ERROR;
1122 return NULL;
1123 }
1124 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1125
1126 for(i = 0; i<src->resultLen; i++) {
1127 /* now we need to generate the CEs */
1128 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1129 /* According to strength */
1130 if(U_SUCCESS(*status)) {
1131 if(src->lh[i].first) { // if there are any elements
1132 // due to the way parser works, subsequent tailorings
1133 // may remove all the elements from a sequence, therefore
1134 // leaving an empty tailoring sequence.
1135 ucol_initBuffers(src, &src->lh[i], status);
1136 }
1137 }
1138 if(U_FAILURE(*status)) {
1139 uprv_free(image);
1140 return NULL;
1141 }
1142 }
1143
1144 if(src->varTop != NULL) { /* stuff the variable top value */
1145 src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1146 /* remove it from the list */
1147 if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1148 src->varTop->listHeader->first = src->varTop->next;
1149 }
1150 if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1151 src->varTop->listHeader->last = src->varTop->previous;
1152 }
1153 if(src->varTop->next != NULL) {
1154 src->varTop->next->previous = src->varTop->previous;
1155 }
1156 if(src->varTop->previous != NULL) {
1157 src->varTop->previous->next = src->varTop->next;
1158 }
1159 }
1160
1161
1162 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1163 if(U_FAILURE(*status)) {
1164 uprv_free(image);
1165 return NULL;
1166 }
1167
1168
1169 /* After this, we have assigned CE values to all regular CEs */
1170 /* now we will go through list once more and resolve expansions, */
1171 /* make UCAElements structs and add them to table */
1172 for(i = 0; i<src->resultLen; i++) {
1173 /* now we need to generate the CEs */
1174 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1175 /* According to strength */
1176 if(U_SUCCESS(*status)) {
1177 ucol_createElements(src, t, &src->lh[i], status);
1178 }
1179 }
1180
1181 UCAElements el;
1182 el.isThai = FALSE;
1183 el.prefixSize = 0;
1184 el.prefixChars[0] = 0;
1185
1186 /* add latin-1 stuff */
1187 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1188
1189 /* add stuff for copying */
1190 if(src->copySet != NULL) {
1191 int32_t i = 0;
1192 UnicodeSet *set = (UnicodeSet *)src->copySet;
1193 for(i = 0; i < set->getRangeCount(); i++) {
1194 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1195 }
1196 }
1197
1198 if(U_SUCCESS(*status)) {
1199 /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1200
1201 uint32_t tailoredCE = UCOL_NOT_FOUND;
1202 //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
1203 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1204 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1205 // Check for null pointer
1206 if (ucaEl == NULL) {
1207 *status = U_MEMORY_ALLOCATION_ERROR;
1208 return NULL;
1209 }
1210 while(*conts != 0) {
1211 /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
1212 tailoredCE = utrie_get32(t->mapping, *conts, NULL);
1213 if(tailoredCE != UCOL_NOT_FOUND) {
1214 UBool needToAdd = TRUE;
1215 if(isCntTableElement(tailoredCE)) {
1216 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) {
1217 needToAdd = FALSE;
1218 }
1219 }
1220 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1221 UCAElements elm;
1222 elm.cPoints = el.uchars;
1223 elm.noOfCEs = 0;
1224 elm.uchars[0] = *conts;
1225 elm.uchars[1] = 0;
1226 elm.cSize = 1;
1227 elm.prefixChars[0] = *(conts+2);
1228 elm.isThai = FALSE;
1229 elm.prefix = elm.prefixChars;
1230 elm.prefixSize = 1;
1231 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
1232 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1233 needToAdd = TRUE;
1234 }
1235 }
1236 if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1237 needToAdd = FALSE;
1238 }
1239
1240 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1241 if (*(conts+1) != 0) { // contractions
1242 el.prefix = el.prefixChars;
1243 el.prefixSize = 0;
1244 el.cPoints = el.uchars;
1245 el.noOfCEs = 0;
1246 el.uchars[0] = *conts;
1247 el.uchars[1] = *(conts+1);
1248 if(*(conts+2)!=0) {
1249 el.uchars[2] = *(conts+2);
1250 el.cSize = 3;
1251 } else {
1252 el.cSize = 2;
1253 }
1254 ucol_setText(ucaEl, el.uchars, el.cSize, status);
1255 }
1256 else { // pre-context character
1257 UChar str[4] = { 0 };
1258 int32_t len=0;
1259 int32_t preKeyLen=0;
1260
1261 el.cPoints = el.uchars;
1262 el.noOfCEs = 0;
1263 el.uchars[0] = *conts;
1264 el.uchars[1] = 0;
1265 el.cSize = 1;
1266 el.prefixChars[0] = *(conts+2);
1267 el.prefix = el.prefixChars;
1268 el.prefixSize = 1;
1269 if (el.prefixChars[0]!=0) {
1270 // get CE of prefix character first
1271 str[0]=el.prefixChars[0];
1272 str[1]=0;
1273 ucol_setText(ucaEl, str, 1, status);
1274 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
1275 != UCOL_NULLORDER) {
1276 preKeyLen++; // count number of keys for prefix character
1277 }
1278 str[len++] = el.prefixChars[0];
1279 }
1280
1281 str[len++] = el.uchars[0];
1282 str[len]=0;
1283 ucol_setText(ucaEl, str, len, status);
1284 // Skip the keys for prefix character, then copy the rest to el.
1285 while ((preKeyLen-->0) &&
1286 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1287 continue;
1288 }
1289
1290 }
1291 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1292 el.noOfCEs++;
1293 }
1294 uprv_uca_addAnElement(t, &el, status);
1295 }
1296
1297 } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1298 ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status);
1299 }
1300 conts+=3;
1301 }
1302 ucol_closeElements(ucaEl);
1303 }
1304
1305 // Add completely ignorable elements
1306 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1307
1308 // add tailoring characters related canonical closures
1309 uprv_uca_canonicalClosure(t, src, NULL, status);
1310
1311 /* still need to produce compatibility closure */
1312
1313 UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1314
1315 uprv_uca_closeTempTable(t);
1316 uprv_free(image);
1317
1318 return myData;
1319 }
1320
1321 U_CDECL_BEGIN
1322 static UBool U_CALLCONV
1323 ucol_bld_cleanup(void)
1324 {
1325 udata_close(invUCA_DATA_MEM);
1326 invUCA_DATA_MEM = NULL;
1327 _staticInvUCA = NULL;
1328 return TRUE;
1329 }
1330 U_CDECL_END
1331
1332 U_CAPI const InverseUCATableHeader * U_EXPORT2
1333 ucol_initInverseUCA(UErrorCode *status)
1334 {
1335 if(U_FAILURE(*status)) return NULL;
1336
1337 UBool needsInit;
1338 UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
1339
1340 if(needsInit) {
1341 InverseUCATableHeader *newInvUCA = NULL;
1342 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
1343
1344 if(U_FAILURE(*status)) {
1345 if (result) {
1346 udata_close(result);
1347 }
1348 // This is not needed, as we are talking about
1349 // memory we got from UData
1350 //uprv_free(newInvUCA);
1351 }
1352
1353 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1354 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1355 UCollator *UCA = ucol_initUCA(status);
1356 // UCA versions of UCA and inverse UCA should match
1357 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1358 *status = U_INVALID_FORMAT_ERROR;
1359 udata_close(result);
1360 return NULL;
1361 }
1362
1363 umtx_lock(NULL);
1364 if(_staticInvUCA == NULL) {
1365 invUCA_DATA_MEM = result;
1366 _staticInvUCA = newInvUCA;
1367 result = NULL;
1368 newInvUCA = NULL;
1369 }
1370 umtx_unlock(NULL);
1371
1372 if(newInvUCA != NULL) {
1373 udata_close(result);
1374 // This is not needed, as we are talking about
1375 // memory we got from UData
1376 //uprv_free(newInvUCA);
1377 }
1378 else {
1379 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1380 }
1381 }
1382 }
1383 return _staticInvUCA;
1384 }
1385
/* Names of the non-script reordering codes. ucol_findReorderingEntry() maps
 * the entry at index k to UCOL_REORDER_CODE_FIRST + k, so the order here
 * _must_ be kept in sync with the UColReorderCode enum. The list is
 * NULL-terminated. Both the strings and the pointer table are read-only.
 */
static const char* const ReorderingTokenNames[] = {
    "SPACE",
    "PUNCT",
    "SYMBOL",
    "CURRENCY",
    "DIGIT",
    NULL
};
1397
/*
 * Copies src into dst, upper-casing each character with toupper(), and
 * always NUL-terminates dst when length > 0.
 *
 * src    NUL-terminated input string.
 * dst    output buffer of at least `length` chars.
 * length capacity of dst including the terminator; at most length - 1
 *        characters are copied. When length is 0 nothing is written.
 */
static void toUpper(const char* src, char* dst, uint32_t length) {
    if (length == 0) {
        /* No room even for the terminator; `length - 1` would wrap around
           and turn the loop below into an unbounded copy. */
        return;
    }
    for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
        /* toupper() requires a value representable as unsigned char. */
        *dst = (char)toupper((unsigned char)*src);
    }
    *dst = '\0';
}
1404
1405 U_INTERNAL int32_t U_EXPORT2
1406 ucol_findReorderingEntry(const char* name) {
1407 char buffer[32];
1408 toUpper(name, buffer, 32);
1409 for (uint32_t entry = 0; ReorderingTokenNames[entry] != NULL; entry++) {
1410 if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
1411 return entry + UCOL_REORDER_CODE_FIRST;
1412 }
1413 }
1414 return USCRIPT_INVALID_CODE;
1415 }
1416
1417 #endif /* #if !UCONFIG_NO_COLLATION */