]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/ucol_bld.cpp
ICU-511.32.tar.gz
[apple/icu.git] / icuSources / i18n / ucol_bld.cpp
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
3*
51004dcb 4* Copyright (C) 2001-2012, International Business Machines
b75a7d8f
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: ucol_bld.cpp
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created 02/22/2001
14* created by: Vladimir Weinstein
15*
16* This module builds a collator based on the rule set.
46f4442e 17*
b75a7d8f
A
18*/
19
20#include "unicode/utypes.h"
21
22#if !UCONFIG_NO_COLLATION
23
24#include "unicode/ucoleitr.h"
46f4442e 25#include "unicode/udata.h"
b75a7d8f 26#include "unicode/uchar.h"
b75a7d8f 27#include "unicode/uniset.h"
729e4ab9
A
28#include "unicode/uscript.h"
29#include "unicode/ustring.h"
4388f060 30#include "unicode/utf16.h"
729e4ab9 31#include "normalizer2impl.h"
46f4442e
A
32#include "ucol_bld.h"
33#include "ucol_elm.h"
34#include "ucol_cnt.h"
35#include "ucln_in.h"
36#include "umutex.h"
46f4442e 37#include "cmemory.h"
729e4ab9 38#include "cstring.h"
b75a7d8f 39
51004dcb
A
40#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
41
374ca955 42static const InverseUCATableHeader* _staticInvUCA = NULL;
b75a7d8f
A
43static UDataMemory* invUCA_DATA_MEM = NULL;
44
45U_CDECL_BEGIN
46static UBool U_CALLCONV
46f4442e
A
47isAcceptableInvUCA(void * /*context*/,
48 const char * /*type*/, const char * /*name*/,
49 const UDataInfo *pInfo)
50{
51 /* context, type & name are intentionally not used */
b75a7d8f
A
52 if( pInfo->size>=20 &&
53 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
54 pInfo->charsetFamily==U_CHARSET_FAMILY &&
374ca955
A
55 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */
56 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
57 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
58 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
59 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
60 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
61 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
62 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
63 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
46f4442e
A
64 )
65 {
b75a7d8f
A
66 UVersionInfo UCDVersion;
67 u_getUnicodeVersion(UCDVersion);
46f4442e
A
68 return (pInfo->dataVersion[0]==UCDVersion[0] &&
69 pInfo->dataVersion[1]==UCDVersion[1]);
70 //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
71 //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
72 //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
b75a7d8f
A
73 } else {
74 return FALSE;
75 }
76}
77U_CDECL_END
78
46f4442e
A
79/*
80* Takes two CEs (lead and continuation) and
81* compares them as CEs should be compared:
82* primary vs. primary, secondary vs. secondary
83* tertiary vs. tertiary
84*/
73c04bcf 85static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
46f4442e
A
86 uint32_t s1 = source0, s2, t1 = target0, t2;
87 if(isContinuation(source1)) {
88 s2 = source1;
89 } else {
90 s2 = 0;
91 }
92 if(isContinuation(target1)) {
93 t2 = target1;
73c04bcf 94 } else {
46f4442e
A
95 t2 = 0;
96 }
97
98 uint32_t s = 0, t = 0;
99 if(s1 == t1 && s2 == t2) {
100 return 0;
101 }
102 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
103 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
104 if(s < t) {
73c04bcf 105 return -1;
46f4442e 106 } else if(s > t) {
73c04bcf 107 return 1;
46f4442e
A
108 } else {
109 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
110 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
111 if(s < t) {
112 return -1;
113 } else if(s > t) {
114 return 1;
115 } else {
116 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
117 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
118 if(s < t) {
119 return -1;
120 } else {
121 return 1;
122 }
123 }
73c04bcf 124 }
73c04bcf
A
125}
126
b75a7d8f 127static
374ca955 128int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
46f4442e
A
129 uint32_t bottom = 0, top = src->invUCA->tableSize;
130 uint32_t i = 0;
131 uint32_t first = 0, second = 0;
132 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
133 int32_t res = 0;
134
135 while(bottom < top-1) {
136 i = (top+bottom)/2;
137 first = *(CETable+3*i);
138 second = *(CETable+3*i+1);
139 res = compareCEs(first, second, CE, SecondCE);
140 if(res > 0) {
141 top = i;
142 } else if(res < 0) {
143 bottom = i;
144 } else {
145 break;
146 }
b75a7d8f 147 }
46f4442e
A
148
149 /* weiv: */
150 /* in searching for elements, I have removed the failure */
151 /* The reason for this is that the builder does not rely */
152 /* on search mechanism telling it that it didn't find an */
153 /* element. However, indirect positioning relies on being */
154 /* able to find the elements around any CE, even if it is */
155 /* not defined in the UCA. */
156 return i;
157 /*
158 if((first == CE && second == SecondCE)) {
b75a7d8f 159 return i;
46f4442e 160 } else {
b75a7d8f 161 return -1;
46f4442e
A
162 }
163 */
b75a7d8f
A
164}
165
166static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
46f4442e
A
167 0xFFFF0000,
168 0xFFFFFF00,
169 0xFFFFFFFF
b75a7d8f
A
170};
171
374ca955 172U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
46f4442e
A
173 uint32_t CE, uint32_t contCE,
174 uint32_t *nextCE, uint32_t *nextContCE,
175 uint32_t strength)
176{
177 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
178 int32_t iCE;
b75a7d8f 179
46f4442e 180 iCE = ucol_inv_findCE(src, CE, contCE);
b75a7d8f 181
46f4442e
A
182 if(iCE<0) {
183 *nextCE = UCOL_NOT_FOUND;
184 return -1;
185 }
b75a7d8f 186
46f4442e
A
187 CE &= strengthMask[strength];
188 contCE &= strengthMask[strength];
b75a7d8f 189
46f4442e
A
190 *nextCE = CE;
191 *nextContCE = contCE;
b75a7d8f 192
46f4442e
A
193 while((*nextCE & strengthMask[strength]) == CE
194 && (*nextContCE & strengthMask[strength]) == contCE)
195 {
196 *nextCE = (*(CETable+3*(++iCE)));
197 *nextContCE = (*(CETable+3*(iCE)+1));
198 }
b75a7d8f 199
46f4442e 200 return iCE;
b75a7d8f
A
201}
202
46f4442e
A
203U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
204 uint32_t CE, uint32_t contCE,
205 uint32_t *prevCE, uint32_t *prevContCE,
206 uint32_t strength)
207{
208 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
209 int32_t iCE;
b75a7d8f 210
46f4442e 211 iCE = ucol_inv_findCE(src, CE, contCE);
b75a7d8f 212
46f4442e
A
213 if(iCE<0) {
214 *prevCE = UCOL_NOT_FOUND;
215 return -1;
216 }
b75a7d8f 217
46f4442e
A
218 CE &= strengthMask[strength];
219 contCE &= strengthMask[strength];
b75a7d8f 220
46f4442e
A
221 *prevCE = CE;
222 *prevContCE = contCE;
b75a7d8f 223
46f4442e
A
224 while((*prevCE & strengthMask[strength]) == CE
225 && (*prevContCE & strengthMask[strength])== contCE
226 && iCE > 0) /* this condition should prevent falling off the edge of the world */
227 {
228 /* here, we end up in a singularity - zero */
229 *prevCE = (*(CETable+3*(--iCE)));
230 *prevContCE = (*(CETable+3*(iCE)+1));
231 }
b75a7d8f 232
46f4442e 233 return iCE;
b75a7d8f
A
234}
235
46f4442e
A
236U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
237 uint32_t prevCE, uint32_t prevContCE)
73c04bcf
A
238{
239 if(prevCE == CE && prevContCE == contCE) {
46f4442e 240 return UCOL_IDENTICAL;
73c04bcf
A
241 }
242 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
46f4442e
A
243 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
244 {
245 return UCOL_PRIMARY;
374ca955 246 }
73c04bcf 247 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
46f4442e
A
248 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
249 {
250 return UCOL_SECONDARY;
73c04bcf 251 }
46f4442e 252 return UCOL_TERTIARY;
374ca955
A
253}
254
255
46f4442e 256/*static
374ca955 257inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
b75a7d8f 258
46f4442e
A
259 uint32_t CE = lh->baseCE;
260 uint32_t SecondCE = lh->baseContCE;
b75a7d8f 261
46f4442e
A
262 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
263 uint32_t previousCE, previousContCE;
264 int32_t iCE;
b75a7d8f 265
46f4442e 266 iCE = ucol_inv_findCE(src, CE, SecondCE);
b75a7d8f 267
46f4442e
A
268 if(iCE<0) {
269 return -1;
270 }
b75a7d8f 271
46f4442e
A
272 CE &= strengthMask[strength];
273 SecondCE &= strengthMask[strength];
b75a7d8f 274
46f4442e
A
275 previousCE = CE;
276 previousContCE = SecondCE;
b75a7d8f 277
46f4442e
A
278 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) {
279 previousCE = (*(CETable+3*(--iCE)));
280 previousContCE = (*(CETable+3*(iCE)+1));
281 }
282 lh->previousCE = previousCE;
283 lh->previousContCE = previousContCE;
b75a7d8f 284
46f4442e
A
285 return iCE;
286}*/
b75a7d8f
A
287
288static
374ca955 289inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
46f4442e
A
290 uint32_t CE = lh->baseCE;
291 uint32_t SecondCE = lh->baseContCE;
b75a7d8f 292
46f4442e
A
293 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
294 uint32_t nextCE, nextContCE;
295 int32_t iCE;
b75a7d8f 296
46f4442e 297 iCE = ucol_inv_findCE(src, CE, SecondCE);
b75a7d8f 298
46f4442e
A
299 if(iCE<0) {
300 return -1;
301 }
b75a7d8f 302
46f4442e
A
303 CE &= strengthMask[strength];
304 SecondCE &= strengthMask[strength];
b75a7d8f 305
46f4442e
A
306 nextCE = CE;
307 nextContCE = SecondCE;
b75a7d8f 308
46f4442e
A
309 while((nextCE & strengthMask[strength]) == CE
310 && (nextContCE & strengthMask[strength]) == SecondCE)
311 {
312 nextCE = (*(CETable+3*(++iCE)));
313 nextContCE = (*(CETable+3*(iCE)+1));
314 }
b75a7d8f 315
46f4442e
A
316 lh->nextCE = nextCE;
317 lh->nextContCE = nextContCE;
b75a7d8f 318
46f4442e 319 return iCE;
b75a7d8f
A
320}
321
46f4442e
A
322static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
323 /* reset all the gaps */
324 int32_t i = 0;
325 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
326 uint32_t st = 0;
327 uint32_t t1, t2;
328 int32_t pos;
329
330 UColToken *tok = lh->first;
331 uint32_t tokStrength = tok->strength;
332
333 for(i = 0; i<3; i++) {
334 lh->gapsHi[3*i] = 0;
335 lh->gapsHi[3*i+1] = 0;
336 lh->gapsHi[3*i+2] = 0;
337 lh->gapsLo[3*i] = 0;
338 lh->gapsLo[3*i+1] = 0;
339 lh->gapsLo[3*i+2] = 0;
340 lh->numStr[i] = 0;
341 lh->fStrToken[i] = NULL;
342 lh->lStrToken[i] = NULL;
343 lh->pos[i] = -1;
344 }
b75a7d8f 345
46f4442e
A
346 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
347
348 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
349 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
350 lh->pos[0] = 0;
351 t1 = lh->baseCE;
352 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
353 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
354 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
355 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
729e4ab9 356 uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
46f4442e
A
357 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
358
729e4ab9 359 t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
46f4442e
A
360 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
361
362 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
363 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
364 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
365 } else if(lh->indirect == TRUE && lh->nextCE != 0) {
366 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
367 lh->pos[0] = 0;
368 t1 = lh->baseCE;
369 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
370 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
371 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
372 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
373 t1 = lh->nextCE;
374 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
375 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
376 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
377 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
378 } else {
379 for(;;) {
380 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
381 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
382 lh->fStrToken[tokStrength] = tok;
383 } else { /* The CE must be implicit, since it's not in the table */
384 /* Error */
385 *status = U_INTERNAL_PROGRAM_ERROR;
386 }
387 }
388
389 while(tok != NULL && tok->strength >= tokStrength) {
390 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
391 lh->lStrToken[tokStrength] = tok;
392 }
393 tok = tok->next;
394 }
395 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
396 /* check if previous interval is the same and merge the intervals if it is so */
397 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
398 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
399 lh->fStrToken[tokStrength+1] = NULL;
400 lh->lStrToken[tokStrength+1] = NULL;
401 lh->pos[tokStrength+1] = -1;
402 }
403 }
404 if(tok != NULL) {
405 tokStrength = tok->strength;
406 } else {
407 break;
408 }
b75a7d8f 409 }
46f4442e
A
410 for(st = 0; st < 3; st++) {
411 if((pos = lh->pos[st]) >= 0) {
412 t1 = *(CETable+3*(pos));
413 t2 = *(CETable+3*(pos)+1);
414 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
415 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
416 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
417 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
418 //pos--;
419 //t1 = *(CETable+3*(pos));
420 //t2 = *(CETable+3*(pos)+1);
421 t1 = lh->baseCE;
422 t2 = lh->baseContCE;
423 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
424 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
425 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
426 }
b75a7d8f 427 }
b75a7d8f 428 }
b75a7d8f
A
429}
430
431
432#define ucol_countBytes(value, noOfBytes) \
433{ \
46f4442e
A
434 uint32_t mask = 0xFFFFFFFF; \
435 (noOfBytes) = 0; \
436 while(mask != 0) { \
b75a7d8f 437 if(((value) & mask) != 0) { \
46f4442e 438 (noOfBytes)++; \
b75a7d8f
A
439 } \
440 mask >>= 8; \
46f4442e 441 } \
b75a7d8f
A
442}
443
46f4442e
A
444static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
445 if(U_SUCCESS(*status)) {
446 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
447 }
448 return g->current;
b75a7d8f
A
449}
450
46f4442e
A
451static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
452 /* TODO: rename to enum names */
453 uint32_t high, low, count=1;
454 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
455
456 if(strength == UCOL_SECONDARY) {
457 low = UCOL_COMMON_TOP2<<24;
458 high = 0xFFFFFFFF;
459 count = 0xFF - UCOL_COMMON_TOP2;
460 } else {
461 low = UCOL_BYTE_COMMON << 24; //0x05000000;
462 high = 0x40000000;
463 count = 0x40 - UCOL_BYTE_COMMON;
464 }
465
466 if(tok->next != NULL && tok->next->strength == strength) {
467 count = tok->next->toInsert;
468 }
469
470 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
471 g->current = UCOL_BYTE_COMMON<<24;
472
473 if(g->noOfRanges == 0) {
474 *status = U_INTERNAL_PROGRAM_ERROR;
475 }
476 return g->current;
b75a7d8f
A
477}
478
46f4442e
A
479static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
480 uint32_t strength = tok->strength;
481 uint32_t low = lows[fStrength*3+strength];
482 uint32_t high = highs[fStrength*3+strength];
483 uint32_t maxByte = 0;
484 if(strength == UCOL_TERTIARY) {
485 maxByte = 0x3F;
486 } else if(strength == UCOL_PRIMARY) {
487 maxByte = 0xFE;
488 } else {
489 maxByte = 0xFF;
490 }
491
492 uint32_t count = tok->toInsert;
493
494 if(low >= high && strength > UCOL_PRIMARY) {
495 int32_t s = strength;
496 for(;;) {
497 s--;
498 if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
499 if(strength == UCOL_SECONDARY) {
500 if (low < UCOL_COMMON_TOP2<<24 ) {
501 // Override if low range is less than UCOL_COMMON_TOP2.
502 low = UCOL_COMMON_TOP2<<24;
503 }
504 high = 0xFFFFFFFF;
505 } else {
506 // Override if low range is less than UCOL_COMMON_BOT3.
507 if ( low < UCOL_COMMON_BOT3<<24 ) {
508 low = UCOL_COMMON_BOT3<<24;
509 }
510 high = 0x40000000;
511 }
512 break;
513 }
514 if(s<0) {
515 *status = U_INTERNAL_PROGRAM_ERROR;
516 return 0;
517 }
b75a7d8f 518 }
b75a7d8f 519 }
b75a7d8f 520
729e4ab9
A
521 if(low < 0x02000000) {
522 // We must not use CE weight byte 02, so we set it as the minimum lower bound.
523 // See http://site.icu-project.org/design/collation/bytes
524 low = 0x02000000;
46f4442e 525 }
b75a7d8f 526
46f4442e
A
527 if(strength == UCOL_SECONDARY) { /* similar as simple */
528 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
529 low = UCOL_COMMON_TOP2<<24;
530 }
531 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
532 high = UCOL_COMMON_TOP2<<24;
533 }
534 if(low < (UCOL_COMMON_BOT2<<24)) {
535 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
536 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
537 //g->current = UCOL_COMMON_BOT2<<24;
538 return g->current;
539 }
b75a7d8f 540 }
46f4442e
A
541
542 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
543 if(g->noOfRanges == 0) {
544 *status = U_INTERNAL_PROGRAM_ERROR;
b75a7d8f 545 }
46f4442e
A
546 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
547 return g->current;
b75a7d8f
A
548}
549
374ca955
A
550static
551uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
46f4442e
A
552 uint32_t i = 0;
553 UChar c;
554
555 if(U_FAILURE(*status)) {
556 return 0;
557 }
558
559 if(sourceLen > resLen) {
560 *status = U_MEMORY_ALLOCATION_ERROR;
561 return 0;
562 }
563
564 for(i = 0; i < sourceLen; i++) {
565 c = source[i];
566 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
567 switch(c - 0x3000) {
568 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
569 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
570 c++;
571 break;
572 case 0xF5:
573 c = 0x30AB;
574 break;
575 case 0xF6:
576 c = 0x30B1;
577 break;
578 }
579 }
580 resBuf[i] = c;
374ca955 581 }
46f4442e 582 return sourceLen;
374ca955
A
583}
584
585static
586uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
46f4442e
A
587 uint32_t i = 0;
588 UChar c;
589
590 if(U_FAILURE(*status)) {
591 return 0;
374ca955 592 }
46f4442e
A
593
594 if(sourceLen > resLen) {
595 *status = U_MEMORY_ALLOCATION_ERROR;
596 return 0;
597 }
598
599 for(i = 0; i < sourceLen; i++) {
600 c = source[i];
601 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
602 switch(c - 0x3000) {
603 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
604 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
605 c--;
606 break;
607 case 0xAB:
608 c = 0x30F5;
609 break;
610 case 0xB1:
611 c = 0x30F6;
612 break;
613 }
614 }
615 resBuf[i] = c;
616 }
617 return sourceLen;
374ca955
A
618}
619
729e4ab9
A
620U_NAMESPACE_BEGIN
621
374ca955
A
622static
623uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
46f4442e
A
624 uint32_t i = 0;
625 UChar n[128];
626 uint32_t nLen = 0;
627 uint32_t uCount = 0, lCount = 0;
628
629 collIterate s;
630 uint32_t order = 0;
631
632 if(U_FAILURE(*status)) {
374ca955 633 return UCOL_LOWER_CASE;
46f4442e
A
634 }
635
636 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
637 if(U_SUCCESS(*status)) {
638 for(i = 0; i < nLen; i++) {
729e4ab9 639 uprv_init_collIterate(UCA, &n[i], 1, &s, status);
46f4442e
A
640 order = ucol_getNextCE(UCA, &s, status);
641 if(isContinuation(order)) {
642 *status = U_INTERNAL_PROGRAM_ERROR;
643 return UCOL_LOWER_CASE;
644 }
645 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
646 uCount++;
647 } else {
648 if(u_islower(n[i])) {
649 lCount++;
729e4ab9 650 } else if(U_SUCCESS(*status)) {
46f4442e
A
651 UChar sk[1], lk[1];
652 u_toSmallKana(&n[i], 1, sk, 1, status);
653 u_toLargeKana(&n[i], 1, lk, 1, status);
654 if(sk[0] == n[i] && lk[0] != n[i]) {
655 lCount++;
656 }
657 }
658 }
374ca955 659 }
374ca955 660 }
46f4442e
A
661
662 if(uCount != 0 && lCount != 0) {
663 return UCOL_MIXED_CASE;
664 } else if(uCount != 0) {
665 return UCOL_UPPER_CASE;
666 } else {
667 return UCOL_LOWER_CASE;
668 }
374ca955
A
669}
670
671
672U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
46f4442e
A
673 /* this one makes the table and stuff */
674 uint32_t noOfBytes[3];
675 uint32_t i;
b75a7d8f 676
46f4442e
A
677 for(i = 0; i<3; i++) {
678 ucol_countBytes(CEparts[i], noOfBytes[i]);
679 }
b75a7d8f 680
46f4442e 681 /* Here we have to pack CEs from parts */
b75a7d8f 682
46f4442e
A
683 uint32_t CEi = 0;
684 uint32_t value = 0;
b75a7d8f 685
46f4442e
A
686 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
687 if(CEi > 0) {
688 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
689 } else {
690 value = 0;
691 }
b75a7d8f 692
46f4442e
A
693 if(2*CEi<noOfBytes[0]) {
694 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
695 }
696 if(CEi<noOfBytes[1]) {
697 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
698 }
699 if(CEi<noOfBytes[2]) {
700 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
701 }
702 tok->CEs[CEi] = value;
703 CEi++;
b75a7d8f 704 }
46f4442e
A
705 if(CEi == 0) { /* totally ignorable */
706 tok->noOfCEs = 1;
707 tok->CEs[0] = 0;
708 } else { /* there is at least something */
709 tok->noOfCEs = CEi;
b75a7d8f 710 }
46f4442e
A
711
712
713 // we want to set case bits here and now, not later.
714 // Case bits handling
715 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
716 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
717 int32_t cSize = (tok->source & 0xFF000000) >> 24;
718 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
719
720 if(cSize > 1) {
721 // Do it manually
722 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
723 } else {
724 // Copy it from the UCA
725 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
726 tok->CEs[0] |= (caseCE & 0xC0);
727 }
73c04bcf 728 }
374ca955 729
b75a7d8f 730#if UCOL_DEBUG==2
46f4442e
A
731 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
732 for(i = 0; i<tok->noOfCEs; i++) {
733 fprintf(stderr, "%08X ", tok->CEs[i]);
734 }
735 fprintf(stderr, "\n");
b75a7d8f
A
736#endif
737}
738
739U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
46f4442e
A
740 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
741 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
b75a7d8f 742
46f4442e
A
743 UColToken *tok = lh->last;
744 uint32_t t[UCOL_STRENGTH_LIMIT];
b75a7d8f 745
46f4442e 746 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
b75a7d8f 747
51004dcb
A
748 /* must initialize ranges to avoid memory check warnings */
749 for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) {
750 uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges));
751 }
752
46f4442e
A
753 tok->toInsert = 1;
754 t[tok->strength] = 1;
b75a7d8f 755
46f4442e
A
756 while(tok->previous != NULL) {
757 if(tok->previous->strength < tok->strength) { /* going up */
758 t[tok->strength] = 0;
759 t[tok->previous->strength]++;
760 } else if(tok->previous->strength > tok->strength) { /* going down */
761 t[tok->previous->strength] = 1;
762 } else {
763 t[tok->strength]++;
764 }
765 tok=tok->previous;
766 tok->toInsert = t[tok->strength];
b75a7d8f 767 }
b75a7d8f 768
46f4442e
A
769 tok->toInsert = t[tok->strength];
770 ucol_inv_getGapPositions(src, lh, status);
b75a7d8f
A
771
772#if UCOL_DEBUG
46f4442e
A
773 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
774 int32_t j = 2;
775 for(j = 2; j >= 0; j--) {
776 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
777 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
778 }
729e4ab9 779 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
b75a7d8f 780
46f4442e
A
781 do {
782 fprintf(stderr,"%i", tok->strength);
783 tok = tok->next;
784 } while(tok != NULL);
785 fprintf(stderr, "\n");
b75a7d8f 786
729e4ab9 787 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
b75a7d8f 788
46f4442e
A
789 do {
790 fprintf(stderr,"%i", tok->toInsert);
791 tok = tok->next;
792 } while(tok != NULL);
793#endif
b75a7d8f 794
46f4442e
A
795 tok = lh->first;
796 uint32_t fStrength = UCOL_IDENTICAL;
797 uint32_t initStrength = UCOL_IDENTICAL;
798
799
800 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
801 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
802 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
803
804 while (tok != NULL && U_SUCCESS(*status)) {
805 fStrength = tok->strength;
806 if(fStrength < initStrength) {
807 initStrength = fStrength;
808 if(lh->pos[fStrength] == -1) {
809 while(lh->pos[fStrength] == -1 && fStrength > 0) {
810 fStrength--;
811 }
812 if(lh->pos[fStrength] == -1) {
813 *status = U_INTERNAL_PROGRAM_ERROR;
814 return;
815 }
816 }
817 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
818 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
819 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
820 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
821 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
822 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
823 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
824 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
825 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
826 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
827 } else { /* primaries */
828 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
829 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
830 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
831 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
832 }
833 } else {
834 if(tok->strength == UCOL_TERTIARY) {
835 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
836 } else if(tok->strength == UCOL_SECONDARY) {
837 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
838 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
839 } else if(tok->strength == UCOL_PRIMARY) {
840 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
841 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
842 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
843 }
b75a7d8f 844 }
46f4442e
A
845 ucol_doCE(src, CEparts, tok, status);
846 tok = tok->next;
b75a7d8f 847 }
b75a7d8f
A
848}
849
b75a7d8f 850U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
46f4442e
A
851 UCAElements el;
852 UColToken *tok = lh->first;
853 UColToken *expt = NULL;
854 uint32_t i = 0, j = 0;
4388f060 855 const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
46f4442e
A
856
857 while(tok != NULL && U_SUCCESS(*status)) {
858 /* first, check if there are any expansions */
859 /* if there are expansions, we need to do a little bit more processing */
860 /* since parts of expansion can be tailored, while others are not */
861 if(tok->expansion != 0) {
862 uint32_t len = tok->expansion >> 24;
863 uint32_t currentSequenceLen = len;
864 uint32_t expOffset = tok->expansion & 0x00FFFFFF;
865 //uint32_t exp = currentSequenceLen | expOffset;
866 UColToken exp;
867 exp.source = currentSequenceLen | expOffset;
729e4ab9 868 exp.rulesToParseHdl = &(src->source);
46f4442e
A
869
870 while(len > 0) {
871 currentSequenceLen = len;
872 while(currentSequenceLen > 0) {
873 exp.source = (currentSequenceLen << 24) | expOffset;
874 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
875 uint32_t noOfCEsToCopy = expt->noOfCEs;
876 for(j = 0; j<noOfCEsToCopy; j++) {
877 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
878 }
879 tok->noOfExpCEs += noOfCEsToCopy;
880 // Smart people never try to add codepoints and CEs.
881 // For some odd reason, it won't work.
882 expOffset += currentSequenceLen; //noOfCEsToCopy;
883 len -= currentSequenceLen; //noOfCEsToCopy;
884 break;
885 } else {
886 currentSequenceLen--;
887 }
888 }
889 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
890 /* will have to get one from UCA */
891 /* first, get the UChars from the rules */
892 /* then pick CEs out until there is no more and stuff them into expansion */
893 collIterate s;
894 uint32_t order = 0;
729e4ab9 895 uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
46f4442e
A
896
897 for(;;) {
898 order = ucol_getNextCE(src->UCA, &s, status);
899 if(order == UCOL_NO_MORE_CES) {
900 break;
901 }
902 tok->expCEs[tok->noOfExpCEs++] = order;
903 }
904 expOffset++;
905 len--;
906 }
b75a7d8f 907 }
46f4442e
A
908 } else {
909 tok->noOfExpCEs = 0;
b75a7d8f 910 }
b75a7d8f 911
46f4442e
A
912 /* set the ucaelement with obtained values */
913 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
914 /* copy CEs */
915 for(i = 0; i<tok->noOfCEs; i++) {
916 el.CEs[i] = tok->CEs[i];
917 }
918 for(i = 0; i<tok->noOfExpCEs; i++) {
919 el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
920 }
b75a7d8f 921
46f4442e
A
922 /* copy UChars */
923 // We kept prefix and source kind of together, as it is a kind of a contraction.
924 // However, now we have to slice the prefix off the main thing -
925 el.prefix = el.prefixChars;
926 el.cPoints = el.uchars;
927 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
928 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
929 // decomposed elements to the unsaf table.
930 el.prefixSize = tok->prefix>>24;
931 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
932
933 el.cSize = (tok->source >> 24)-(tok->prefix>>24);
934 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
935 } else {
936 el.prefixSize = 0;
937 *el.prefix = 0;
b75a7d8f 938
46f4442e
A
939 el.cSize = (tok->source >> 24);
940 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
941 }
942 if(src->UCA != NULL) {
46f4442e
A
943 for(i = 0; i<el.cSize; i++) {
944 if(UCOL_ISJAMO(el.cPoints[i])) {
945 t->image->jamoSpecial = TRUE;
946 }
46f4442e 947 }
729e4ab9
A
948 if (!src->buildCCTabFlag && el.cSize > 0) {
949 // Check the trailing canonical combining class (tccc) of the last character.
950 const UChar *s = el.cPoints + el.cSize;
4388f060 951 uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s);
729e4ab9
A
952 if ((fcd & 0xff) != 0) {
953 src->buildCCTabFlag = TRUE;
954 }
46f4442e 955 }
b75a7d8f 956 }
b75a7d8f 957
46f4442e 958 /* and then, add it */
b75a7d8f 959#if UCOL_DEBUG==2
46f4442e 960 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
b75a7d8f 961#endif
46f4442e 962 uprv_uca_addAnElement(t, &el, status);
b75a7d8f 963
b75a7d8f 964#if UCOL_DEBUG_DUPLICATES
46f4442e
A
965 if(*status != U_ZERO_ERROR) {
966 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
967 *status = U_ZERO_ERROR;
968 }
b75a7d8f
A
969#endif
970
46f4442e
A
971 tok = tok->next;
972 }
b75a7d8f
A
973}
974
975U_CDECL_BEGIN
976static UBool U_CALLCONV
977_processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
46f4442e
A
978 UErrorCode status = U_ZERO_ERROR;
979 tempUCATable *t = (tempUCATable *)context;
980 if(value == 0) {
981 while(start < limit) {
982 uint32_t CE = utrie_get32(t->mapping, start, NULL);
983 if(CE == UCOL_NOT_FOUND) {
984 UCAElements el;
985 el.isThai = FALSE;
986 el.prefixSize = 0;
987 el.prefixChars[0] = 0;
988 el.prefix = el.prefixChars;
989 el.cPoints = el.uchars;
990
991 el.cSize = 0;
4388f060 992 U16_APPEND_UNSAFE(el.uchars, el.cSize, start);
46f4442e
A
993
994 el.noOfCEs = 1;
995 el.CEs[0] = 0;
996 uprv_uca_addAnElement(t, &el, &status);
b75a7d8f 997
46f4442e
A
998 }
999 start++;
1000 }
1001 }
1002 if(U_FAILURE(status)) {
1003 return FALSE;
1004 } else {
1005 return TRUE;
b75a7d8f 1006 }
b75a7d8f
A
1007}
1008U_CDECL_END
1009
46f4442e 1010static void
b75a7d8f 1011ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
46f4442e
A
1012 UChar32 start, UChar32 end,
1013 UErrorCode *status)
1014{
1015 //UChar decomp[256];
1016 uint32_t CE = UCOL_NOT_FOUND;
1017 UChar32 u = 0;
1018 UCAElements el;
1019 el.isThai = FALSE;
1020 el.prefixSize = 0;
1021 el.prefixChars[0] = 0;
1022 collIterate colIt;
1023
1024 if(U_SUCCESS(*status)) {
1025 for(u = start; u<=end; u++) {
1026 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1027 /* this test is for contractions that are missing the starting element. */
1028 || ((isCntTableElement(CE)) &&
1029 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
1030 )
1031 {
1032 el.cSize = 0;
1033 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1034 //decomp[0] = (UChar)u;
1035 //el.uchars[0] = (UChar)u;
1036 el.cPoints = el.uchars;
1037 //el.cSize = 1;
1038 el.noOfCEs = 0;
1039 el.prefix = el.prefixChars;
1040 el.prefixSize = 0;
1041 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1042 // We actually want to check whether this element is a special
1043 // If it is an implicit element (hangul, CJK - we want to copy the
1044 // special, not the resolved CEs) - for hangul, copying resolved
1045 // would just make things the same (there is an expansion and it
1046 // takes approximately the same amount of time to resolve as
1047 // falling back to the UCA).
1048 /*
1049 UTRIE_GET32(src->UCA->mapping, u, CE);
1050 tag = getCETag(CE);
1051 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1052 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1053 || tag == LEAD_SURROGATE_TAG) {
1054 el.CEs[el.noOfCEs++] = CE;
1055 } else {
1056 */
1057 // It turns out that it does not make sense to keep implicits
1058 // unresolved. The cost of resolving them is big enough so that
1059 // it doesn't make any difference whether we have to go to the UCA
1060 // or not.
1061 {
729e4ab9 1062 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
46f4442e
A
1063 while(CE != UCOL_NO_MORE_CES) {
1064 CE = ucol_getNextCE(src->UCA, &colIt, status);
1065 if(CE != UCOL_NO_MORE_CES) {
1066 el.CEs[el.noOfCEs++] = CE;
1067 }
1068 }
1069 }
1070 uprv_uca_addAnElement(t, &el, status);
b75a7d8f 1071 }
b75a7d8f 1072 }
b75a7d8f 1073 }
b75a7d8f
A
1074}
1075
729e4ab9
A
1076U_NAMESPACE_END
1077
1078U_CFUNC UCATableHeader *
1079ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
46f4442e 1080 U_NAMESPACE_USE
b75a7d8f 1081
46f4442e 1082 uint32_t i = 0;
b75a7d8f 1083 if(U_FAILURE(*status)) {
46f4442e 1084 return NULL;
b75a7d8f 1085 }
46f4442e
A
1086 /*
1087 2. Eliminate the negative lists by doing the following for each non-null negative list:
1088 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1089 create new ListHeader X
1090 o reverse the list, add to the end of X's positive list. Reset the strength of the
1091 first item you add, based on the stronger strength levels of the two lists.
1092 */
1093 /*
1094 3. For each ListHeader with a non-null positive list:
1095 */
1096 /*
1097 o Find all character strings with CEs between the baseCE and the
1098 next/previous CE, at the strength of the first token. Add these to the
1099 tailoring.
1100 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
1101 tailoring has & x < z...
1102 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
1103 */
1104 /* It is possible that this part should be done even while constructing list */
1105 /* The problem is that it is unknown what is going to be the strongest weight */
1106 /* So we might as well do it here */
1107
1108 /*
1109 o Allocate CEs for each token in the list, based on the total number N of the
1110 largest level difference, and the gap G between baseCE and nextCE at that
1111 level. The relation * between the last item and nextCE is the same as the
1112 strongest strength.
1113 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1114 ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1115 Then fit b and c into the secondary gap between a and d, then fit q
1116 into the tertiary gap between b and c.
1117
1118 o Example: baseCE << b <<< q << c * nextCE(X,2)
1119 ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1120 Then fit q into the tertiary gap between b and c.
1121 o When incrementing primary values, we will not cross high byte
1122 boundaries except where there is only a single-byte primary. That is to
1123 ensure that the script reordering will continue to work.
1124 */
1125 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1126 /* test for NULL */
1127 if (image == NULL) {
1128 *status = U_MEMORY_ALLOCATION_ERROR;
1129 return NULL;
b75a7d8f 1130 }
46f4442e
A
1131 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1132
1133 for(i = 0; i<src->resultLen; i++) {
1134 /* now we need to generate the CEs */
1135 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1136 /* According to strength */
1137 if(U_SUCCESS(*status)) {
1138 if(src->lh[i].first) { // if there are any elements
1139 // due to the way parser works, subsequent tailorings
1140 // may remove all the elements from a sequence, therefore
1141 // leaving an empty tailoring sequence.
1142 ucol_initBuffers(src, &src->lh[i], status);
1143 }
1144 }
1145 if(U_FAILURE(*status)) {
1146 uprv_free(image);
1147 return NULL;
1148 }
b75a7d8f 1149 }
46f4442e
A
1150
1151 if(src->varTop != NULL) { /* stuff the variable top value */
1152 src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1153 /* remove it from the list */
1154 if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1155 src->varTop->listHeader->first = src->varTop->next;
1156 }
1157 if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1158 src->varTop->listHeader->last = src->varTop->previous;
1159 }
1160 if(src->varTop->next != NULL) {
1161 src->varTop->next->previous = src->varTop->previous;
1162 }
1163 if(src->varTop->previous != NULL) {
1164 src->varTop->previous->next = src->varTop->next;
1165 }
b75a7d8f 1166 }
b75a7d8f
A
1167
1168
46f4442e
A
1169 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1170 if(U_FAILURE(*status)) {
1171 uprv_free(image);
1172 return NULL;
1173 }
b75a7d8f
A
1174
1175
46f4442e
A
1176 /* After this, we have assigned CE values to all regular CEs */
1177 /* now we will go through list once more and resolve expansions, */
1178 /* make UCAElements structs and add them to table */
1179 for(i = 0; i<src->resultLen; i++) {
1180 /* now we need to generate the CEs */
1181 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1182 /* According to strength */
1183 if(U_SUCCESS(*status)) {
1184 ucol_createElements(src, t, &src->lh[i], status);
1185 }
b75a7d8f 1186 }
b75a7d8f 1187
46f4442e
A
1188 UCAElements el;
1189 el.isThai = FALSE;
1190 el.prefixSize = 0;
1191 el.prefixChars[0] = 0;
b75a7d8f 1192
46f4442e
A
1193 /* add latin-1 stuff */
1194 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
b75a7d8f 1195
46f4442e
A
1196 /* add stuff for copying */
1197 if(src->copySet != NULL) {
1198 int32_t i = 0;
1199 UnicodeSet *set = (UnicodeSet *)src->copySet;
1200 for(i = 0; i < set->getRangeCount(); i++) {
1201 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
b75a7d8f 1202 }
46f4442e 1203 }
b75a7d8f 1204
46f4442e
A
1205 if(U_SUCCESS(*status)) {
1206 /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1207
1208 uint32_t tailoredCE = UCOL_NOT_FOUND;
46f4442e 1209 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
4388f060 1210 int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth;
46f4442e
A
1211 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1212 // Check for null pointer
1213 if (ucaEl == NULL) {
4388f060
A
1214 *status = U_MEMORY_ALLOCATION_ERROR;
1215 return NULL;
b75a7d8f 1216 }
46f4442e 1217 while(*conts != 0) {
4388f060
A
1218 // A continuation is NUL-terminated and NUL-padded
1219 // except if it has the maximum length.
1220 int32_t contractionLength = maxUCAContractionLength;
1221 while(contractionLength > 0 && conts[contractionLength - 1] == 0) {
1222 --contractionLength;
1223 }
1224 UChar32 first;
1225 int32_t firstLength = 0;
1226 U16_NEXT(conts, firstLength, contractionLength, first);
1227 tailoredCE = utrie_get32(t->mapping, first, NULL);
46f4442e
A
1228 if(tailoredCE != UCOL_NOT_FOUND) {
1229 UBool needToAdd = TRUE;
1230 if(isCntTableElement(tailoredCE)) {
4388f060 1231 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) {
46f4442e
A
1232 needToAdd = FALSE;
1233 }
1234 }
1235 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1236 UCAElements elm;
1237 elm.cPoints = el.uchars;
1238 elm.noOfCEs = 0;
1239 elm.uchars[0] = *conts;
1240 elm.uchars[1] = 0;
1241 elm.cSize = 1;
1242 elm.prefixChars[0] = *(conts+2);
1243 elm.isThai = FALSE;
1244 elm.prefix = elm.prefixChars;
1245 elm.prefixSize = 1;
1246 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
1247 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1248 needToAdd = TRUE;
1249 }
1250 }
4388f060 1251 if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
46f4442e
A
1252 needToAdd = FALSE;
1253 }
1254
1255 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1256 if (*(conts+1) != 0) { // contractions
1257 el.prefix = el.prefixChars;
1258 el.prefixSize = 0;
1259 el.cPoints = el.uchars;
1260 el.noOfCEs = 0;
4388f060
A
1261 u_memcpy(el.uchars, conts, contractionLength);
1262 el.cSize = contractionLength;
46f4442e
A
1263 ucol_setText(ucaEl, el.uchars, el.cSize, status);
1264 }
1265 else { // pre-context character
1266 UChar str[4] = { 0 };
1267 int32_t len=0;
1268 int32_t preKeyLen=0;
1269
1270 el.cPoints = el.uchars;
1271 el.noOfCEs = 0;
1272 el.uchars[0] = *conts;
1273 el.uchars[1] = 0;
1274 el.cSize = 1;
1275 el.prefixChars[0] = *(conts+2);
1276 el.prefix = el.prefixChars;
1277 el.prefixSize = 1;
1278 if (el.prefixChars[0]!=0) {
1279 // get CE of prefix character first
1280 str[0]=el.prefixChars[0];
1281 str[1]=0;
1282 ucol_setText(ucaEl, str, 1, status);
1283 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
1284 != UCOL_NULLORDER) {
1285 preKeyLen++; // count number of keys for prefix character
1286 }
1287 str[len++] = el.prefixChars[0];
1288 }
1289
1290 str[len++] = el.uchars[0];
1291 str[len]=0;
1292 ucol_setText(ucaEl, str, len, status);
1293 // Skip the keys for prefix character, then copy the rest to el.
1294 while ((preKeyLen-->0) &&
1295 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1296 continue;
1297 }
1298
1299 }
1300 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1301 el.noOfCEs++;
1302 }
1303 uprv_uca_addAnElement(t, &el, status);
1304 }
1305
4388f060
A
1306 } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
1307 ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status);
46f4442e 1308 }
4388f060 1309 conts+=maxUCAContractionLength;
46f4442e
A
1310 }
1311 ucol_closeElements(ucaEl);
b75a7d8f 1312 }
b75a7d8f 1313
46f4442e
A
1314 // Add completely ignorable elements
1315 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
b75a7d8f 1316
46f4442e 1317 // add tailoring characters related canonical closures
729e4ab9 1318 uprv_uca_canonicalClosure(t, src, NULL, status);
b75a7d8f
A
1319
1320 /* still need to produce compatibility closure */
1321
46f4442e 1322 UCATableHeader *myData = uprv_uca_assembleTable(t, status);
b75a7d8f 1323
46f4442e
A
1324 uprv_uca_closeTempTable(t);
1325 uprv_free(image);
b75a7d8f 1326
46f4442e 1327 return myData;
b75a7d8f
A
1328}
1329
374ca955
A
1330U_CDECL_BEGIN
1331static UBool U_CALLCONV
b75a7d8f
A
1332ucol_bld_cleanup(void)
1333{
1334 udata_close(invUCA_DATA_MEM);
1335 invUCA_DATA_MEM = NULL;
374ca955 1336 _staticInvUCA = NULL;
b75a7d8f
A
1337 return TRUE;
1338}
374ca955 1339U_CDECL_END
b75a7d8f
A
1340
1341U_CAPI const InverseUCATableHeader * U_EXPORT2
1342ucol_initInverseUCA(UErrorCode *status)
1343{
1344 if(U_FAILURE(*status)) return NULL;
1345
46f4442e
A
1346 UBool needsInit;
1347 UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
1348
1349 if(needsInit) {
b75a7d8f 1350 InverseUCATableHeader *newInvUCA = NULL;
729e4ab9 1351 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
46f4442e 1352
b75a7d8f
A
1353 if(U_FAILURE(*status)) {
1354 if (result) {
1355 udata_close(result);
1356 }
1357 // This is not needed, as we are talking about
1358 // memory we got from UData
1359 //uprv_free(newInvUCA);
1360 }
46f4442e 1361
b75a7d8f
A
1362 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1363 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1364 UCollator *UCA = ucol_initUCA(status);
1365 // UCA versions of UCA and inverse UCA should match
1366 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
46f4442e
A
1367 *status = U_INVALID_FORMAT_ERROR;
1368 udata_close(result);
1369 return NULL;
b75a7d8f 1370 }
46f4442e 1371
b75a7d8f 1372 umtx_lock(NULL);
374ca955 1373 if(_staticInvUCA == NULL) {
b75a7d8f 1374 invUCA_DATA_MEM = result;
729e4ab9 1375 _staticInvUCA = newInvUCA;
b75a7d8f
A
1376 result = NULL;
1377 newInvUCA = NULL;
1378 }
1379 umtx_unlock(NULL);
46f4442e 1380
b75a7d8f
A
1381 if(newInvUCA != NULL) {
1382 udata_close(result);
1383 // This is not needed, as we are talking about
1384 // memory we got from UData
1385 //uprv_free(newInvUCA);
1386 }
1387 else {
374ca955 1388 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
b75a7d8f
A
1389 }
1390 }
1391 }
374ca955 1392 return _staticInvUCA;
b75a7d8f
A
1393}
1394
729e4ab9
A
1395/* This is the data that is used for non-script reordering codes. These _must_ be kept
1396 * in order that they are to be applied as defaults and in synch with the UColReorderCode enum.
1397 */
51004dcb 1398static const char * const ReorderingTokenNames[] = {
729e4ab9
A
1399 "SPACE",
1400 "PUNCT",
1401 "SYMBOL",
1402 "CURRENCY",
51004dcb 1403 "DIGIT"
729e4ab9
A
1404};
1405
1406static void toUpper(const char* src, char* dst, uint32_t length) {
1407 for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
4388f060 1408 *dst = uprv_toupper(*src);
729e4ab9
A
1409 }
1410 *dst = '\0';
1411}
1412
1413U_INTERNAL int32_t U_EXPORT2
1414ucol_findReorderingEntry(const char* name) {
1415 char buffer[32];
1416 toUpper(name, buffer, 32);
51004dcb 1417 for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) {
729e4ab9
A
1418 if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
1419 return entry + UCOL_REORDER_CODE_FIRST;
1420 }
1421 }
1422 return USCRIPT_INVALID_CODE;
1423}
1424
b75a7d8f 1425#endif /* #if !UCONFIG_NO_COLLATION */