]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/ucol_bld.cpp
ICU-461.12.tar.gz
[apple/icu.git] / icuSources / i18n / ucol_bld.cpp
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
3*
729e4ab9 4* Copyright (C) 2001-2011, International Business Machines
b75a7d8f
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: ucol_bld.cpp
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created 02/22/2001
14* created by: Vladimir Weinstein
15*
16* This module builds a collator based on the rule set.
46f4442e 17*
b75a7d8f
A
18*/
19
20#include "unicode/utypes.h"
21
22#if !UCONFIG_NO_COLLATION
23
24#include "unicode/ucoleitr.h"
46f4442e 25#include "unicode/udata.h"
b75a7d8f 26#include "unicode/uchar.h"
b75a7d8f 27#include "unicode/uniset.h"
729e4ab9
A
28#include "unicode/uscript.h"
29#include "unicode/ustring.h"
30#include "normalizer2impl.h"
46f4442e
A
31#include "ucol_bld.h"
32#include "ucol_elm.h"
33#include "ucol_cnt.h"
34#include "ucln_in.h"
35#include "umutex.h"
46f4442e 36#include "cmemory.h"
729e4ab9 37#include "cstring.h"
b75a7d8f 38
374ca955 39static const InverseUCATableHeader* _staticInvUCA = NULL;
b75a7d8f
A
40static UDataMemory* invUCA_DATA_MEM = NULL;
41
42U_CDECL_BEGIN
43static UBool U_CALLCONV
46f4442e
A
44isAcceptableInvUCA(void * /*context*/,
45 const char * /*type*/, const char * /*name*/,
46 const UDataInfo *pInfo)
47{
48 /* context, type & name are intentionally not used */
b75a7d8f
A
49 if( pInfo->size>=20 &&
50 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
51 pInfo->charsetFamily==U_CHARSET_FAMILY &&
374ca955
A
52 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */
53 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
54 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
55 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
56 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
57 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
58 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
59 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
60 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
46f4442e
A
61 )
62 {
b75a7d8f
A
63 UVersionInfo UCDVersion;
64 u_getUnicodeVersion(UCDVersion);
46f4442e
A
65 return (pInfo->dataVersion[0]==UCDVersion[0] &&
66 pInfo->dataVersion[1]==UCDVersion[1]);
67 //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
68 //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
69 //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
b75a7d8f
A
70 } else {
71 return FALSE;
72 }
73}
74U_CDECL_END
75
46f4442e
A
76/*
77* Takes two CEs (lead and continuation) and
78* compares them as CEs should be compared:
79* primary vs. primary, secondary vs. secondary
80* tertiary vs. tertiary
81*/
73c04bcf 82static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
46f4442e
A
83 uint32_t s1 = source0, s2, t1 = target0, t2;
84 if(isContinuation(source1)) {
85 s2 = source1;
86 } else {
87 s2 = 0;
88 }
89 if(isContinuation(target1)) {
90 t2 = target1;
73c04bcf 91 } else {
46f4442e
A
92 t2 = 0;
93 }
94
95 uint32_t s = 0, t = 0;
96 if(s1 == t1 && s2 == t2) {
97 return 0;
98 }
99 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
100 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
101 if(s < t) {
73c04bcf 102 return -1;
46f4442e 103 } else if(s > t) {
73c04bcf 104 return 1;
46f4442e
A
105 } else {
106 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
107 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
108 if(s < t) {
109 return -1;
110 } else if(s > t) {
111 return 1;
112 } else {
113 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
114 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
115 if(s < t) {
116 return -1;
117 } else {
118 return 1;
119 }
120 }
73c04bcf 121 }
73c04bcf
A
122}
123
b75a7d8f 124static
374ca955 125int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
46f4442e
A
126 uint32_t bottom = 0, top = src->invUCA->tableSize;
127 uint32_t i = 0;
128 uint32_t first = 0, second = 0;
129 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
130 int32_t res = 0;
131
132 while(bottom < top-1) {
133 i = (top+bottom)/2;
134 first = *(CETable+3*i);
135 second = *(CETable+3*i+1);
136 res = compareCEs(first, second, CE, SecondCE);
137 if(res > 0) {
138 top = i;
139 } else if(res < 0) {
140 bottom = i;
141 } else {
142 break;
143 }
b75a7d8f 144 }
46f4442e
A
145
146 /* weiv: */
147 /* in searching for elements, I have removed the failure */
148 /* The reason for this is that the builder does not rely */
149 /* on search mechanism telling it that it didn't find an */
150 /* element. However, indirect positioning relies on being */
151 /* able to find the elements around any CE, even if it is */
152 /* not defined in the UCA. */
153 return i;
154 /*
155 if((first == CE && second == SecondCE)) {
b75a7d8f 156 return i;
46f4442e 157 } else {
b75a7d8f 158 return -1;
46f4442e
A
159 }
160 */
b75a7d8f
A
161}
162
163static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
46f4442e
A
164 0xFFFF0000,
165 0xFFFFFF00,
166 0xFFFFFFFF
b75a7d8f
A
167};
168
374ca955 169U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
46f4442e
A
170 uint32_t CE, uint32_t contCE,
171 uint32_t *nextCE, uint32_t *nextContCE,
172 uint32_t strength)
173{
174 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
175 int32_t iCE;
b75a7d8f 176
46f4442e 177 iCE = ucol_inv_findCE(src, CE, contCE);
b75a7d8f 178
46f4442e
A
179 if(iCE<0) {
180 *nextCE = UCOL_NOT_FOUND;
181 return -1;
182 }
b75a7d8f 183
46f4442e
A
184 CE &= strengthMask[strength];
185 contCE &= strengthMask[strength];
b75a7d8f 186
46f4442e
A
187 *nextCE = CE;
188 *nextContCE = contCE;
b75a7d8f 189
46f4442e
A
190 while((*nextCE & strengthMask[strength]) == CE
191 && (*nextContCE & strengthMask[strength]) == contCE)
192 {
193 *nextCE = (*(CETable+3*(++iCE)));
194 *nextContCE = (*(CETable+3*(iCE)+1));
195 }
b75a7d8f 196
46f4442e 197 return iCE;
b75a7d8f
A
198}
199
46f4442e
A
200U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
201 uint32_t CE, uint32_t contCE,
202 uint32_t *prevCE, uint32_t *prevContCE,
203 uint32_t strength)
204{
205 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
206 int32_t iCE;
b75a7d8f 207
46f4442e 208 iCE = ucol_inv_findCE(src, CE, contCE);
b75a7d8f 209
46f4442e
A
210 if(iCE<0) {
211 *prevCE = UCOL_NOT_FOUND;
212 return -1;
213 }
b75a7d8f 214
46f4442e
A
215 CE &= strengthMask[strength];
216 contCE &= strengthMask[strength];
b75a7d8f 217
46f4442e
A
218 *prevCE = CE;
219 *prevContCE = contCE;
b75a7d8f 220
46f4442e
A
221 while((*prevCE & strengthMask[strength]) == CE
222 && (*prevContCE & strengthMask[strength])== contCE
223 && iCE > 0) /* this condition should prevent falling off the edge of the world */
224 {
225 /* here, we end up in a singularity - zero */
226 *prevCE = (*(CETable+3*(--iCE)));
227 *prevContCE = (*(CETable+3*(iCE)+1));
228 }
b75a7d8f 229
46f4442e 230 return iCE;
b75a7d8f
A
231}
232
46f4442e
A
233U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
234 uint32_t prevCE, uint32_t prevContCE)
73c04bcf
A
235{
236 if(prevCE == CE && prevContCE == contCE) {
46f4442e 237 return UCOL_IDENTICAL;
73c04bcf
A
238 }
239 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
46f4442e
A
240 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
241 {
242 return UCOL_PRIMARY;
374ca955 243 }
73c04bcf 244 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
46f4442e
A
245 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
246 {
247 return UCOL_SECONDARY;
73c04bcf 248 }
46f4442e 249 return UCOL_TERTIARY;
374ca955
A
250}
251
252
46f4442e 253/*static
374ca955 254inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
b75a7d8f 255
46f4442e
A
256 uint32_t CE = lh->baseCE;
257 uint32_t SecondCE = lh->baseContCE;
b75a7d8f 258
46f4442e
A
259 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
260 uint32_t previousCE, previousContCE;
261 int32_t iCE;
b75a7d8f 262
46f4442e 263 iCE = ucol_inv_findCE(src, CE, SecondCE);
b75a7d8f 264
46f4442e
A
265 if(iCE<0) {
266 return -1;
267 }
b75a7d8f 268
46f4442e
A
269 CE &= strengthMask[strength];
270 SecondCE &= strengthMask[strength];
b75a7d8f 271
46f4442e
A
272 previousCE = CE;
273 previousContCE = SecondCE;
b75a7d8f 274
46f4442e
A
275 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) {
276 previousCE = (*(CETable+3*(--iCE)));
277 previousContCE = (*(CETable+3*(iCE)+1));
278 }
279 lh->previousCE = previousCE;
280 lh->previousContCE = previousContCE;
b75a7d8f 281
46f4442e
A
282 return iCE;
283}*/
b75a7d8f
A
284
285static
374ca955 286inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
46f4442e
A
287 uint32_t CE = lh->baseCE;
288 uint32_t SecondCE = lh->baseContCE;
b75a7d8f 289
46f4442e
A
290 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
291 uint32_t nextCE, nextContCE;
292 int32_t iCE;
b75a7d8f 293
46f4442e 294 iCE = ucol_inv_findCE(src, CE, SecondCE);
b75a7d8f 295
46f4442e
A
296 if(iCE<0) {
297 return -1;
298 }
b75a7d8f 299
46f4442e
A
300 CE &= strengthMask[strength];
301 SecondCE &= strengthMask[strength];
b75a7d8f 302
46f4442e
A
303 nextCE = CE;
304 nextContCE = SecondCE;
b75a7d8f 305
46f4442e
A
306 while((nextCE & strengthMask[strength]) == CE
307 && (nextContCE & strengthMask[strength]) == SecondCE)
308 {
309 nextCE = (*(CETable+3*(++iCE)));
310 nextContCE = (*(CETable+3*(iCE)+1));
311 }
b75a7d8f 312
46f4442e
A
313 lh->nextCE = nextCE;
314 lh->nextContCE = nextContCE;
b75a7d8f 315
46f4442e 316 return iCE;
b75a7d8f
A
317}
318
46f4442e
A
319static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
320 /* reset all the gaps */
321 int32_t i = 0;
322 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
323 uint32_t st = 0;
324 uint32_t t1, t2;
325 int32_t pos;
326
327 UColToken *tok = lh->first;
328 uint32_t tokStrength = tok->strength;
329
330 for(i = 0; i<3; i++) {
331 lh->gapsHi[3*i] = 0;
332 lh->gapsHi[3*i+1] = 0;
333 lh->gapsHi[3*i+2] = 0;
334 lh->gapsLo[3*i] = 0;
335 lh->gapsLo[3*i+1] = 0;
336 lh->gapsLo[3*i+2] = 0;
337 lh->numStr[i] = 0;
338 lh->fStrToken[i] = NULL;
339 lh->lStrToken[i] = NULL;
340 lh->pos[i] = -1;
341 }
b75a7d8f 342
46f4442e
A
343 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
344
345 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
346 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
347 lh->pos[0] = 0;
348 t1 = lh->baseCE;
349 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
350 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
351 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
352 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
729e4ab9 353 uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
46f4442e
A
354 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
355
729e4ab9 356 t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
46f4442e
A
357 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
358
359 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
360 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
361 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
362 } else if(lh->indirect == TRUE && lh->nextCE != 0) {
363 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
364 lh->pos[0] = 0;
365 t1 = lh->baseCE;
366 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
367 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
368 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
369 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
370 t1 = lh->nextCE;
371 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
372 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
373 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
374 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
375 } else {
376 for(;;) {
377 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
378 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
379 lh->fStrToken[tokStrength] = tok;
380 } else { /* The CE must be implicit, since it's not in the table */
381 /* Error */
382 *status = U_INTERNAL_PROGRAM_ERROR;
383 }
384 }
385
386 while(tok != NULL && tok->strength >= tokStrength) {
387 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
388 lh->lStrToken[tokStrength] = tok;
389 }
390 tok = tok->next;
391 }
392 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
393 /* check if previous interval is the same and merge the intervals if it is so */
394 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
395 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
396 lh->fStrToken[tokStrength+1] = NULL;
397 lh->lStrToken[tokStrength+1] = NULL;
398 lh->pos[tokStrength+1] = -1;
399 }
400 }
401 if(tok != NULL) {
402 tokStrength = tok->strength;
403 } else {
404 break;
405 }
b75a7d8f 406 }
46f4442e
A
407 for(st = 0; st < 3; st++) {
408 if((pos = lh->pos[st]) >= 0) {
409 t1 = *(CETable+3*(pos));
410 t2 = *(CETable+3*(pos)+1);
411 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
412 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
413 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
414 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
415 //pos--;
416 //t1 = *(CETable+3*(pos));
417 //t2 = *(CETable+3*(pos)+1);
418 t1 = lh->baseCE;
419 t2 = lh->baseContCE;
420 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
421 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
422 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
423 }
b75a7d8f 424 }
b75a7d8f 425 }
b75a7d8f
A
426}
427
428
429#define ucol_countBytes(value, noOfBytes) \
430{ \
46f4442e
A
431 uint32_t mask = 0xFFFFFFFF; \
432 (noOfBytes) = 0; \
433 while(mask != 0) { \
b75a7d8f 434 if(((value) & mask) != 0) { \
46f4442e 435 (noOfBytes)++; \
b75a7d8f
A
436 } \
437 mask >>= 8; \
46f4442e 438 } \
b75a7d8f
A
439}
440
46f4442e
A
441static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
442 if(U_SUCCESS(*status)) {
443 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
444 }
445 return g->current;
b75a7d8f
A
446}
447
46f4442e
A
448static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
449 /* TODO: rename to enum names */
450 uint32_t high, low, count=1;
451 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
452
453 if(strength == UCOL_SECONDARY) {
454 low = UCOL_COMMON_TOP2<<24;
455 high = 0xFFFFFFFF;
456 count = 0xFF - UCOL_COMMON_TOP2;
457 } else {
458 low = UCOL_BYTE_COMMON << 24; //0x05000000;
459 high = 0x40000000;
460 count = 0x40 - UCOL_BYTE_COMMON;
461 }
462
463 if(tok->next != NULL && tok->next->strength == strength) {
464 count = tok->next->toInsert;
465 }
466
467 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
468 g->current = UCOL_BYTE_COMMON<<24;
469
470 if(g->noOfRanges == 0) {
471 *status = U_INTERNAL_PROGRAM_ERROR;
472 }
473 return g->current;
b75a7d8f
A
474}
475
46f4442e
A
476static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
477 uint32_t strength = tok->strength;
478 uint32_t low = lows[fStrength*3+strength];
479 uint32_t high = highs[fStrength*3+strength];
480 uint32_t maxByte = 0;
481 if(strength == UCOL_TERTIARY) {
482 maxByte = 0x3F;
483 } else if(strength == UCOL_PRIMARY) {
484 maxByte = 0xFE;
485 } else {
486 maxByte = 0xFF;
487 }
488
489 uint32_t count = tok->toInsert;
490
491 if(low >= high && strength > UCOL_PRIMARY) {
492 int32_t s = strength;
493 for(;;) {
494 s--;
495 if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
496 if(strength == UCOL_SECONDARY) {
497 if (low < UCOL_COMMON_TOP2<<24 ) {
498 // Override if low range is less than UCOL_COMMON_TOP2.
499 low = UCOL_COMMON_TOP2<<24;
500 }
501 high = 0xFFFFFFFF;
502 } else {
503 // Override if low range is less than UCOL_COMMON_BOT3.
504 if ( low < UCOL_COMMON_BOT3<<24 ) {
505 low = UCOL_COMMON_BOT3<<24;
506 }
507 high = 0x40000000;
508 }
509 break;
510 }
511 if(s<0) {
512 *status = U_INTERNAL_PROGRAM_ERROR;
513 return 0;
514 }
b75a7d8f 515 }
b75a7d8f 516 }
b75a7d8f 517
729e4ab9
A
518 if(low < 0x02000000) {
519 // We must not use CE weight byte 02, so we set it as the minimum lower bound.
520 // See http://site.icu-project.org/design/collation/bytes
521 low = 0x02000000;
46f4442e 522 }
b75a7d8f 523
46f4442e
A
524 if(strength == UCOL_SECONDARY) { /* similar as simple */
525 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
526 low = UCOL_COMMON_TOP2<<24;
527 }
528 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
529 high = UCOL_COMMON_TOP2<<24;
530 }
531 if(low < (UCOL_COMMON_BOT2<<24)) {
532 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
533 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
534 //g->current = UCOL_COMMON_BOT2<<24;
535 return g->current;
536 }
b75a7d8f 537 }
46f4442e
A
538
539 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
540 if(g->noOfRanges == 0) {
541 *status = U_INTERNAL_PROGRAM_ERROR;
b75a7d8f 542 }
46f4442e
A
543 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
544 return g->current;
b75a7d8f
A
545}
546
374ca955
A
547static
548uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
46f4442e
A
549 uint32_t i = 0;
550 UChar c;
551
552 if(U_FAILURE(*status)) {
553 return 0;
554 }
555
556 if(sourceLen > resLen) {
557 *status = U_MEMORY_ALLOCATION_ERROR;
558 return 0;
559 }
560
561 for(i = 0; i < sourceLen; i++) {
562 c = source[i];
563 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
564 switch(c - 0x3000) {
565 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
566 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
567 c++;
568 break;
569 case 0xF5:
570 c = 0x30AB;
571 break;
572 case 0xF6:
573 c = 0x30B1;
574 break;
575 }
576 }
577 resBuf[i] = c;
374ca955 578 }
46f4442e 579 return sourceLen;
374ca955
A
580}
581
582static
583uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
46f4442e
A
584 uint32_t i = 0;
585 UChar c;
586
587 if(U_FAILURE(*status)) {
588 return 0;
374ca955 589 }
46f4442e
A
590
591 if(sourceLen > resLen) {
592 *status = U_MEMORY_ALLOCATION_ERROR;
593 return 0;
594 }
595
596 for(i = 0; i < sourceLen; i++) {
597 c = source[i];
598 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
599 switch(c - 0x3000) {
600 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
601 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
602 c--;
603 break;
604 case 0xAB:
605 c = 0x30F5;
606 break;
607 case 0xB1:
608 c = 0x30F6;
609 break;
610 }
611 }
612 resBuf[i] = c;
613 }
614 return sourceLen;
374ca955
A
615}
616
729e4ab9
A
617U_NAMESPACE_BEGIN
618
374ca955
A
619static
620uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
46f4442e
A
621 uint32_t i = 0;
622 UChar n[128];
623 uint32_t nLen = 0;
624 uint32_t uCount = 0, lCount = 0;
625
626 collIterate s;
627 uint32_t order = 0;
628
629 if(U_FAILURE(*status)) {
374ca955 630 return UCOL_LOWER_CASE;
46f4442e
A
631 }
632
633 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
634 if(U_SUCCESS(*status)) {
635 for(i = 0; i < nLen; i++) {
729e4ab9 636 uprv_init_collIterate(UCA, &n[i], 1, &s, status);
46f4442e
A
637 order = ucol_getNextCE(UCA, &s, status);
638 if(isContinuation(order)) {
639 *status = U_INTERNAL_PROGRAM_ERROR;
640 return UCOL_LOWER_CASE;
641 }
642 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
643 uCount++;
644 } else {
645 if(u_islower(n[i])) {
646 lCount++;
729e4ab9 647 } else if(U_SUCCESS(*status)) {
46f4442e
A
648 UChar sk[1], lk[1];
649 u_toSmallKana(&n[i], 1, sk, 1, status);
650 u_toLargeKana(&n[i], 1, lk, 1, status);
651 if(sk[0] == n[i] && lk[0] != n[i]) {
652 lCount++;
653 }
654 }
655 }
374ca955 656 }
374ca955 657 }
46f4442e
A
658
659 if(uCount != 0 && lCount != 0) {
660 return UCOL_MIXED_CASE;
661 } else if(uCount != 0) {
662 return UCOL_UPPER_CASE;
663 } else {
664 return UCOL_LOWER_CASE;
665 }
374ca955
A
666}
667
668
669U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
46f4442e
A
670 /* this one makes the table and stuff */
671 uint32_t noOfBytes[3];
672 uint32_t i;
b75a7d8f 673
46f4442e
A
674 for(i = 0; i<3; i++) {
675 ucol_countBytes(CEparts[i], noOfBytes[i]);
676 }
b75a7d8f 677
46f4442e 678 /* Here we have to pack CEs from parts */
b75a7d8f 679
46f4442e
A
680 uint32_t CEi = 0;
681 uint32_t value = 0;
b75a7d8f 682
46f4442e
A
683 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
684 if(CEi > 0) {
685 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
686 } else {
687 value = 0;
688 }
b75a7d8f 689
46f4442e
A
690 if(2*CEi<noOfBytes[0]) {
691 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
692 }
693 if(CEi<noOfBytes[1]) {
694 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
695 }
696 if(CEi<noOfBytes[2]) {
697 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
698 }
699 tok->CEs[CEi] = value;
700 CEi++;
b75a7d8f 701 }
46f4442e
A
702 if(CEi == 0) { /* totally ignorable */
703 tok->noOfCEs = 1;
704 tok->CEs[0] = 0;
705 } else { /* there is at least something */
706 tok->noOfCEs = CEi;
b75a7d8f 707 }
46f4442e
A
708
709
710 // we want to set case bits here and now, not later.
711 // Case bits handling
712 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
713 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
714 int32_t cSize = (tok->source & 0xFF000000) >> 24;
715 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
716
717 if(cSize > 1) {
718 // Do it manually
719 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
720 } else {
721 // Copy it from the UCA
722 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
723 tok->CEs[0] |= (caseCE & 0xC0);
724 }
73c04bcf 725 }
374ca955 726
b75a7d8f 727#if UCOL_DEBUG==2
46f4442e
A
728 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
729 for(i = 0; i<tok->noOfCEs; i++) {
730 fprintf(stderr, "%08X ", tok->CEs[i]);
731 }
732 fprintf(stderr, "\n");
b75a7d8f
A
733#endif
734}
735
736U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
46f4442e
A
737 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
738 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
b75a7d8f 739
46f4442e
A
740 UColToken *tok = lh->last;
741 uint32_t t[UCOL_STRENGTH_LIMIT];
b75a7d8f 742
46f4442e 743 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
b75a7d8f 744
46f4442e
A
745 tok->toInsert = 1;
746 t[tok->strength] = 1;
b75a7d8f 747
46f4442e
A
748 while(tok->previous != NULL) {
749 if(tok->previous->strength < tok->strength) { /* going up */
750 t[tok->strength] = 0;
751 t[tok->previous->strength]++;
752 } else if(tok->previous->strength > tok->strength) { /* going down */
753 t[tok->previous->strength] = 1;
754 } else {
755 t[tok->strength]++;
756 }
757 tok=tok->previous;
758 tok->toInsert = t[tok->strength];
b75a7d8f 759 }
b75a7d8f 760
46f4442e
A
761 tok->toInsert = t[tok->strength];
762 ucol_inv_getGapPositions(src, lh, status);
b75a7d8f
A
763
764#if UCOL_DEBUG
46f4442e
A
765 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
766 int32_t j = 2;
767 for(j = 2; j >= 0; j--) {
768 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
769 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
770 }
729e4ab9 771 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
b75a7d8f 772
46f4442e
A
773 do {
774 fprintf(stderr,"%i", tok->strength);
775 tok = tok->next;
776 } while(tok != NULL);
777 fprintf(stderr, "\n");
b75a7d8f 778
729e4ab9 779 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
b75a7d8f 780
46f4442e
A
781 do {
782 fprintf(stderr,"%i", tok->toInsert);
783 tok = tok->next;
784 } while(tok != NULL);
785#endif
b75a7d8f 786
46f4442e
A
787 tok = lh->first;
788 uint32_t fStrength = UCOL_IDENTICAL;
789 uint32_t initStrength = UCOL_IDENTICAL;
790
791
792 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
793 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
794 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
795
796 while (tok != NULL && U_SUCCESS(*status)) {
797 fStrength = tok->strength;
798 if(fStrength < initStrength) {
799 initStrength = fStrength;
800 if(lh->pos[fStrength] == -1) {
801 while(lh->pos[fStrength] == -1 && fStrength > 0) {
802 fStrength--;
803 }
804 if(lh->pos[fStrength] == -1) {
805 *status = U_INTERNAL_PROGRAM_ERROR;
806 return;
807 }
808 }
809 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
810 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
811 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
812 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
813 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
814 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
815 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
816 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
817 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
818 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
819 } else { /* primaries */
820 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
821 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
822 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
823 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
824 }
825 } else {
826 if(tok->strength == UCOL_TERTIARY) {
827 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
828 } else if(tok->strength == UCOL_SECONDARY) {
829 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
830 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
831 } else if(tok->strength == UCOL_PRIMARY) {
832 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
833 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
834 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
835 }
b75a7d8f 836 }
46f4442e
A
837 ucol_doCE(src, CEparts, tok, status);
838 tok = tok->next;
b75a7d8f 839 }
b75a7d8f
A
840}
841
b75a7d8f 842U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
46f4442e
A
843 UCAElements el;
844 UColToken *tok = lh->first;
845 UColToken *expt = NULL;
846 uint32_t i = 0, j = 0;
729e4ab9
A
847 UChar32 fcdHighStart;
848 const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
46f4442e
A
849
850 while(tok != NULL && U_SUCCESS(*status)) {
851 /* first, check if there are any expansions */
852 /* if there are expansions, we need to do a little bit more processing */
853 /* since parts of expansion can be tailored, while others are not */
854 if(tok->expansion != 0) {
855 uint32_t len = tok->expansion >> 24;
856 uint32_t currentSequenceLen = len;
857 uint32_t expOffset = tok->expansion & 0x00FFFFFF;
858 //uint32_t exp = currentSequenceLen | expOffset;
859 UColToken exp;
860 exp.source = currentSequenceLen | expOffset;
729e4ab9 861 exp.rulesToParseHdl = &(src->source);
46f4442e
A
862
863 while(len > 0) {
864 currentSequenceLen = len;
865 while(currentSequenceLen > 0) {
866 exp.source = (currentSequenceLen << 24) | expOffset;
867 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
868 uint32_t noOfCEsToCopy = expt->noOfCEs;
869 for(j = 0; j<noOfCEsToCopy; j++) {
870 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
871 }
872 tok->noOfExpCEs += noOfCEsToCopy;
873 // Smart people never try to add codepoints and CEs.
874 // For some odd reason, it won't work.
875 expOffset += currentSequenceLen; //noOfCEsToCopy;
876 len -= currentSequenceLen; //noOfCEsToCopy;
877 break;
878 } else {
879 currentSequenceLen--;
880 }
881 }
882 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
883 /* will have to get one from UCA */
884 /* first, get the UChars from the rules */
885 /* then pick CEs out until there is no more and stuff them into expansion */
886 collIterate s;
887 uint32_t order = 0;
729e4ab9 888 uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
46f4442e
A
889
890 for(;;) {
891 order = ucol_getNextCE(src->UCA, &s, status);
892 if(order == UCOL_NO_MORE_CES) {
893 break;
894 }
895 tok->expCEs[tok->noOfExpCEs++] = order;
896 }
897 expOffset++;
898 len--;
899 }
b75a7d8f 900 }
46f4442e
A
901 } else {
902 tok->noOfExpCEs = 0;
b75a7d8f 903 }
b75a7d8f 904
46f4442e
A
905 /* set the ucaelement with obtained values */
906 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
907 /* copy CEs */
908 for(i = 0; i<tok->noOfCEs; i++) {
909 el.CEs[i] = tok->CEs[i];
910 }
911 for(i = 0; i<tok->noOfExpCEs; i++) {
912 el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
913 }
b75a7d8f 914
46f4442e
A
915 /* copy UChars */
916 // We kept prefix and source kind of together, as it is a kind of a contraction.
917 // However, now we have to slice the prefix off the main thing -
918 el.prefix = el.prefixChars;
919 el.cPoints = el.uchars;
920 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
921 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
922 // decomposed elements to the unsaf table.
923 el.prefixSize = tok->prefix>>24;
924 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
925
926 el.cSize = (tok->source >> 24)-(tok->prefix>>24);
927 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
928 } else {
929 el.prefixSize = 0;
930 *el.prefix = 0;
b75a7d8f 931
46f4442e
A
932 el.cSize = (tok->source >> 24);
933 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
934 }
935 if(src->UCA != NULL) {
46f4442e
A
936 for(i = 0; i<el.cSize; i++) {
937 if(UCOL_ISJAMO(el.cPoints[i])) {
938 t->image->jamoSpecial = TRUE;
939 }
46f4442e 940 }
729e4ab9
A
941 if (!src->buildCCTabFlag && el.cSize > 0) {
942 // Check the trailing canonical combining class (tccc) of the last character.
943 const UChar *s = el.cPoints + el.cSize;
944 uint16_t fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, el.cPoints, s);
945 if ((fcd & 0xff) != 0) {
946 src->buildCCTabFlag = TRUE;
947 }
46f4442e 948 }
b75a7d8f 949 }
b75a7d8f 950
46f4442e 951 /* and then, add it */
b75a7d8f 952#if UCOL_DEBUG==2
46f4442e 953 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
b75a7d8f 954#endif
46f4442e 955 uprv_uca_addAnElement(t, &el, status);
b75a7d8f 956
b75a7d8f 957#if UCOL_DEBUG_DUPLICATES
46f4442e
A
958 if(*status != U_ZERO_ERROR) {
959 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
960 *status = U_ZERO_ERROR;
961 }
b75a7d8f
A
962#endif
963
46f4442e
A
964 tok = tok->next;
965 }
b75a7d8f
A
966}
967
968U_CDECL_BEGIN
969static UBool U_CALLCONV
970_processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
46f4442e
A
971 UErrorCode status = U_ZERO_ERROR;
972 tempUCATable *t = (tempUCATable *)context;
973 if(value == 0) {
974 while(start < limit) {
975 uint32_t CE = utrie_get32(t->mapping, start, NULL);
976 if(CE == UCOL_NOT_FOUND) {
977 UCAElements el;
978 el.isThai = FALSE;
979 el.prefixSize = 0;
980 el.prefixChars[0] = 0;
981 el.prefix = el.prefixChars;
982 el.cPoints = el.uchars;
983
984 el.cSize = 0;
985 UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start);
986
987 el.noOfCEs = 1;
988 el.CEs[0] = 0;
989 uprv_uca_addAnElement(t, &el, &status);
b75a7d8f 990
46f4442e
A
991 }
992 start++;
993 }
994 }
995 if(U_FAILURE(status)) {
996 return FALSE;
997 } else {
998 return TRUE;
b75a7d8f 999 }
b75a7d8f
A
1000}
1001U_CDECL_END
1002
46f4442e 1003static void
b75a7d8f 1004ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
46f4442e
A
1005 UChar32 start, UChar32 end,
1006 UErrorCode *status)
1007{
1008 //UChar decomp[256];
1009 uint32_t CE = UCOL_NOT_FOUND;
1010 UChar32 u = 0;
1011 UCAElements el;
1012 el.isThai = FALSE;
1013 el.prefixSize = 0;
1014 el.prefixChars[0] = 0;
1015 collIterate colIt;
1016
1017 if(U_SUCCESS(*status)) {
1018 for(u = start; u<=end; u++) {
1019 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1020 /* this test is for contractions that are missing the starting element. */
1021 || ((isCntTableElement(CE)) &&
1022 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
1023 )
1024 {
1025 el.cSize = 0;
1026 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1027 //decomp[0] = (UChar)u;
1028 //el.uchars[0] = (UChar)u;
1029 el.cPoints = el.uchars;
1030 //el.cSize = 1;
1031 el.noOfCEs = 0;
1032 el.prefix = el.prefixChars;
1033 el.prefixSize = 0;
1034 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1035 // We actually want to check whether this element is a special
1036 // If it is an implicit element (hangul, CJK - we want to copy the
1037 // special, not the resolved CEs) - for hangul, copying resolved
1038 // would just make things the same (there is an expansion and it
1039 // takes approximately the same amount of time to resolve as
1040 // falling back to the UCA).
1041 /*
1042 UTRIE_GET32(src->UCA->mapping, u, CE);
1043 tag = getCETag(CE);
1044 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1045 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1046 || tag == LEAD_SURROGATE_TAG) {
1047 el.CEs[el.noOfCEs++] = CE;
1048 } else {
1049 */
1050 // It turns out that it does not make sense to keep implicits
1051 // unresolved. The cost of resolving them is big enough so that
1052 // it doesn't make any difference whether we have to go to the UCA
1053 // or not.
1054 {
729e4ab9 1055 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
46f4442e
A
1056 while(CE != UCOL_NO_MORE_CES) {
1057 CE = ucol_getNextCE(src->UCA, &colIt, status);
1058 if(CE != UCOL_NO_MORE_CES) {
1059 el.CEs[el.noOfCEs++] = CE;
1060 }
1061 }
1062 }
1063 uprv_uca_addAnElement(t, &el, status);
b75a7d8f 1064 }
b75a7d8f 1065 }
b75a7d8f 1066 }
b75a7d8f
A
1067}
1068
729e4ab9
A
1069U_NAMESPACE_END
1070
1071U_CFUNC UCATableHeader *
1072ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
46f4442e 1073 U_NAMESPACE_USE
b75a7d8f 1074
46f4442e 1075 uint32_t i = 0;
b75a7d8f 1076 if(U_FAILURE(*status)) {
46f4442e 1077 return NULL;
b75a7d8f 1078 }
46f4442e
A
1079 /*
1080 2. Eliminate the negative lists by doing the following for each non-null negative list:
1081 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1082 create new ListHeader X
1083 o reverse the list, add to the end of X's positive list. Reset the strength of the
1084 first item you add, based on the stronger strength levels of the two lists.
1085 */
1086 /*
1087 3. For each ListHeader with a non-null positive list:
1088 */
1089 /*
1090 o Find all character strings with CEs between the baseCE and the
1091 next/previous CE, at the strength of the first token. Add these to the
1092 tailoring.
1093 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
1094 tailoring has & x < z...
1095 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
1096 */
1097 /* It is possible that this part should be done even while constructing list */
1098 /* The problem is that it is unknown what is going to be the strongest weight */
1099 /* So we might as well do it here */
1100
1101 /*
1102 o Allocate CEs for each token in the list, based on the total number N of the
1103 largest level difference, and the gap G between baseCE and nextCE at that
1104 level. The relation * between the last item and nextCE is the same as the
1105 strongest strength.
1106 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1107 ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1108 Then fit b and c into the secondary gap between a and d, then fit q
1109 into the tertiary gap between b and c.
1110
1111 o Example: baseCE << b <<< q << c * nextCE(X,2)
1112 ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1113 Then fit q into the tertiary gap between b and c.
1114 o When incrementing primary values, we will not cross high byte
1115 boundaries except where there is only a single-byte primary. That is to
1116 ensure that the script reordering will continue to work.
1117 */
1118 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1119 /* test for NULL */
1120 if (image == NULL) {
1121 *status = U_MEMORY_ALLOCATION_ERROR;
1122 return NULL;
b75a7d8f 1123 }
46f4442e
A
1124 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1125
1126 for(i = 0; i<src->resultLen; i++) {
1127 /* now we need to generate the CEs */
1128 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1129 /* According to strength */
1130 if(U_SUCCESS(*status)) {
1131 if(src->lh[i].first) { // if there are any elements
1132 // due to the way parser works, subsequent tailorings
1133 // may remove all the elements from a sequence, therefore
1134 // leaving an empty tailoring sequence.
1135 ucol_initBuffers(src, &src->lh[i], status);
1136 }
1137 }
1138 if(U_FAILURE(*status)) {
1139 uprv_free(image);
1140 return NULL;
1141 }
b75a7d8f 1142 }
46f4442e
A
1143
1144 if(src->varTop != NULL) { /* stuff the variable top value */
1145 src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1146 /* remove it from the list */
1147 if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1148 src->varTop->listHeader->first = src->varTop->next;
1149 }
1150 if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1151 src->varTop->listHeader->last = src->varTop->previous;
1152 }
1153 if(src->varTop->next != NULL) {
1154 src->varTop->next->previous = src->varTop->previous;
1155 }
1156 if(src->varTop->previous != NULL) {
1157 src->varTop->previous->next = src->varTop->next;
1158 }
b75a7d8f 1159 }
b75a7d8f
A
1160
1161
46f4442e
A
1162 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1163 if(U_FAILURE(*status)) {
1164 uprv_free(image);
1165 return NULL;
1166 }
b75a7d8f
A
1167
1168
46f4442e
A
1169 /* After this, we have assigned CE values to all regular CEs */
1170 /* now we will go through list once more and resolve expansions, */
1171 /* make UCAElements structs and add them to table */
1172 for(i = 0; i<src->resultLen; i++) {
1173 /* now we need to generate the CEs */
1174 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1175 /* According to strength */
1176 if(U_SUCCESS(*status)) {
1177 ucol_createElements(src, t, &src->lh[i], status);
1178 }
b75a7d8f 1179 }
b75a7d8f 1180
46f4442e
A
1181 UCAElements el;
1182 el.isThai = FALSE;
1183 el.prefixSize = 0;
1184 el.prefixChars[0] = 0;
b75a7d8f 1185
46f4442e
A
1186 /* add latin-1 stuff */
1187 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
b75a7d8f 1188
46f4442e
A
1189 /* add stuff for copying */
1190 if(src->copySet != NULL) {
1191 int32_t i = 0;
1192 UnicodeSet *set = (UnicodeSet *)src->copySet;
1193 for(i = 0; i < set->getRangeCount(); i++) {
1194 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
b75a7d8f 1195 }
46f4442e 1196 }
b75a7d8f 1197
46f4442e
A
1198 if(U_SUCCESS(*status)) {
1199 /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1200
1201 uint32_t tailoredCE = UCOL_NOT_FOUND;
1202 //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
1203 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1204 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1205 // Check for null pointer
1206 if (ucaEl == NULL) {
1207 *status = U_MEMORY_ALLOCATION_ERROR;
1208 return NULL;
b75a7d8f 1209 }
46f4442e
A
1210 while(*conts != 0) {
1211 /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
1212 tailoredCE = utrie_get32(t->mapping, *conts, NULL);
1213 if(tailoredCE != UCOL_NOT_FOUND) {
1214 UBool needToAdd = TRUE;
1215 if(isCntTableElement(tailoredCE)) {
1216 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) {
1217 needToAdd = FALSE;
1218 }
1219 }
1220 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1221 UCAElements elm;
1222 elm.cPoints = el.uchars;
1223 elm.noOfCEs = 0;
1224 elm.uchars[0] = *conts;
1225 elm.uchars[1] = 0;
1226 elm.cSize = 1;
1227 elm.prefixChars[0] = *(conts+2);
1228 elm.isThai = FALSE;
1229 elm.prefix = elm.prefixChars;
1230 elm.prefixSize = 1;
1231 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
1232 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1233 needToAdd = TRUE;
1234 }
1235 }
1236 if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1237 needToAdd = FALSE;
1238 }
1239
1240 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1241 if (*(conts+1) != 0) { // contractions
1242 el.prefix = el.prefixChars;
1243 el.prefixSize = 0;
1244 el.cPoints = el.uchars;
1245 el.noOfCEs = 0;
1246 el.uchars[0] = *conts;
1247 el.uchars[1] = *(conts+1);
1248 if(*(conts+2)!=0) {
1249 el.uchars[2] = *(conts+2);
1250 el.cSize = 3;
1251 } else {
1252 el.cSize = 2;
1253 }
1254 ucol_setText(ucaEl, el.uchars, el.cSize, status);
1255 }
1256 else { // pre-context character
1257 UChar str[4] = { 0 };
1258 int32_t len=0;
1259 int32_t preKeyLen=0;
1260
1261 el.cPoints = el.uchars;
1262 el.noOfCEs = 0;
1263 el.uchars[0] = *conts;
1264 el.uchars[1] = 0;
1265 el.cSize = 1;
1266 el.prefixChars[0] = *(conts+2);
1267 el.prefix = el.prefixChars;
1268 el.prefixSize = 1;
1269 if (el.prefixChars[0]!=0) {
1270 // get CE of prefix character first
1271 str[0]=el.prefixChars[0];
1272 str[1]=0;
1273 ucol_setText(ucaEl, str, 1, status);
1274 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
1275 != UCOL_NULLORDER) {
1276 preKeyLen++; // count number of keys for prefix character
1277 }
1278 str[len++] = el.prefixChars[0];
1279 }
1280
1281 str[len++] = el.uchars[0];
1282 str[len]=0;
1283 ucol_setText(ucaEl, str, len, status);
1284 // Skip the keys for prefix character, then copy the rest to el.
1285 while ((preKeyLen-->0) &&
1286 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1287 continue;
1288 }
1289
1290 }
1291 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1292 el.noOfCEs++;
1293 }
1294 uprv_uca_addAnElement(t, &el, status);
1295 }
1296
1297 } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1298 ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status);
1299 }
1300 conts+=3;
1301 }
1302 ucol_closeElements(ucaEl);
b75a7d8f 1303 }
b75a7d8f 1304
46f4442e
A
1305 // Add completely ignorable elements
1306 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
b75a7d8f 1307
46f4442e 1308 // add tailoring characters related canonical closures
729e4ab9 1309 uprv_uca_canonicalClosure(t, src, NULL, status);
b75a7d8f
A
1310
1311 /* still need to produce compatibility closure */
1312
46f4442e 1313 UCATableHeader *myData = uprv_uca_assembleTable(t, status);
b75a7d8f 1314
46f4442e
A
1315 uprv_uca_closeTempTable(t);
1316 uprv_free(image);
b75a7d8f 1317
46f4442e 1318 return myData;
b75a7d8f
A
1319}
1320
374ca955
A
1321U_CDECL_BEGIN
1322static UBool U_CALLCONV
b75a7d8f
A
1323ucol_bld_cleanup(void)
1324{
1325 udata_close(invUCA_DATA_MEM);
1326 invUCA_DATA_MEM = NULL;
374ca955 1327 _staticInvUCA = NULL;
b75a7d8f
A
1328 return TRUE;
1329}
374ca955 1330U_CDECL_END
b75a7d8f
A
1331
1332U_CAPI const InverseUCATableHeader * U_EXPORT2
1333ucol_initInverseUCA(UErrorCode *status)
1334{
1335 if(U_FAILURE(*status)) return NULL;
1336
46f4442e
A
1337 UBool needsInit;
1338 UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
1339
1340 if(needsInit) {
b75a7d8f 1341 InverseUCATableHeader *newInvUCA = NULL;
729e4ab9 1342 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
46f4442e 1343
b75a7d8f
A
1344 if(U_FAILURE(*status)) {
1345 if (result) {
1346 udata_close(result);
1347 }
1348 // This is not needed, as we are talking about
1349 // memory we got from UData
1350 //uprv_free(newInvUCA);
1351 }
46f4442e 1352
b75a7d8f
A
1353 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1354 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1355 UCollator *UCA = ucol_initUCA(status);
1356 // UCA versions of UCA and inverse UCA should match
1357 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
46f4442e
A
1358 *status = U_INVALID_FORMAT_ERROR;
1359 udata_close(result);
1360 return NULL;
b75a7d8f 1361 }
46f4442e 1362
b75a7d8f 1363 umtx_lock(NULL);
374ca955 1364 if(_staticInvUCA == NULL) {
b75a7d8f 1365 invUCA_DATA_MEM = result;
729e4ab9 1366 _staticInvUCA = newInvUCA;
b75a7d8f
A
1367 result = NULL;
1368 newInvUCA = NULL;
1369 }
1370 umtx_unlock(NULL);
46f4442e 1371
b75a7d8f
A
1372 if(newInvUCA != NULL) {
1373 udata_close(result);
1374 // This is not needed, as we are talking about
1375 // memory we got from UData
1376 //uprv_free(newInvUCA);
1377 }
1378 else {
374ca955 1379 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
b75a7d8f
A
1380 }
1381 }
1382 }
374ca955 1383 return _staticInvUCA;
b75a7d8f
A
1384}
1385
729e4ab9
A
1386/* This is the data that is used for non-script reordering codes. These _must_ be kept
1387 * in order that they are to be applied as defaults and in synch with the UColReorderCode enum.
1388 */
1389static const char* ReorderingTokenNames[] = {
1390 "SPACE",
1391 "PUNCT",
1392 "SYMBOL",
1393 "CURRENCY",
1394 "DIGIT",
1395 NULL
1396};
1397
1398static void toUpper(const char* src, char* dst, uint32_t length) {
1399 for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
1400 *dst = toupper(*src);
1401 }
1402 *dst = '\0';
1403}
1404
1405U_INTERNAL int32_t U_EXPORT2
1406ucol_findReorderingEntry(const char* name) {
1407 char buffer[32];
1408 toUpper(name, buffer, 32);
1409 for (uint32_t entry = 0; ReorderingTokenNames[entry] != NULL; entry++) {
1410 if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
1411 return entry + UCOL_REORDER_CODE_FIRST;
1412 }
1413 }
1414 return USCRIPT_INVALID_CODE;
1415}
1416
b75a7d8f 1417#endif /* #if !UCONFIG_NO_COLLATION */