]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
4 | * Copyright (C) 2001-2003, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: ucol_bld.cpp | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created 02/22/2001 | |
14 | * created by: Vladimir Weinstein | |
15 | * | |
16 | * This module builds a collator based on the rule set. | |
17 | * | |
18 | */ | |
19 | ||
20 | #include "unicode/utypes.h" | |
21 | ||
22 | #if !UCONFIG_NO_COLLATION | |
23 | ||
24 | #include "unicode/ucoleitr.h" | |
25 | #include "unicode/uchar.h" | |
26 | #include "ucol_bld.h" | |
27 | #include "ucln_in.h" | |
28 | #include "umutex.h" | |
29 | #include "unicode/uniset.h" | |
30 | ||
31 | static const InverseUCATableHeader* invUCA = NULL; | |
32 | static UDataMemory* invUCA_DATA_MEM = NULL; | |
33 | ||
34 | U_CDECL_BEGIN | |
35 | static UBool U_CALLCONV | |
36 | isAcceptableInvUCA(void * /*context*/, | |
37 | const char * /*type*/, const char * /*name*/, | |
38 | const UDataInfo *pInfo){ | |
39 | /* context, type & name are intentionally not used */ | |
40 | if( pInfo->size>=20 && | |
41 | pInfo->isBigEndian==U_IS_BIG_ENDIAN && | |
42 | pInfo->charsetFamily==U_CHARSET_FAMILY && | |
43 | pInfo->dataFormat[0]==invUcaDataInfo.dataFormat[0] && /* dataFormat="InvC" */ | |
44 | pInfo->dataFormat[1]==invUcaDataInfo.dataFormat[1] && | |
45 | pInfo->dataFormat[2]==invUcaDataInfo.dataFormat[2] && | |
46 | pInfo->dataFormat[3]==invUcaDataInfo.dataFormat[3] && | |
47 | pInfo->formatVersion[0]==invUcaDataInfo.formatVersion[0] && | |
48 | pInfo->formatVersion[1]>=invUcaDataInfo.formatVersion[1] //&& | |
49 | //pInfo->formatVersion[1]==invUcaDataInfo.formatVersion[1] && | |
50 | //pInfo->formatVersion[2]==invUcaDataInfo.formatVersion[2] && | |
51 | //pInfo->formatVersion[3]==invUcaDataInfo.formatVersion[3] && | |
52 | ) { | |
53 | UVersionInfo UCDVersion; | |
54 | u_getUnicodeVersion(UCDVersion); | |
55 | if(pInfo->dataVersion[0]==UCDVersion[0] && | |
56 | pInfo->dataVersion[1]==UCDVersion[1]) { | |
57 | //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] && | |
58 | //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] && | |
59 | //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) { | |
60 | return TRUE; | |
61 | } else { | |
62 | return FALSE; | |
63 | } | |
64 | } else { | |
65 | return FALSE; | |
66 | } | |
67 | } | |
68 | U_CDECL_END | |
69 | ||
70 | static | |
71 | int32_t ucol_inv_findCE(uint32_t CE, uint32_t SecondCE) { | |
72 | uint32_t bottom = 0, top = invUCA->tableSize; | |
73 | uint32_t i = 0; | |
74 | uint32_t first = 0, second = 0; | |
75 | uint32_t *CETable = (uint32_t *)((uint8_t *)invUCA+invUCA->table); | |
76 | ||
77 | while(bottom < top-1) { | |
78 | i = (top+bottom)/2; | |
79 | first = *(CETable+3*i); | |
80 | second = *(CETable+3*i+1); | |
81 | if(first > CE) { | |
82 | top = i; | |
83 | } else if(first < CE) { | |
84 | bottom = i; | |
85 | } else { | |
86 | if(second > SecondCE) { | |
87 | top = i; | |
88 | } else if(second < SecondCE) { | |
89 | bottom = i; | |
90 | } else { | |
91 | break; | |
92 | } | |
93 | } | |
94 | } | |
95 | ||
96 | /* weiv: */ | |
97 | /* in searching for elements, I have removed the failure */ | |
98 | /* The reason for this is that the builder does not rely */ | |
99 | /* on search mechanism telling it that it didn't find an */ | |
100 | /* element. However, indirect positioning relies on being */ | |
101 | /* able to find the elements around any CE, even if it is */ | |
102 | /* not defined in the UCA. */ | |
103 | return i; | |
104 | /* | |
105 | if((first == CE && second == SecondCE)) { | |
106 | return i; | |
107 | } else { | |
108 | return -1; | |
109 | } | |
110 | */ | |
111 | } | |
112 | ||
113 | static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = { | |
114 | 0xFFFF0000, | |
115 | 0xFFFFFF00, | |
116 | 0xFFFFFFFF | |
117 | }; | |
118 | ||
119 | U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(uint32_t CE, uint32_t contCE, | |
120 | uint32_t *nextCE, uint32_t *nextContCE, | |
121 | uint32_t strength) { | |
122 | uint32_t *CETable = (uint32_t *)((uint8_t *)invUCA+invUCA->table); | |
123 | int32_t iCE; | |
124 | ||
125 | iCE = ucol_inv_findCE(CE, contCE); | |
126 | ||
127 | if(iCE<0) { | |
128 | *nextCE = UCOL_NOT_FOUND; | |
129 | return -1; | |
130 | } | |
131 | ||
132 | CE &= strengthMask[strength]; | |
133 | contCE &= strengthMask[strength]; | |
134 | ||
135 | *nextCE = CE; | |
136 | *nextContCE = contCE; | |
137 | ||
138 | while((*nextCE & strengthMask[strength]) == CE | |
139 | && (*nextContCE & strengthMask[strength]) == contCE) { | |
140 | *nextCE = (*(CETable+3*(++iCE))); | |
141 | *nextContCE = (*(CETable+3*(iCE)+1)); | |
142 | } | |
143 | ||
144 | return iCE; | |
145 | } | |
146 | ||
147 | U_CAPI int32_t U_EXPORT2 ucol_inv_getPrevCE(uint32_t CE, uint32_t contCE, | |
148 | uint32_t *prevCE, uint32_t *prevContCE, | |
149 | uint32_t strength) { | |
150 | uint32_t *CETable = (uint32_t *)((uint8_t *)invUCA+invUCA->table); | |
151 | int32_t iCE; | |
152 | ||
153 | iCE = ucol_inv_findCE(CE, contCE); | |
154 | ||
155 | if(iCE<0) { | |
156 | *prevCE = UCOL_NOT_FOUND; | |
157 | return -1; | |
158 | } | |
159 | ||
160 | CE &= strengthMask[strength]; | |
161 | contCE &= strengthMask[strength]; | |
162 | ||
163 | *prevCE = CE; | |
164 | *prevContCE = contCE; | |
165 | ||
166 | while((*prevCE & strengthMask[strength]) == CE | |
167 | && (*prevContCE & strengthMask[strength])== contCE | |
168 | && iCE > 0) { /* this condition should prevent falling off the edge of the world */ | |
169 | /* here, we end up in a singularity - zero */ | |
170 | *prevCE = (*(CETable+3*(--iCE))); | |
171 | *prevContCE = (*(CETable+3*(iCE)+1)); | |
172 | } | |
173 | ||
174 | return iCE; | |
175 | } | |
176 | ||
177 | static | |
178 | inline int32_t ucol_inv_getPrevious(UColTokListHeader *lh, uint32_t strength) { | |
179 | ||
180 | uint32_t CE = lh->baseCE; | |
181 | uint32_t SecondCE = lh->baseContCE; | |
182 | ||
183 | uint32_t *CETable = (uint32_t *)((uint8_t *)invUCA+invUCA->table); | |
184 | uint32_t previousCE, previousContCE; | |
185 | int32_t iCE; | |
186 | ||
187 | iCE = ucol_inv_findCE(CE, SecondCE); | |
188 | ||
189 | if(iCE<0) { | |
190 | return -1; | |
191 | } | |
192 | ||
193 | CE &= strengthMask[strength]; | |
194 | SecondCE &= strengthMask[strength]; | |
195 | ||
196 | previousCE = CE; | |
197 | previousContCE = SecondCE; | |
198 | ||
199 | while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) { | |
200 | previousCE = (*(CETable+3*(--iCE))); | |
201 | previousContCE = (*(CETable+3*(iCE)+1)); | |
202 | } | |
203 | lh->previousCE = previousCE; | |
204 | lh->previousContCE = previousContCE; | |
205 | ||
206 | return iCE; | |
207 | } | |
208 | ||
209 | static | |
210 | inline int32_t ucol_inv_getNext(UColTokListHeader *lh, uint32_t strength) { | |
211 | uint32_t CE = lh->baseCE; | |
212 | uint32_t SecondCE = lh->baseContCE; | |
213 | ||
214 | uint32_t *CETable = (uint32_t *)((uint8_t *)invUCA+invUCA->table); | |
215 | uint32_t nextCE, nextContCE; | |
216 | int32_t iCE; | |
217 | ||
218 | iCE = ucol_inv_findCE(CE, SecondCE); | |
219 | ||
220 | if(iCE<0) { | |
221 | return -1; | |
222 | } | |
223 | ||
224 | CE &= strengthMask[strength]; | |
225 | SecondCE &= strengthMask[strength]; | |
226 | ||
227 | nextCE = CE; | |
228 | nextContCE = SecondCE; | |
229 | ||
230 | while((nextCE & strengthMask[strength]) == CE | |
231 | && (nextContCE & strengthMask[strength]) == SecondCE) { | |
232 | nextCE = (*(CETable+3*(++iCE))); | |
233 | nextContCE = (*(CETable+3*(iCE)+1)); | |
234 | } | |
235 | ||
236 | lh->nextCE = nextCE; | |
237 | lh->nextContCE = nextContCE; | |
238 | ||
239 | return iCE; | |
240 | } | |
241 | ||
/*
 * ucol_inv_getGapPositions: computes, for the token list lh, the low and high
 * bounds (lh->gapsLo / lh->gapsHi, three words per strength) between which new
 * weights may be generated.
 *
 * All per-strength fields of lh (gapsHi, gapsLo, numStr, fStrToken, lStrToken,
 * pos) are reset first. Then one of three paths is taken:
 *   1. lh->baseCE lies in the implicit-primary range read from the UCA
 *      constants: the low bound is the base CE itself and the high bound is
 *      the base CE with a fixed increment added to its continuation.
 *   2. lh->indirect is TRUE and lh->nextCE is non-zero: the bounds are the
 *      base CE and lh->nextCE / lh->nextContCE.
 *   3. otherwise the token list is walked; for each strength encountered
 *      ucol_inv_getNext() supplies the table position of the high bound and
 *      the entry just before it supplies the low bound.
 *
 * status is set to U_INTERNAL_PROGRAM_ERROR when ucol_inv_getNext() returns a
 * negative position; the walk is not aborted in that case.
 */
242 | U_CFUNC void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) { | |
243 | /* reset all the gaps */ | |
244 | int32_t i = 0; | |
245 | uint32_t *CETable = (uint32_t *)((uint8_t *)invUCA+invUCA->table); | |
246 | uint32_t st = 0; | |
247 | uint32_t t1, t2; | |
248 | int32_t pos; | |
249 | ||
250 | UColToken *tok = lh->first; | |
251 | uint32_t tokStrength = tok->strength; | |
252 | ||
253 | for(i = 0; i<3; i++) { | |
254 | lh->gapsHi[3*i] = 0; | |
255 | lh->gapsHi[3*i+1] = 0; | |
256 | lh->gapsHi[3*i+2] = 0; | |
257 | lh->gapsLo[3*i] = 0; | |
258 | lh->gapsLo[3*i+1] = 0; | |
259 | lh->gapsLo[3*i+2] = 0; | |
260 | lh->numStr[i] = 0; | |
261 | lh->fStrToken[i] = NULL; | |
262 | lh->lStrToken[i] = NULL; | |
263 | lh->pos[i] = -1; | |
264 | } | |
265 | ||
266 | UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); | |
267 | ||
/* Path 1: the base CE is an implicit; bounds are derived from the base CE alone. */
268 | if(lh->baseCE >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && lh->baseCE < (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ | |
269 | //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */ | |
270 | lh->pos[0] = 0; | |
271 | t1 = lh->baseCE; | |
272 | t2 = lh->baseContCE; | |
273 | lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; | |
274 | lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; | |
275 | lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; | |
276 | if(lh->baseCE < 0xEF000000) { | |
277 | /* first implicits have three byte primaries, with a gap of one */ | |
278 | /* so we esentially need to add 2 to the top byte in lh->baseContCE */ | |
279 | t2 += 0x02000000; | |
280 | } else { | |
281 | /* second implicits have four byte primaries, with a gap of IMPLICIT_LAST2_MULTIPLIER_ */ | |
282 | /* Now, this guy is not really accessible here, so until we find a better way to pass it */ | |
283 | /* around, we'll assume that the gap is 1 */ | |
284 | t2 += 0x00020000; | |
285 | } | |
286 | lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; | |
287 | lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; | |
288 | lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; | |
/* Path 2: indirect positioning with a known next CE; bounds are base CE and next CE. */
289 | } else if(lh->indirect == TRUE && lh->nextCE != 0) { | |
290 | //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) { | |
291 | lh->pos[0] = 0; | |
292 | t1 = lh->baseCE; | |
293 | t2 = lh->baseContCE; | |
294 | lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; | |
295 | lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; | |
296 | lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; | |
297 | t1 = lh->nextCE; | |
298 | t2 = lh->nextContCE; | |
299 | lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; | |
300 | lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; | |
301 | lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; | |
/* Path 3: walk the token list and take the bounds from the inverse UCA table. */
302 | } else { | |
303 | for(;;) { | |
304 | if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { | |
305 | if((lh->pos[tokStrength] = ucol_inv_getNext(lh, tokStrength)) >= 0) { | |
306 | lh->fStrToken[tokStrength] = tok; | |
307 | } else { /* The CE must be implicit, since it's not in the table */ | |
308 | /* Error */ | |
309 | *status = U_INTERNAL_PROGRAM_ERROR; | |
310 | } | |
311 | } | |
312 | ||
/* Skip over the run of tokens that are not stronger than tokStrength, remembering the last one. */
313 | while(tok != NULL && tok->strength >= tokStrength) { | |
314 | if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { | |
315 | lh->lStrToken[tokStrength] = tok; | |
316 | } | |
317 | tok = tok->next; | |
318 | } | |
319 | if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) { | |
320 | /* check if previous interval is the same and merge the intervals if it is so */ | |
321 | if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) { | |
322 | lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1]; | |
323 | lh->fStrToken[tokStrength+1] = NULL; | |
324 | lh->lStrToken[tokStrength+1] = NULL; | |
325 | lh->pos[tokStrength+1] = -1; | |
326 | } | |
327 | } | |
328 | if(tok != NULL) { | |
329 | tokStrength = tok->strength; | |
330 | } else { | |
331 | break; | |
332 | } | |
333 | } | |
/* For every strength with a position: high bound = table entry at pos, low bound = entry at pos-1. */
334 | for(st = 0; st < 3; st++) { | |
335 | if((pos = lh->pos[st]) >= 0) { | |
336 | t1 = *(CETable+3*(pos)); | |
337 | t2 = *(CETable+3*(pos)+1); | |
338 | lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; | |
339 | lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; | |
340 | //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; | |
341 | lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; | |
/* NOTE(review): if pos is 0 here the two reads below index before the table start -- confirm pos > 0 is guaranteed. */
342 | pos--; | |
343 | t1 = *(CETable+3*(pos)); | |
344 | t2 = *(CETable+3*(pos)+1); | |
345 | lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; | |
346 | lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; | |
347 | lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; | |
348 | } | |
349 | } | |
350 | } | |
351 | } | |
352 | ||
353 | ||
/*
 * ucol_countBytes(value, noOfBytes)
 *
 * Stores in noOfBytes the number of bytes of the 32-bit `value`, counted from
 * the most significant byte, up to and including its lowest non-zero byte
 * (0 for value == 0, 1 for 0x12000000, 4 for 0x00000034).
 *
 * `value` is evaluated exactly once. The expansion is a single statement
 * (do { } while(0)), so the macro is safe in unbraced if/else bodies.
 */
#define ucol_countBytes(value, noOfBytes) \
do { \
    const uint32_t ucol_cb_value_ = (uint32_t)(value); \
    uint32_t ucol_cb_mask_ = 0xFFFFFFFF; \
    (noOfBytes) = 0; \
    while(ucol_cb_mask_ != 0) { \
        if((ucol_cb_value_ & ucol_cb_mask_) != 0) { \
            (noOfBytes)++; \
        } \
        ucol_cb_mask_ >>= 8; \
    } \
} while(0)
365 | ||
366 | U_CFUNC uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) { | |
367 | if(U_SUCCESS(*status)) { | |
368 | g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); | |
369 | } | |
370 | return g->current; | |
371 | } | |
372 | ||
373 | U_CFUNC uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) { | |
374 | /* TODO: rename to enum names */ | |
375 | uint32_t high, low, count=1; | |
376 | uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF; | |
377 | ||
378 | if(strength == UCOL_SECONDARY) { | |
379 | low = UCOL_COMMON_TOP2<<24; | |
380 | high = 0xFFFFFFFF; | |
381 | count = 0xFF - UCOL_COMMON_TOP2; | |
382 | } else { | |
383 | low = UCOL_BYTE_COMMON << 24; //0x05000000; | |
384 | high = 0x40000000; | |
385 | count = 0x40 - UCOL_BYTE_COMMON; | |
386 | } | |
387 | ||
388 | if(tok->next != NULL && tok->next->strength == strength) { | |
389 | count = tok->next->toInsert; | |
390 | } | |
391 | ||
392 | g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); | |
393 | g->current = UCOL_BYTE_COMMON<<24; | |
394 | ||
395 | if(g->noOfRanges == 0) { | |
396 | *status = U_INTERNAL_PROGRAM_ERROR; | |
397 | } | |
398 | return g->current; | |
399 | } | |
400 | ||
401 | U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) { | |
402 | uint32_t strength = tok->strength; | |
403 | uint32_t low = lows[fStrength*3+strength]; | |
404 | uint32_t high = highs[fStrength*3+strength]; | |
405 | uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF; | |
406 | ||
407 | uint32_t count = tok->toInsert; | |
408 | ||
409 | if(low >= high && strength > UCOL_PRIMARY) { | |
410 | int32_t s = strength; | |
411 | for(;;) { | |
412 | s--; | |
413 | if(lows[fStrength*3+s] != highs[fStrength*3+s]) { | |
414 | if(strength == UCOL_SECONDARY) { | |
415 | low = UCOL_COMMON_TOP2<<24; | |
416 | high = 0xFFFFFFFF; | |
417 | } else { | |
418 | //low = 0x02000000; // This needs to be checked - what if low is | |
419 | // not good... | |
420 | high = 0x40000000; | |
421 | } | |
422 | break; | |
423 | } | |
424 | if(s<0) { | |
425 | *status = U_INTERNAL_PROGRAM_ERROR; | |
426 | return 0; | |
427 | } | |
428 | } | |
429 | } | |
430 | ||
431 | if(low == 0) { | |
432 | low = 0x01000000; | |
433 | } | |
434 | ||
435 | if(strength == UCOL_SECONDARY) { /* similar as simple */ | |
436 | if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) { | |
437 | low = UCOL_COMMON_TOP2<<24; | |
438 | } | |
439 | if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) { | |
440 | high = UCOL_COMMON_TOP2<<24; | |
441 | } | |
442 | if(low < UCOL_COMMON_BOT2<<24) { | |
443 | g->noOfRanges = ucol_allocWeights(UCOL_COMMON_TOP2<<24, high, count, maxByte, g->ranges); | |
444 | g->current = UCOL_COMMON_BOT2; | |
445 | return g->current; | |
446 | } | |
447 | } | |
448 | ||
449 | g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); | |
450 | if(g->noOfRanges == 0) { | |
451 | *status = U_INTERNAL_PROGRAM_ERROR; | |
452 | } | |
453 | g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); | |
454 | return g->current; | |
455 | } | |
456 | ||
457 | U_CFUNC void ucol_doCE(uint32_t *CEparts, UColToken *tok) { | |
458 | /* this one makes the table and stuff */ | |
459 | uint32_t noOfBytes[3]; | |
460 | uint32_t i; | |
461 | ||
462 | for(i = 0; i<3; i++) { | |
463 | ucol_countBytes(CEparts[i], noOfBytes[i]); | |
464 | } | |
465 | ||
466 | /* Here we have to pack CEs from parts */ | |
467 | ||
468 | uint32_t CEi = 0; | |
469 | uint32_t value = 0; | |
470 | ||
471 | while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) { | |
472 | if(CEi > 0) { | |
473 | value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ | |
474 | } else { | |
475 | value = 0; | |
476 | } | |
477 | ||
478 | if(2*CEi<noOfBytes[0]) { | |
479 | value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16; | |
480 | } | |
481 | if(CEi<noOfBytes[1]) { | |
482 | value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8; | |
483 | } | |
484 | if(CEi<noOfBytes[2]) { | |
485 | value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F); | |
486 | } | |
487 | tok->CEs[CEi] = value; | |
488 | CEi++; | |
489 | } | |
490 | if(CEi == 0) { /* totally ignorable */ | |
491 | tok->noOfCEs = 1; | |
492 | tok->CEs[0] = 0; | |
493 | } else { /* there is at least something */ | |
494 | tok->noOfCEs = CEi; | |
495 | } | |
496 | ||
497 | #if UCOL_DEBUG==2 | |
498 | fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2])); | |
499 | for(i = 0; i<tok->noOfCEs; i++) { | |
500 | fprintf(stderr, "%08X ", tok->CEs[i]); | |
501 | } | |
502 | fprintf(stderr, "\n"); | |
503 | #endif | |
504 | } | |
505 | ||
/*
 * ucol_initBuffers: assigns collation elements to every token of the list lh.
 *
 * Steps, in order:
 *   1. Walk backwards from lh->last, keeping one counter per strength in t[],
 *      and store in each token's toInsert the counter value for its strength.
 *   2. Call ucol_inv_getGapPositions() to fill lh->gapsLo / gapsHi / pos.
 *   3. Seed CEparts from lh->baseCE / lh->baseContCE.
 *   4. Walk forward from lh->first while *status is successful. A token that
 *      is stronger than any seen so far (strength < initStrength) starts a new
 *      generator from the gap table at its level and simple generators for the
 *      weaker levels; any other token takes the next weight of its level's
 *      generator and restarts the simple generators of the weaker levels.
 *      ucol_doCE() then packs CEparts into the token.
 *
 * status is set to U_INTERNAL_PROGRAM_ERROR, and the function returns early,
 * when no gap position exists at or above the required strength.
 */
506 | U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) { | |
507 | ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT]; | |
508 | uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT]; | |
509 | ||
510 | UColToken *tok = lh->last; | |
511 | uint32_t t[UCOL_STRENGTH_LIMIT]; | |
512 | ||
513 | uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t)); | |
514 | ||
515 | tok->toInsert = 1; | |
516 | t[tok->strength] = 1; | |
517 | ||
/* Step 1: backward pass computing toInsert for every token. */
518 | while(tok->previous != NULL) { | |
519 | if(tok->previous->strength < tok->strength) { /* going up */ | |
520 | t[tok->strength] = 0; | |
521 | t[tok->previous->strength]++; | |
522 | } else if(tok->previous->strength > tok->strength) { /* going down */ | |
523 | t[tok->previous->strength] = 1; | |
524 | } else { | |
525 | t[tok->strength]++; | |
526 | } | |
527 | tok=tok->previous; | |
528 | tok->toInsert = t[tok->strength]; | |
529 | } | |
530 | ||
531 | tok->toInsert = t[tok->strength]; | |
532 | ucol_inv_getGapPositions(src, lh, status); | |
533 | ||
/* NOTE(review): the debug code below indexes lh->first[...] although the rest of this function uses lh->first as a plain token pointer -- it looks stale; confirm before enabling UCOL_DEBUG. */
534 | #if UCOL_DEBUG | |
535 | fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE); | |
536 | int32_t j = 2; | |
537 | for(j = 2; j >= 0; j--) { | |
538 | fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]); | |
539 | fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]); | |
540 | } | |
541 | tok=lh->first[UCOL_TOK_POLARITY_POSITIVE]; | |
542 | ||
543 | do { | |
544 | fprintf(stderr,"%i", tok->strength); | |
545 | tok = tok->next; | |
546 | } while(tok != NULL); | |
547 | fprintf(stderr, "\n"); | |
548 | ||
549 | tok=lh->first[UCOL_TOK_POLARITY_POSITIVE]; | |
550 | ||
551 | do { | |
552 | fprintf(stderr,"%i", tok->toInsert); | |
553 | tok = tok->next; | |
554 | } while(tok != NULL); | |
555 | #endif | |
556 | ||
557 | tok = lh->first; | |
558 | uint32_t fStrength = UCOL_IDENTICAL; | |
559 | uint32_t initStrength = UCOL_IDENTICAL; | |
560 | ||
561 | ||
/* Step 3: start from the base CE, split into left-aligned primary/secondary/tertiary words. */
562 | CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16; | |
563 | CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8; | |
564 | CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16; | |
565 | ||
/* Step 4: forward pass generating the weights for each token. */
566 | while (tok != NULL && U_SUCCESS(*status)) { | |
567 | fStrength = tok->strength; | |
568 | if(fStrength < initStrength) { | |
569 | initStrength = fStrength; | |
/* No gap recorded at this level: fall back to the nearest stronger level that has one. */
570 | if(lh->pos[fStrength] == -1) { | |
571 | while(lh->pos[fStrength] == -1 && fStrength > 0) { | |
572 | fStrength--; | |
573 | } | |
574 | if(lh->pos[fStrength] == -1) { | |
575 | *status = U_INTERNAL_PROGRAM_ERROR; | |
576 | return; | |
577 | } | |
578 | } | |
579 | if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */ | |
580 | CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; | |
581 | CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1]; | |
582 | /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */ | |
583 | CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); | |
584 | } else if(initStrength == UCOL_SECONDARY) { /* secondaries */ | |
585 | CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; | |
586 | /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/ | |
587 | CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); | |
588 | CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); | |
589 | } else { /* primaries */ | |
590 | /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/ | |
591 | CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); | |
592 | CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); | |
593 | CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); | |
594 | } | |
595 | } else { | |
596 | if(tok->strength == UCOL_TERTIARY) { | |
597 | CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status); | |
598 | } else if(tok->strength == UCOL_SECONDARY) { | |
599 | CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status); | |
600 | CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); | |
601 | } else if(tok->strength == UCOL_PRIMARY) { | |
602 | CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status); | |
603 | CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); | |
604 | CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); | |
605 | } | |
606 | } | |
607 | ucol_doCE(CEparts, tok); | |
608 | tok = tok->next; | |
609 | } | |
610 | } | |
611 | ||
612 | static | |
613 | uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { | |
614 | uint32_t i = 0; | |
615 | UChar c; | |
616 | ||
617 | if(U_FAILURE(*status)) { | |
618 | return 0; | |
619 | } | |
620 | ||
621 | if(sourceLen > resLen) { | |
622 | *status = U_MEMORY_ALLOCATION_ERROR; | |
623 | return 0; | |
624 | } | |
625 | ||
626 | for(i = 0; i < sourceLen; i++) { | |
627 | c = source[i]; | |
628 | if(0x3042 < c && c < 0x30ef) { /* Kana range */ | |
629 | switch(c - 0x3000) { | |
630 | case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E: | |
631 | case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE: | |
632 | c++; | |
633 | break; | |
634 | case 0xF5: | |
635 | c = 0x30AB; | |
636 | break; | |
637 | case 0xF6: | |
638 | c = 0x30B1; | |
639 | break; | |
640 | } | |
641 | } | |
642 | resBuf[i] = c; | |
643 | } | |
644 | return sourceLen; | |
645 | } | |
646 | ||
647 | static | |
648 | uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { | |
649 | uint32_t i = 0; | |
650 | UChar c; | |
651 | ||
652 | if(U_FAILURE(*status)) { | |
653 | return 0; | |
654 | } | |
655 | ||
656 | if(sourceLen > resLen) { | |
657 | *status = U_MEMORY_ALLOCATION_ERROR; | |
658 | return 0; | |
659 | } | |
660 | ||
661 | for(i = 0; i < sourceLen; i++) { | |
662 | c = source[i]; | |
663 | if(0x3042 < c && c < 0x30ef) { /* Kana range */ | |
664 | switch(c - 0x3000) { | |
665 | case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F: | |
666 | case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF: | |
667 | c--; | |
668 | break; | |
669 | case 0xAB: | |
670 | c = 0x30F5; | |
671 | break; | |
672 | case 0xB1: | |
673 | c = 0x30F6; | |
674 | break; | |
675 | } | |
676 | } | |
677 | resBuf[i] = c; | |
678 | } | |
679 | return sourceLen; | |
680 | } | |
681 | ||
682 | static | |
683 | uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) { | |
684 | uint32_t i = 0; | |
685 | UChar n[128]; | |
686 | uint32_t nLen = 0; | |
687 | uint32_t uCount = 0, lCount = 0; | |
688 | ||
689 | collIterate s; | |
690 | uint32_t order = 0; | |
691 | ||
692 | if(U_FAILURE(*status)) { | |
693 | return UCOL_LOWER_CASE; | |
694 | } | |
695 | ||
696 | nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status); | |
697 | if(U_SUCCESS(*status)) { | |
698 | for(i = 0; i < nLen; i++) { | |
699 | uprv_init_collIterate(UCA, &n[i], 1, &s); | |
700 | order = ucol_getNextCE(UCA, &s, status); | |
701 | if(isContinuation(order)) { | |
702 | *status = U_INTERNAL_PROGRAM_ERROR; | |
703 | return UCOL_LOWER_CASE; | |
704 | } | |
705 | if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) { | |
706 | uCount++; | |
707 | } else { | |
708 | if(u_islower(n[i])) { | |
709 | lCount++; | |
710 | } else { | |
711 | UChar sk[1], lk[1]; | |
712 | u_toSmallKana(&n[i], 1, sk, 1, status); | |
713 | u_toLargeKana(&n[i], 1, lk, 1, status); | |
714 | if(sk[0] == n[i] && lk[0] != n[i]) { | |
715 | lCount++; | |
716 | } | |
717 | } | |
718 | } | |
719 | } | |
720 | } | |
721 | ||
722 | if(uCount != 0 && lCount != 0) { | |
723 | return UCOL_MIXED_CASE; | |
724 | } else if(uCount != 0) { | |
725 | return UCOL_UPPER_CASE; | |
726 | } else { | |
727 | return UCOL_LOWER_CASE; | |
728 | } | |
729 | } | |
730 | ||
/*
 * ucol_createElements: turns every token of the list lh into a UCAElements
 * record and adds it to the temporary table t via uprv_uca_addAnElement().
 *
 * For each token, while *status is successful:
 *   1. If tok->expansion is non-zero (length in the top 8 bits, offset into
 *      src->source in the low 24 bits), its CEs are collected into
 *      tok->expCEs: the longest leading subsequence found in src->tailored
 *      (and not a reset) contributes that token's CEs; if none is found, the
 *      CEs of a single code unit are taken from src->UCA.
 *   2. el.CEs receives tok->CEs followed by tok->expCEs.
 *   3. The prefix (if any) and the source code units are copied from
 *      src->source into el.
 *   4. el.isThai and t->image->jamoSpecial are set from the code units.
 *   5. The case bits of el.CEs[0] are cleared and recomputed: from
 *      ucol_uprv_getCaseBits() for multi-unit sources, otherwise copied from
 *      the first CE of the code unit in src->UCA.
 *
 * NOTE(review): no bounds check on tok->expCEs / el.CEs is visible here --
 * presumably their capacity is guaranteed elsewhere; confirm.
 */
731 | U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) { | |
732 | UCAElements el; | |
733 | UColToken *tok = lh->first; | |
734 | UColToken *expt = NULL; | |
735 | uint32_t i = 0, j = 0; | |
736 | ||
737 | while(tok != NULL && U_SUCCESS(*status)) { | |
738 | /* first, check if there are any expansions */ | |
739 | /* if there are expansions, we need to do a little bit more processing */ | |
740 | /* since parts of expansion can be tailored, while others are not */ | |
741 | if(tok->expansion != 0) { | |
742 | uint32_t len = tok->expansion >> 24; | |
743 | uint32_t currentSequenceLen = len; | |
744 | uint32_t expOffset = tok->expansion & 0x00FFFFFF; | |
745 | //uint32_t exp = currentSequenceLen | expOffset; | |
746 | UColToken exp; | |
/* exp is a lookup key only; its source is rebuilt (length<<24 | offset) before every uhash_get() below. */
747 | exp.source = currentSequenceLen | expOffset; | |
748 | exp.rulesToParse = src->source; | |
749 | ||
750 | while(len > 0) { | |
751 | currentSequenceLen = len; | |
/* Try the longest remaining subsequence first, shrinking by one code unit per miss. */
752 | while(currentSequenceLen > 0) { | |
753 | exp.source = (currentSequenceLen << 24) | expOffset; | |
754 | if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */ | |
755 | uint32_t noOfCEsToCopy = expt->noOfCEs; | |
756 | for(j = 0; j<noOfCEsToCopy; j++) { | |
757 | tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j]; | |
758 | } | |
759 | tok->noOfExpCEs += noOfCEsToCopy; | |
760 | // Smart people never try to add codepoints and CEs. | |
761 | // For some odd reason, it won't work. | |
762 | expOffset += currentSequenceLen; //noOfCEsToCopy; | |
763 | len -= currentSequenceLen; //noOfCEsToCopy; | |
764 | break; | |
765 | } else { | |
766 | currentSequenceLen--; | |
767 | } | |
768 | } | |
769 | if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */ | |
770 | /* will have to get one from UCA */ | |
771 | /* first, get the UChars from the rules */ | |
772 | /* then pick CEs out until there is no more and stuff them into expansion */ | |
773 | collIterate s; | |
774 | uint32_t order = 0; | |
775 | uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s); | |
776 | ||
777 | for(;;) { | |
778 | order = ucol_getNextCE(src->UCA, &s, status); | |
779 | if(order == UCOL_NO_MORE_CES) { | |
780 | break; | |
781 | } | |
782 | tok->expCEs[tok->noOfExpCEs++] = order; | |
783 | } | |
784 | expOffset++; | |
785 | len--; | |
786 | } | |
787 | } | |
788 | } else { | |
789 | tok->noOfExpCEs = 0; | |
790 | } | |
791 | ||
792 | /* set the ucaelement with obtained values */ | |
793 | el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs; | |
794 | /* copy CEs */ | |
795 | for(i = 0; i<tok->noOfCEs; i++) { | |
796 | el.CEs[i] = tok->CEs[i]; | |
797 | } | |
798 | for(i = 0; i<tok->noOfExpCEs; i++) { | |
799 | el.CEs[i+tok->noOfCEs] = tok->expCEs[i]; | |
800 | } | |
801 | ||
802 | /* copy UChars */ | |
803 | // We kept prefix and source kind of together, as it is a kind of a contraction. | |
804 | // However, now we have to slice the prefix off the main thing - | |
805 | el.prefix = el.prefixChars; | |
806 | el.cPoints = el.uchars; | |
807 | if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the | |
808 | // addPrefix function in ucol_elm. The reason is that we need to add both composed AND | |
809 | // decomposed elements to the unsaf table. | |
810 | el.prefixSize = tok->prefix>>24; | |
811 | uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar)); | |
812 | ||
813 | el.cSize = (tok->source >> 24)-(tok->prefix>>24); | |
814 | uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar)); | |
815 | } else { | |
816 | el.prefixSize = 0; | |
817 | *el.prefix = 0; | |
818 | ||
819 | el.cSize = (tok->source >> 24); | |
820 | uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar)); | |
821 | } | |
822 | ||
823 | if(UCOL_ISTHAIPREVOWEL(el.cPoints[0])) { | |
824 | el.isThai = TRUE; | |
825 | } else { | |
826 | el.isThai = FALSE; | |
827 | } | |
828 | ||
/* NOTE(review): src->UCA is tested for NULL here but is used unconditionally elsewhere in this function -- confirm whether NULL is actually possible. */
829 | if(src->UCA != NULL) { | |
830 | for(i = 0; i<el.cSize; i++) { | |
831 | if(UCOL_ISJAMO(el.cPoints[i])) { | |
832 | t->image->jamoSpecial = TRUE; | |
833 | } | |
834 | } | |
835 | } | |
836 | ||
837 | // Case bits handling | |
838 | el.CEs[0] &= 0xFFFFFF3F; // Clean the case bits field | |
839 | if(el.cSize > 1) { | |
840 | // Do it manually | |
841 | el.CEs[0] |= ucol_uprv_getCaseBits(src->UCA, el.cPoints, el.cSize, status); | |
842 | } else { | |
843 | // Copy it from the UCA | |
844 | uint32_t caseCE = ucol_getFirstCE(src->UCA, el.cPoints[0], status); | |
845 | el.CEs[0] |= (caseCE & 0xC0); | |
846 | } | |
847 | ||
848 | /* and then, add it */ | |
849 | #if UCOL_DEBUG==2 | |
850 | fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]); | |
851 | #endif | |
852 | uprv_uca_addAnElement(t, &el, status); | |
853 | ||
/* The block below is compiled out (#if 0); it would additionally add the NFC-composed form of a contraction. */
854 | #if 0 | |
855 | if(el.cSize > 1) { // this is a contraction, we should check whether a composed form should also be included | |
856 | UChar composed[256]; | |
857 | uint32_t compLen = unorm_normalize(el.cPoints, el.cSize, UNORM_NFC, 0, composed, 256, status);; | |
858 | ||
859 | if(compLen != el.cSize || uprv_memcmp(composed, el.cPoints, el.cSize*sizeof(UChar))) { | |
860 | // composed form of a contraction is different than the decomposed form! | |
861 | // do it! | |
862 | #ifdef UCOL_DEBUG | |
863 | fprintf(stderr, "Adding composed for %04X->%04X\n", *element->cPoints, *composed); | |
864 | #endif | |
865 | el.cSize = compLen; | |
866 | uprv_memcpy(el.cPoints, composed, el.cSize*sizeof(UChar)); | |
867 | uprv_uca_addAnElement(t, &el, status); | |
868 | } | |
869 | } | |
870 | #endif | |
871 | ||
872 | #if UCOL_DEBUG_DUPLICATES | |
873 | if(*status != U_ZERO_ERROR) { | |
874 | fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource); | |
875 | *status = U_ZERO_ERROR; | |
876 | } | |
877 | #endif | |
878 | ||
879 | tok = tok->next; | |
880 | } | |
881 | } | |
882 | ||
883 | U_CDECL_BEGIN | |
884 | static UBool U_CALLCONV | |
885 | _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) { | |
886 | UErrorCode status = U_ZERO_ERROR; | |
887 | tempUCATable *t = (tempUCATable *)context; | |
888 | if(value == 0) { | |
889 | while(start < limit) { | |
890 | uint32_t CE = utrie_get32(t->mapping, start, NULL); | |
891 | if(CE == UCOL_NOT_FOUND) { | |
892 | UCAElements el; | |
893 | el.isThai = FALSE; | |
894 | el.prefixSize = 0; | |
895 | el.prefixChars[0] = 0; | |
896 | el.prefix = el.prefixChars; | |
897 | el.cPoints = el.uchars; | |
898 | ||
899 | el.cSize = 0; | |
900 | UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start); | |
901 | ||
902 | el.noOfCEs = 1; | |
903 | el.CEs[0] = 0; | |
904 | uprv_uca_addAnElement(t, &el, &status); | |
905 | ||
906 | } | |
907 | start++; | |
908 | } | |
909 | } | |
910 | if(U_FAILURE(status)) { | |
911 | return FALSE; | |
912 | } else { | |
913 | return TRUE; | |
914 | } | |
915 | } | |
916 | U_CDECL_END | |
917 | ||
918 | static void | |
919 | ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t, | |
920 | UChar32 start, UChar32 end, | |
921 | UErrorCode *status) { | |
922 | //UChar decomp[256]; | |
923 | uint32_t CE = UCOL_NOT_FOUND; | |
924 | UChar32 u = 0; | |
925 | UCAElements el; | |
926 | el.isThai = FALSE; | |
927 | el.prefixSize = 0; | |
928 | el.prefixChars[0] = 0; | |
929 | collIterate colIt; | |
930 | ||
931 | if(U_SUCCESS(*status)) { | |
932 | for(u = start; u<=end; u++) { | |
933 | if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND | |
934 | /* this test is for contractions that are missing the starting element. */ | |
935 | || ((isCntTableElement(CE)) && | |
936 | (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND)) | |
937 | ) { | |
938 | el.cSize = 0; | |
939 | U16_APPEND_UNSAFE(el.uchars, el.cSize, u); | |
940 | //decomp[0] = (UChar)u; | |
941 | //el.uchars[0] = (UChar)u; | |
942 | el.cPoints = el.uchars; | |
943 | //el.cSize = 1; | |
944 | el.noOfCEs = 0; | |
945 | el.prefix = el.prefixChars; | |
946 | el.prefixSize = 0; | |
947 | //uprv_init_collIterate(src->UCA, decomp, 1, &colIt); | |
948 | // We actually want to check whether this element is a special | |
949 | // If it is an implicit element (hangul, CJK - we want to copy the | |
950 | // special, not the resolved CEs) - for hangul, copying resolved | |
951 | // would just make things the same (there is an expansion and it | |
952 | // takes approximately the same amount of time to resolve as | |
953 | // falling back to the UCA). | |
954 | /* | |
955 | UTRIE_GET32(src->UCA->mapping, u, CE); | |
956 | tag = getCETag(CE); | |
957 | if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG | |
958 | || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG | |
959 | || tag == LEAD_SURROGATE_TAG) { | |
960 | el.CEs[el.noOfCEs++] = CE; | |
961 | } else { | |
962 | */ | |
963 | // It turns out that it does not make sense to keep implicits | |
964 | // unresolved. The cost of resolving them is big enough so that | |
965 | // it doesn't make any difference whether we have to go to the UCA | |
966 | // or not. | |
967 | { | |
968 | uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt); | |
969 | while(CE != UCOL_NO_MORE_CES) { | |
970 | CE = ucol_getNextCE(src->UCA, &colIt, status); | |
971 | if(CE != UCOL_NO_MORE_CES) { | |
972 | el.CEs[el.noOfCEs++] = CE; | |
973 | } | |
974 | } | |
975 | } | |
976 | uprv_uca_addAnElement(t, &el, status); | |
977 | } | |
978 | } | |
979 | } | |
980 | } | |
981 | ||
982 | UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) { | |
983 | uint32_t i = 0; | |
984 | if(U_FAILURE(*status)) { | |
985 | return NULL; | |
986 | } | |
987 | /* | |
988 | 2. Eliminate the negative lists by doing the following for each non-null negative list: | |
989 | o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE, | |
990 | create new ListHeader X | |
991 | o reverse the list, add to the end of X's positive list. Reset the strength of the | |
992 | first item you add, based on the stronger strength levels of the two lists. | |
993 | */ | |
994 | /* | |
995 | 3. For each ListHeader with a non-null positive list: | |
996 | */ | |
997 | /* | |
998 | o Find all character strings with CEs between the baseCE and the | |
999 | next/previous CE, at the strength of the first token. Add these to the | |
1000 | tailoring. | |
1001 | ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the | |
1002 | tailoring has & x < z... | |
1003 | ? Then we change the tailoring to & x <<< X << x' <<< X' < z ... | |
1004 | */ | |
1005 | /* It is possible that this part should be done even while constructing list */ | |
1006 | /* The problem is that it is unknown what is going to be the strongest weight */ | |
1007 | /* So we might as well do it here */ | |
1008 | ||
1009 | /* | |
1010 | o Allocate CEs for each token in the list, based on the total number N of the | |
1011 | largest level difference, and the gap G between baseCE and nextCE at that | |
1012 | level. The relation * between the last item and nextCE is the same as the | |
1013 | strongest strength. | |
1014 | o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) | |
1015 | ? There are 3 primary items: a, d, e. Fit them into the primary gap. | |
1016 | Then fit b and c into the secondary gap between a and d, then fit q | |
1017 | into the tertiary gap between b and c. | |
1018 | ||
1019 | o Example: baseCE << b <<< q << c * nextCE(X,2) | |
1020 | ? There are 2 secondary items: b, c. Fit them into the secondary gap. | |
1021 | Then fit q into the tertiary gap between b and c. | |
1022 | o When incrementing primary values, we will not cross high byte | |
1023 | boundaries except where there is only a single-byte primary. That is to | |
1024 | ensure that the script reordering will continue to work. | |
1025 | */ | |
1026 | UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader)); | |
1027 | /* test for NULL */ | |
1028 | if (image == NULL) { | |
1029 | *status = U_MEMORY_ALLOCATION_ERROR; | |
1030 | return NULL; | |
1031 | } | |
1032 | uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader)); | |
1033 | ||
1034 | for(i = 0; i<src->resultLen; i++) { | |
1035 | /* now we need to generate the CEs */ | |
1036 | /* We stuff the initial value in the buffers, and increase the appropriate buffer */ | |
1037 | /* According to strength */ | |
1038 | if(U_SUCCESS(*status)) { | |
1039 | ucol_initBuffers(src, &src->lh[i], status); | |
1040 | } | |
1041 | if(U_FAILURE(*status)) { | |
1042 | return NULL; | |
1043 | } | |
1044 | ||
1045 | } | |
1046 | ||
1047 | if(src->varTop != NULL) { /* stuff the variable top value */ | |
1048 | src->opts->variableTopValue = (*(src->varTop->CEs))>>16; | |
1049 | /* remove it from the list */ | |
1050 | if(src->varTop->listHeader->first == src->varTop) { /* first in list */ | |
1051 | src->varTop->listHeader->first = src->varTop->next; | |
1052 | } | |
1053 | if(src->varTop->listHeader->last == src->varTop) { /* first in list */ | |
1054 | src->varTop->listHeader->last = src->varTop->previous; | |
1055 | } | |
1056 | if(src->varTop->next != NULL) { | |
1057 | src->varTop->next->previous = src->varTop->previous; | |
1058 | } | |
1059 | if(src->varTop->previous != NULL) { | |
1060 | src->varTop->previous->next = src->varTop->next; | |
1061 | } | |
1062 | } | |
1063 | ||
1064 | ||
1065 | tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, status); | |
1066 | ||
1067 | ||
1068 | /* After this, we have assigned CE values to all regular CEs */ | |
1069 | /* now we will go through list once more and resolve expansions, */ | |
1070 | /* make UCAElements structs and add them to table */ | |
1071 | for(i = 0; i<src->resultLen; i++) { | |
1072 | /* now we need to generate the CEs */ | |
1073 | /* We stuff the initial value in the buffers, and increase the appropriate buffer */ | |
1074 | /* According to strength */ | |
1075 | if(U_SUCCESS(*status)) { | |
1076 | ucol_createElements(src, t, &src->lh[i], status); | |
1077 | } | |
1078 | } | |
1079 | ||
1080 | UCAElements el; | |
1081 | el.isThai = FALSE; | |
1082 | el.prefixSize = 0; | |
1083 | el.prefixChars[0] = 0; | |
1084 | ||
1085 | /* add latin-1 stuff */ | |
1086 | ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status); | |
1087 | ||
1088 | /* add stuff for copying */ | |
1089 | if(src->copySet != NULL) { | |
1090 | int32_t i = 0; | |
1091 | UnicodeSet *set = (UnicodeSet *)src->copySet; | |
1092 | for(i = 0; i < set->getRangeCount(); i++) { | |
1093 | ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status); | |
1094 | } | |
1095 | } | |
1096 | ||
1097 | if(U_SUCCESS(*status)) { | |
1098 | /* copy contractions from the UCA - this is felt mostly for cyrillic*/ | |
1099 | ||
1100 | uint32_t tailoredCE = UCOL_NOT_FOUND; | |
1101 | //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants)); | |
1102 | UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos); | |
1103 | UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status); | |
1104 | while(*conts != 0) { | |
1105 | /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/ | |
1106 | tailoredCE = utrie_get32(t->mapping, *conts, NULL); | |
1107 | if(tailoredCE != UCOL_NOT_FOUND) { | |
1108 | UBool needToAdd = TRUE; | |
1109 | if(isCntTableElement(tailoredCE)) { | |
1110 | if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) { | |
1111 | needToAdd = FALSE; | |
1112 | } | |
1113 | } | |
1114 | if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) { | |
1115 | needToAdd = FALSE; | |
1116 | } | |
1117 | ||
1118 | if(needToAdd == TRUE) { // we need to add if this contraction is not tailored. | |
1119 | el.prefix = el.prefixChars; | |
1120 | el.prefixSize = 0; | |
1121 | el.cPoints = el.uchars; | |
1122 | el.noOfCEs = 0; | |
1123 | el.uchars[0] = *conts; | |
1124 | el.uchars[1] = *(conts+1); | |
1125 | if(*(conts+2)!=0) { | |
1126 | el.uchars[2] = *(conts+2); | |
1127 | el.cSize = 3; | |
1128 | } else { | |
1129 | el.cSize = 2; | |
1130 | } | |
1131 | ucol_setText(ucaEl, el.uchars, el.cSize, status); | |
1132 | while ((el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { | |
1133 | el.noOfCEs++; | |
1134 | } | |
1135 | uprv_uca_addAnElement(t, &el, status); | |
1136 | } | |
1137 | ||
1138 | } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) { | |
1139 | ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status); | |
1140 | } | |
1141 | conts+=3; | |
1142 | } | |
1143 | ucol_closeElements(ucaEl); | |
1144 | } | |
1145 | ||
1146 | // Add completely ignorable elements | |
1147 | utrie_enum(t->UCA->mapping, NULL, _processUCACompleteIgnorables, t); | |
1148 | ||
1149 | ||
1150 | // canonical closure | |
1151 | uprv_uca_canonicalClosure(t, status); | |
1152 | ||
1153 | ||
1154 | /* still need to produce compatibility closure */ | |
1155 | ||
1156 | UCATableHeader *myData = uprv_uca_assembleTable(t, status); | |
1157 | ||
1158 | uprv_uca_closeTempTable(t); | |
1159 | uprv_free(image); | |
1160 | ||
1161 | return myData; | |
1162 | } | |
1163 | ||
1164 | UBool | |
1165 | ucol_bld_cleanup(void) | |
1166 | { | |
1167 | udata_close(invUCA_DATA_MEM); | |
1168 | invUCA_DATA_MEM = NULL; | |
1169 | invUCA = NULL; | |
1170 | return TRUE; | |
1171 | } | |
1172 | ||
1173 | U_CAPI const InverseUCATableHeader * U_EXPORT2 | |
1174 | ucol_initInverseUCA(UErrorCode *status) | |
1175 | { | |
1176 | if(U_FAILURE(*status)) return NULL; | |
1177 | ||
1178 | umtx_lock(NULL); | |
1179 | UBool f = (invUCA == NULL); | |
1180 | umtx_unlock(NULL); | |
1181 | ||
1182 | if(f) { | |
1183 | InverseUCATableHeader *newInvUCA = NULL; | |
1184 | UDataMemory *result = udata_openChoice(NULL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status); | |
1185 | ||
1186 | if(U_FAILURE(*status)) { | |
1187 | if (result) { | |
1188 | udata_close(result); | |
1189 | } | |
1190 | // This is not needed, as we are talking about | |
1191 | // memory we got from UData | |
1192 | //uprv_free(newInvUCA); | |
1193 | } | |
1194 | ||
1195 | if(result != NULL) { /* It looks like sometimes we can fail to find the data file */ | |
1196 | newInvUCA = (InverseUCATableHeader *)udata_getMemory(result); | |
1197 | UCollator *UCA = ucol_initUCA(status); | |
1198 | // UCA versions of UCA and inverse UCA should match | |
1199 | if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) { | |
1200 | *status = U_INVALID_FORMAT_ERROR; | |
1201 | udata_close(result); | |
1202 | return NULL; | |
1203 | } | |
1204 | ||
1205 | umtx_lock(NULL); | |
1206 | if(invUCA == NULL) { | |
1207 | invUCA = newInvUCA; | |
1208 | invUCA_DATA_MEM = result; | |
1209 | result = NULL; | |
1210 | newInvUCA = NULL; | |
1211 | } | |
1212 | umtx_unlock(NULL); | |
1213 | ||
1214 | if(newInvUCA != NULL) { | |
1215 | udata_close(result); | |
1216 | // This is not needed, as we are talking about | |
1217 | // memory we got from UData | |
1218 | //uprv_free(newInvUCA); | |
1219 | } | |
1220 | else { | |
1221 | ucln_i18n_registerCleanup(); | |
1222 | } | |
1223 | } | |
1224 | } | |
1225 | return invUCA; | |
1226 | } | |
1227 | ||
1228 | #endif /* #if !UCONFIG_NO_COLLATION */ |