]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
51004dcb | 4 | * Copyright (C) 2001-2012, International Business Machines |
b75a7d8f A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: ucol_bld.cpp | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created 02/22/2001 | |
14 | * created by: Vladimir Weinstein | |
15 | * | |
16 | * This module builds a collator based on the rule set. | |
46f4442e | 17 | * |
b75a7d8f A |
18 | */ |
19 | ||
20 | #include "unicode/utypes.h" | |
21 | ||
22 | #if !UCONFIG_NO_COLLATION | |
23 | ||
24 | #include "unicode/ucoleitr.h" | |
46f4442e | 25 | #include "unicode/udata.h" |
b75a7d8f | 26 | #include "unicode/uchar.h" |
b75a7d8f | 27 | #include "unicode/uniset.h" |
729e4ab9 A |
28 | #include "unicode/uscript.h" |
29 | #include "unicode/ustring.h" | |
4388f060 | 30 | #include "unicode/utf16.h" |
729e4ab9 | 31 | #include "normalizer2impl.h" |
46f4442e A |
32 | #include "ucol_bld.h" |
33 | #include "ucol_elm.h" | |
34 | #include "ucol_cnt.h" | |
35 | #include "ucln_in.h" | |
36 | #include "umutex.h" | |
46f4442e | 37 | #include "cmemory.h" |
729e4ab9 | 38 | #include "cstring.h" |
b75a7d8f | 39 | |
51004dcb A |
40 | #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) |
41 | ||
374ca955 | 42 | static const InverseUCATableHeader* _staticInvUCA = NULL; |
b75a7d8f A |
43 | static UDataMemory* invUCA_DATA_MEM = NULL; |
44 | ||
45 | U_CDECL_BEGIN | |
46 | static UBool U_CALLCONV | |
46f4442e A |
47 | isAcceptableInvUCA(void * /*context*/, |
48 | const char * /*type*/, const char * /*name*/, | |
49 | const UDataInfo *pInfo) | |
50 | { | |
51 | /* context, type & name are intentionally not used */ | |
b75a7d8f A |
52 | if( pInfo->size>=20 && |
53 | pInfo->isBigEndian==U_IS_BIG_ENDIAN && | |
54 | pInfo->charsetFamily==U_CHARSET_FAMILY && | |
374ca955 A |
55 | pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */ |
56 | pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 && | |
57 | pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 && | |
58 | pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 && | |
59 | pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 && | |
60 | pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&& | |
61 | //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 && | |
62 | //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 && | |
63 | //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 && | |
46f4442e A |
64 | ) |
65 | { | |
b75a7d8f A |
66 | UVersionInfo UCDVersion; |
67 | u_getUnicodeVersion(UCDVersion); | |
46f4442e A |
68 | return (pInfo->dataVersion[0]==UCDVersion[0] && |
69 | pInfo->dataVersion[1]==UCDVersion[1]); | |
70 | //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] && | |
71 | //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] && | |
72 | //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) { | |
b75a7d8f A |
73 | } else { |
74 | return FALSE; | |
75 | } | |
76 | } | |
77 | U_CDECL_END | |
78 | ||
46f4442e A |
79 | /* |
80 | * Takes two CEs (lead and continuation) and | |
81 | * compares them as CEs should be compared: | |
82 | * primary vs. primary, secondary vs. secondary | |
83 | * tertiary vs. tertiary | |
84 | */ | |
73c04bcf | 85 | static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) { |
46f4442e A |
86 | uint32_t s1 = source0, s2, t1 = target0, t2; |
87 | if(isContinuation(source1)) { | |
88 | s2 = source1; | |
89 | } else { | |
90 | s2 = 0; | |
91 | } | |
92 | if(isContinuation(target1)) { | |
93 | t2 = target1; | |
73c04bcf | 94 | } else { |
46f4442e A |
95 | t2 = 0; |
96 | } | |
97 | ||
98 | uint32_t s = 0, t = 0; | |
99 | if(s1 == t1 && s2 == t2) { | |
100 | return 0; | |
101 | } | |
102 | s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16); | |
103 | t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16); | |
104 | if(s < t) { | |
73c04bcf | 105 | return -1; |
46f4442e | 106 | } else if(s > t) { |
73c04bcf | 107 | return 1; |
46f4442e A |
108 | } else { |
109 | s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8; | |
110 | t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8; | |
111 | if(s < t) { | |
112 | return -1; | |
113 | } else if(s > t) { | |
114 | return 1; | |
115 | } else { | |
116 | s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF); | |
117 | t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF); | |
118 | if(s < t) { | |
119 | return -1; | |
120 | } else { | |
121 | return 1; | |
122 | } | |
123 | } | |
73c04bcf | 124 | } |
73c04bcf A |
125 | } |
126 | ||
b75a7d8f | 127 | static |
374ca955 | 128 | int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) { |
46f4442e A |
129 | uint32_t bottom = 0, top = src->invUCA->tableSize; |
130 | uint32_t i = 0; | |
131 | uint32_t first = 0, second = 0; | |
132 | uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); | |
133 | int32_t res = 0; | |
134 | ||
135 | while(bottom < top-1) { | |
136 | i = (top+bottom)/2; | |
137 | first = *(CETable+3*i); | |
138 | second = *(CETable+3*i+1); | |
139 | res = compareCEs(first, second, CE, SecondCE); | |
140 | if(res > 0) { | |
141 | top = i; | |
142 | } else if(res < 0) { | |
143 | bottom = i; | |
144 | } else { | |
145 | break; | |
146 | } | |
b75a7d8f | 147 | } |
46f4442e A |
148 | |
149 | /* weiv: */ | |
150 | /* in searching for elements, I have removed the failure */ | |
151 | /* The reason for this is that the builder does not rely */ | |
152 | /* on search mechanism telling it that it didn't find an */ | |
153 | /* element. However, indirect positioning relies on being */ | |
154 | /* able to find the elements around any CE, even if it is */ | |
155 | /* not defined in the UCA. */ | |
156 | return i; | |
157 | /* | |
158 | if((first == CE && second == SecondCE)) { | |
b75a7d8f | 159 | return i; |
46f4442e | 160 | } else { |
b75a7d8f | 161 | return -1; |
46f4442e A |
162 | } |
163 | */ | |
b75a7d8f A |
164 | } |
165 | ||
166 | static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = { | |
46f4442e A |
167 | 0xFFFF0000, |
168 | 0xFFFFFF00, | |
169 | 0xFFFFFFFF | |
b75a7d8f A |
170 | }; |
171 | ||
374ca955 | 172 | U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src, |
46f4442e A |
173 | uint32_t CE, uint32_t contCE, |
174 | uint32_t *nextCE, uint32_t *nextContCE, | |
175 | uint32_t strength) | |
176 | { | |
177 | uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); | |
178 | int32_t iCE; | |
b75a7d8f | 179 | |
46f4442e | 180 | iCE = ucol_inv_findCE(src, CE, contCE); |
b75a7d8f | 181 | |
46f4442e A |
182 | if(iCE<0) { |
183 | *nextCE = UCOL_NOT_FOUND; | |
184 | return -1; | |
185 | } | |
b75a7d8f | 186 | |
46f4442e A |
187 | CE &= strengthMask[strength]; |
188 | contCE &= strengthMask[strength]; | |
b75a7d8f | 189 | |
46f4442e A |
190 | *nextCE = CE; |
191 | *nextContCE = contCE; | |
b75a7d8f | 192 | |
46f4442e A |
193 | while((*nextCE & strengthMask[strength]) == CE |
194 | && (*nextContCE & strengthMask[strength]) == contCE) | |
195 | { | |
196 | *nextCE = (*(CETable+3*(++iCE))); | |
197 | *nextContCE = (*(CETable+3*(iCE)+1)); | |
198 | } | |
b75a7d8f | 199 | |
46f4442e | 200 | return iCE; |
b75a7d8f A |
201 | } |
202 | ||
46f4442e A |
203 | U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src, |
204 | uint32_t CE, uint32_t contCE, | |
205 | uint32_t *prevCE, uint32_t *prevContCE, | |
206 | uint32_t strength) | |
207 | { | |
208 | uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); | |
209 | int32_t iCE; | |
b75a7d8f | 210 | |
46f4442e | 211 | iCE = ucol_inv_findCE(src, CE, contCE); |
b75a7d8f | 212 | |
46f4442e A |
213 | if(iCE<0) { |
214 | *prevCE = UCOL_NOT_FOUND; | |
215 | return -1; | |
216 | } | |
b75a7d8f | 217 | |
46f4442e A |
218 | CE &= strengthMask[strength]; |
219 | contCE &= strengthMask[strength]; | |
b75a7d8f | 220 | |
46f4442e A |
221 | *prevCE = CE; |
222 | *prevContCE = contCE; | |
b75a7d8f | 223 | |
46f4442e A |
224 | while((*prevCE & strengthMask[strength]) == CE |
225 | && (*prevContCE & strengthMask[strength])== contCE | |
226 | && iCE > 0) /* this condition should prevent falling off the edge of the world */ | |
227 | { | |
228 | /* here, we end up in a singularity - zero */ | |
229 | *prevCE = (*(CETable+3*(--iCE))); | |
230 | *prevContCE = (*(CETable+3*(iCE)+1)); | |
231 | } | |
b75a7d8f | 232 | |
46f4442e | 233 | return iCE; |
b75a7d8f A |
234 | } |
235 | ||
46f4442e A |
236 | U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE, |
237 | uint32_t prevCE, uint32_t prevContCE) | |
73c04bcf A |
238 | { |
239 | if(prevCE == CE && prevContCE == contCE) { | |
46f4442e | 240 | return UCOL_IDENTICAL; |
73c04bcf A |
241 | } |
242 | if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY]) | |
46f4442e A |
243 | || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY])) |
244 | { | |
245 | return UCOL_PRIMARY; | |
374ca955 | 246 | } |
73c04bcf | 247 | if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY]) |
46f4442e A |
248 | || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY])) |
249 | { | |
250 | return UCOL_SECONDARY; | |
73c04bcf | 251 | } |
46f4442e | 252 | return UCOL_TERTIARY; |
374ca955 A |
253 | } |
254 | ||
255 | ||
46f4442e | 256 | /*static |
374ca955 | 257 | inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) { |
b75a7d8f | 258 | |
46f4442e A |
259 | uint32_t CE = lh->baseCE; |
260 | uint32_t SecondCE = lh->baseContCE; | |
b75a7d8f | 261 | |
46f4442e A |
262 | uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
263 | uint32_t previousCE, previousContCE; | |
264 | int32_t iCE; | |
b75a7d8f | 265 | |
46f4442e | 266 | iCE = ucol_inv_findCE(src, CE, SecondCE); |
b75a7d8f | 267 | |
46f4442e A |
268 | if(iCE<0) { |
269 | return -1; | |
270 | } | |
b75a7d8f | 271 | |
46f4442e A |
272 | CE &= strengthMask[strength]; |
273 | SecondCE &= strengthMask[strength]; | |
b75a7d8f | 274 | |
46f4442e A |
275 | previousCE = CE; |
276 | previousContCE = SecondCE; | |
b75a7d8f | 277 | |
46f4442e A |
278 | while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) { |
279 | previousCE = (*(CETable+3*(--iCE))); | |
280 | previousContCE = (*(CETable+3*(iCE)+1)); | |
281 | } | |
282 | lh->previousCE = previousCE; | |
283 | lh->previousContCE = previousContCE; | |
b75a7d8f | 284 | |
46f4442e A |
285 | return iCE; |
286 | }*/ | |
b75a7d8f A |
287 | |
288 | static | |
374ca955 | 289 | inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) { |
46f4442e A |
290 | uint32_t CE = lh->baseCE; |
291 | uint32_t SecondCE = lh->baseContCE; | |
b75a7d8f | 292 | |
46f4442e A |
293 | uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
294 | uint32_t nextCE, nextContCE; | |
295 | int32_t iCE; | |
b75a7d8f | 296 | |
46f4442e | 297 | iCE = ucol_inv_findCE(src, CE, SecondCE); |
b75a7d8f | 298 | |
46f4442e A |
299 | if(iCE<0) { |
300 | return -1; | |
301 | } | |
b75a7d8f | 302 | |
46f4442e A |
303 | CE &= strengthMask[strength]; |
304 | SecondCE &= strengthMask[strength]; | |
b75a7d8f | 305 | |
46f4442e A |
306 | nextCE = CE; |
307 | nextContCE = SecondCE; | |
b75a7d8f | 308 | |
46f4442e A |
309 | while((nextCE & strengthMask[strength]) == CE |
310 | && (nextContCE & strengthMask[strength]) == SecondCE) | |
311 | { | |
312 | nextCE = (*(CETable+3*(++iCE))); | |
313 | nextContCE = (*(CETable+3*(iCE)+1)); | |
314 | } | |
b75a7d8f | 315 | |
46f4442e A |
316 | lh->nextCE = nextCE; |
317 | lh->nextContCE = nextContCE; | |
b75a7d8f | 318 | |
46f4442e | 319 | return iCE; |
b75a7d8f A |
320 | } |
321 | ||
46f4442e A |
322 | static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) { |
323 | /* reset all the gaps */ | |
324 | int32_t i = 0; | |
325 | uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); | |
326 | uint32_t st = 0; | |
327 | uint32_t t1, t2; | |
328 | int32_t pos; | |
329 | ||
330 | UColToken *tok = lh->first; | |
331 | uint32_t tokStrength = tok->strength; | |
332 | ||
333 | for(i = 0; i<3; i++) { | |
334 | lh->gapsHi[3*i] = 0; | |
335 | lh->gapsHi[3*i+1] = 0; | |
336 | lh->gapsHi[3*i+2] = 0; | |
337 | lh->gapsLo[3*i] = 0; | |
338 | lh->gapsLo[3*i+1] = 0; | |
339 | lh->gapsLo[3*i+2] = 0; | |
340 | lh->numStr[i] = 0; | |
341 | lh->fStrToken[i] = NULL; | |
342 | lh->lStrToken[i] = NULL; | |
343 | lh->pos[i] = -1; | |
344 | } | |
b75a7d8f | 345 | |
46f4442e A |
346 | UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); |
347 | ||
348 | if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ | |
349 | //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */ | |
350 | lh->pos[0] = 0; | |
351 | t1 = lh->baseCE; | |
352 | t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION; | |
353 | lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; | |
354 | lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; | |
355 | lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; | |
729e4ab9 | 356 | uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16); |
46f4442e A |
357 | primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1); |
358 | ||
729e4ab9 | 359 | t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; |
46f4442e A |
360 | t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER; |
361 | ||
362 | lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; | |
363 | lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; | |
364 | lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; | |
365 | } else if(lh->indirect == TRUE && lh->nextCE != 0) { | |
366 | //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) { | |
367 | lh->pos[0] = 0; | |
368 | t1 = lh->baseCE; | |
369 | t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION; | |
370 | lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; | |
371 | lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; | |
372 | lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; | |
373 | t1 = lh->nextCE; | |
374 | t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION; | |
375 | lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; | |
376 | lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; | |
377 | lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; | |
378 | } else { | |
379 | for(;;) { | |
380 | if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { | |
381 | if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) { | |
382 | lh->fStrToken[tokStrength] = tok; | |
383 | } else { /* The CE must be implicit, since it's not in the table */ | |
384 | /* Error */ | |
385 | *status = U_INTERNAL_PROGRAM_ERROR; | |
386 | } | |
387 | } | |
388 | ||
389 | while(tok != NULL && tok->strength >= tokStrength) { | |
390 | if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { | |
391 | lh->lStrToken[tokStrength] = tok; | |
392 | } | |
393 | tok = tok->next; | |
394 | } | |
395 | if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) { | |
396 | /* check if previous interval is the same and merge the intervals if it is so */ | |
397 | if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) { | |
398 | lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1]; | |
399 | lh->fStrToken[tokStrength+1] = NULL; | |
400 | lh->lStrToken[tokStrength+1] = NULL; | |
401 | lh->pos[tokStrength+1] = -1; | |
402 | } | |
403 | } | |
404 | if(tok != NULL) { | |
405 | tokStrength = tok->strength; | |
406 | } else { | |
407 | break; | |
408 | } | |
b75a7d8f | 409 | } |
46f4442e A |
410 | for(st = 0; st < 3; st++) { |
411 | if((pos = lh->pos[st]) >= 0) { | |
412 | t1 = *(CETable+3*(pos)); | |
413 | t2 = *(CETable+3*(pos)+1); | |
414 | lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; | |
415 | lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; | |
416 | //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; | |
417 | lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; | |
418 | //pos--; | |
419 | //t1 = *(CETable+3*(pos)); | |
420 | //t2 = *(CETable+3*(pos)+1); | |
421 | t1 = lh->baseCE; | |
422 | t2 = lh->baseContCE; | |
423 | lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; | |
424 | lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; | |
425 | lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; | |
426 | } | |
b75a7d8f | 427 | } |
b75a7d8f | 428 | } |
b75a7d8f A |
429 | } |
430 | ||
431 | ||
/*
 * Sets noOfBytes to the number of the masks 0xFFFFFFFF, 0x00FFFFFF,
 * 0x0000FFFF and 0x000000FF that share at least one set bit with value.
 * For a value whose non-zero bytes are packed at the most significant end
 * this is the number of bytes in use.
 *
 * The body is wrapped in do { } while(0) so that the macro behaves as a
 * single statement (it needs the trailing semicolon and can be used in an
 * unbraced if/else). The local has a name that is unlikely to collide
 * with an identifier appearing in the value argument.
 */
#define ucol_countBytes(value, noOfBytes)               \
do {                                                    \
    uint32_t ucolCountBytesMask_ = 0xFFFFFFFF;          \
    (noOfBytes) = 0;                                    \
    while(ucolCountBytesMask_ != 0) {                   \
        if(((value) & ucolCountBytesMask_) != 0) {      \
            (noOfBytes)++;                              \
        }                                               \
        ucolCountBytesMask_ >>= 8;                      \
    }                                                   \
} while(0)
443 | ||
46f4442e A |
444 | static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) { |
445 | if(U_SUCCESS(*status)) { | |
446 | g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); | |
447 | } | |
448 | return g->current; | |
b75a7d8f A |
449 | } |
450 | ||
46f4442e A |
451 | static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) { |
452 | /* TODO: rename to enum names */ | |
453 | uint32_t high, low, count=1; | |
454 | uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF; | |
455 | ||
456 | if(strength == UCOL_SECONDARY) { | |
457 | low = UCOL_COMMON_TOP2<<24; | |
458 | high = 0xFFFFFFFF; | |
459 | count = 0xFF - UCOL_COMMON_TOP2; | |
460 | } else { | |
461 | low = UCOL_BYTE_COMMON << 24; //0x05000000; | |
462 | high = 0x40000000; | |
463 | count = 0x40 - UCOL_BYTE_COMMON; | |
464 | } | |
465 | ||
466 | if(tok->next != NULL && tok->next->strength == strength) { | |
467 | count = tok->next->toInsert; | |
468 | } | |
469 | ||
470 | g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); | |
471 | g->current = UCOL_BYTE_COMMON<<24; | |
472 | ||
473 | if(g->noOfRanges == 0) { | |
474 | *status = U_INTERNAL_PROGRAM_ERROR; | |
475 | } | |
476 | return g->current; | |
b75a7d8f A |
477 | } |
478 | ||
46f4442e A |
479 | static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) { |
480 | uint32_t strength = tok->strength; | |
481 | uint32_t low = lows[fStrength*3+strength]; | |
482 | uint32_t high = highs[fStrength*3+strength]; | |
483 | uint32_t maxByte = 0; | |
484 | if(strength == UCOL_TERTIARY) { | |
485 | maxByte = 0x3F; | |
486 | } else if(strength == UCOL_PRIMARY) { | |
487 | maxByte = 0xFE; | |
488 | } else { | |
489 | maxByte = 0xFF; | |
490 | } | |
491 | ||
492 | uint32_t count = tok->toInsert; | |
493 | ||
494 | if(low >= high && strength > UCOL_PRIMARY) { | |
495 | int32_t s = strength; | |
496 | for(;;) { | |
497 | s--; | |
498 | if(lows[fStrength*3+s] != highs[fStrength*3+s]) { | |
499 | if(strength == UCOL_SECONDARY) { | |
500 | if (low < UCOL_COMMON_TOP2<<24 ) { | |
501 | // Override if low range is less than UCOL_COMMON_TOP2. | |
502 | low = UCOL_COMMON_TOP2<<24; | |
503 | } | |
504 | high = 0xFFFFFFFF; | |
505 | } else { | |
506 | // Override if low range is less than UCOL_COMMON_BOT3. | |
507 | if ( low < UCOL_COMMON_BOT3<<24 ) { | |
508 | low = UCOL_COMMON_BOT3<<24; | |
509 | } | |
510 | high = 0x40000000; | |
511 | } | |
512 | break; | |
513 | } | |
514 | if(s<0) { | |
515 | *status = U_INTERNAL_PROGRAM_ERROR; | |
516 | return 0; | |
517 | } | |
b75a7d8f | 518 | } |
b75a7d8f | 519 | } |
b75a7d8f | 520 | |
729e4ab9 A |
521 | if(low < 0x02000000) { |
522 | // We must not use CE weight byte 02, so we set it as the minimum lower bound. | |
523 | // See http://site.icu-project.org/design/collation/bytes | |
524 | low = 0x02000000; | |
46f4442e | 525 | } |
b75a7d8f | 526 | |
46f4442e A |
527 | if(strength == UCOL_SECONDARY) { /* similar as simple */ |
528 | if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) { | |
529 | low = UCOL_COMMON_TOP2<<24; | |
530 | } | |
531 | if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) { | |
532 | high = UCOL_COMMON_TOP2<<24; | |
533 | } | |
534 | if(low < (UCOL_COMMON_BOT2<<24)) { | |
535 | g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges); | |
536 | g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); | |
537 | //g->current = UCOL_COMMON_BOT2<<24; | |
538 | return g->current; | |
539 | } | |
b75a7d8f | 540 | } |
46f4442e A |
541 | |
542 | g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); | |
543 | if(g->noOfRanges == 0) { | |
544 | *status = U_INTERNAL_PROGRAM_ERROR; | |
b75a7d8f | 545 | } |
46f4442e A |
546 | g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); |
547 | return g->current; | |
b75a7d8f A |
548 | } |
549 | ||
374ca955 A |
550 | static |
551 | uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { | |
46f4442e A |
552 | uint32_t i = 0; |
553 | UChar c; | |
554 | ||
555 | if(U_FAILURE(*status)) { | |
556 | return 0; | |
557 | } | |
558 | ||
559 | if(sourceLen > resLen) { | |
560 | *status = U_MEMORY_ALLOCATION_ERROR; | |
561 | return 0; | |
562 | } | |
563 | ||
564 | for(i = 0; i < sourceLen; i++) { | |
565 | c = source[i]; | |
566 | if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ | |
567 | switch(c - 0x3000) { | |
568 | case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E: | |
569 | case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE: | |
570 | c++; | |
571 | break; | |
572 | case 0xF5: | |
573 | c = 0x30AB; | |
574 | break; | |
575 | case 0xF6: | |
576 | c = 0x30B1; | |
577 | break; | |
578 | } | |
579 | } | |
580 | resBuf[i] = c; | |
374ca955 | 581 | } |
46f4442e | 582 | return sourceLen; |
374ca955 A |
583 | } |
584 | ||
585 | static | |
586 | uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { | |
46f4442e A |
587 | uint32_t i = 0; |
588 | UChar c; | |
589 | ||
590 | if(U_FAILURE(*status)) { | |
591 | return 0; | |
374ca955 | 592 | } |
46f4442e A |
593 | |
594 | if(sourceLen > resLen) { | |
595 | *status = U_MEMORY_ALLOCATION_ERROR; | |
596 | return 0; | |
597 | } | |
598 | ||
599 | for(i = 0; i < sourceLen; i++) { | |
600 | c = source[i]; | |
601 | if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ | |
602 | switch(c - 0x3000) { | |
603 | case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F: | |
604 | case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF: | |
605 | c--; | |
606 | break; | |
607 | case 0xAB: | |
608 | c = 0x30F5; | |
609 | break; | |
610 | case 0xB1: | |
611 | c = 0x30F6; | |
612 | break; | |
613 | } | |
614 | } | |
615 | resBuf[i] = c; | |
616 | } | |
617 | return sourceLen; | |
374ca955 A |
618 | } |
619 | ||
729e4ab9 A |
620 | U_NAMESPACE_BEGIN |
621 | ||
374ca955 A |
622 | static |
623 | uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) { | |
46f4442e A |
624 | uint32_t i = 0; |
625 | UChar n[128]; | |
626 | uint32_t nLen = 0; | |
627 | uint32_t uCount = 0, lCount = 0; | |
628 | ||
629 | collIterate s; | |
630 | uint32_t order = 0; | |
631 | ||
632 | if(U_FAILURE(*status)) { | |
374ca955 | 633 | return UCOL_LOWER_CASE; |
46f4442e A |
634 | } |
635 | ||
636 | nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status); | |
637 | if(U_SUCCESS(*status)) { | |
638 | for(i = 0; i < nLen; i++) { | |
729e4ab9 | 639 | uprv_init_collIterate(UCA, &n[i], 1, &s, status); |
46f4442e A |
640 | order = ucol_getNextCE(UCA, &s, status); |
641 | if(isContinuation(order)) { | |
642 | *status = U_INTERNAL_PROGRAM_ERROR; | |
643 | return UCOL_LOWER_CASE; | |
644 | } | |
645 | if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) { | |
646 | uCount++; | |
647 | } else { | |
648 | if(u_islower(n[i])) { | |
649 | lCount++; | |
729e4ab9 | 650 | } else if(U_SUCCESS(*status)) { |
46f4442e A |
651 | UChar sk[1], lk[1]; |
652 | u_toSmallKana(&n[i], 1, sk, 1, status); | |
653 | u_toLargeKana(&n[i], 1, lk, 1, status); | |
654 | if(sk[0] == n[i] && lk[0] != n[i]) { | |
655 | lCount++; | |
656 | } | |
657 | } | |
658 | } | |
374ca955 | 659 | } |
374ca955 | 660 | } |
46f4442e A |
661 | |
662 | if(uCount != 0 && lCount != 0) { | |
663 | return UCOL_MIXED_CASE; | |
664 | } else if(uCount != 0) { | |
665 | return UCOL_UPPER_CASE; | |
666 | } else { | |
667 | return UCOL_LOWER_CASE; | |
668 | } | |
374ca955 A |
669 | } |
670 | ||
671 | ||
672 | U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) { | |
46f4442e A |
673 | /* this one makes the table and stuff */ |
674 | uint32_t noOfBytes[3]; | |
675 | uint32_t i; | |
b75a7d8f | 676 | |
46f4442e A |
677 | for(i = 0; i<3; i++) { |
678 | ucol_countBytes(CEparts[i], noOfBytes[i]); | |
679 | } | |
b75a7d8f | 680 | |
46f4442e | 681 | /* Here we have to pack CEs from parts */ |
b75a7d8f | 682 | |
46f4442e A |
683 | uint32_t CEi = 0; |
684 | uint32_t value = 0; | |
b75a7d8f | 685 | |
46f4442e A |
686 | while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) { |
687 | if(CEi > 0) { | |
688 | value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ | |
689 | } else { | |
690 | value = 0; | |
691 | } | |
b75a7d8f | 692 | |
46f4442e A |
693 | if(2*CEi<noOfBytes[0]) { |
694 | value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16; | |
695 | } | |
696 | if(CEi<noOfBytes[1]) { | |
697 | value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8; | |
698 | } | |
699 | if(CEi<noOfBytes[2]) { | |
700 | value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F); | |
701 | } | |
702 | tok->CEs[CEi] = value; | |
703 | CEi++; | |
b75a7d8f | 704 | } |
46f4442e A |
705 | if(CEi == 0) { /* totally ignorable */ |
706 | tok->noOfCEs = 1; | |
707 | tok->CEs[0] = 0; | |
708 | } else { /* there is at least something */ | |
709 | tok->noOfCEs = CEi; | |
b75a7d8f | 710 | } |
46f4442e A |
711 | |
712 | ||
713 | // we want to set case bits here and now, not later. | |
714 | // Case bits handling | |
715 | if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables | |
716 | tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field | |
717 | int32_t cSize = (tok->source & 0xFF000000) >> 24; | |
718 | UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source; | |
719 | ||
720 | if(cSize > 1) { | |
721 | // Do it manually | |
722 | tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status); | |
723 | } else { | |
724 | // Copy it from the UCA | |
725 | uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status); | |
726 | tok->CEs[0] |= (caseCE & 0xC0); | |
727 | } | |
73c04bcf | 728 | } |
374ca955 | 729 | |
b75a7d8f | 730 | #if UCOL_DEBUG==2 |
46f4442e A |
731 | fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2])); |
732 | for(i = 0; i<tok->noOfCEs; i++) { | |
733 | fprintf(stderr, "%08X ", tok->CEs[i]); | |
734 | } | |
735 | fprintf(stderr, "\n"); | |
b75a7d8f A |
736 | #endif |
737 | } | |
738 | ||
739 | U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) { | |
46f4442e A |
740 | ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT]; |
741 | uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT]; | |
b75a7d8f | 742 | |
46f4442e A |
743 | UColToken *tok = lh->last; |
744 | uint32_t t[UCOL_STRENGTH_LIMIT]; | |
b75a7d8f | 745 | |
46f4442e | 746 | uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t)); |
b75a7d8f | 747 | |
51004dcb A |
748 | /* must initialize ranges to avoid memory check warnings */ |
749 | for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) { | |
750 | uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges)); | |
751 | } | |
752 | ||
46f4442e A |
753 | tok->toInsert = 1; |
754 | t[tok->strength] = 1; | |
b75a7d8f | 755 | |
46f4442e A |
756 | while(tok->previous != NULL) { |
757 | if(tok->previous->strength < tok->strength) { /* going up */ | |
758 | t[tok->strength] = 0; | |
759 | t[tok->previous->strength]++; | |
760 | } else if(tok->previous->strength > tok->strength) { /* going down */ | |
761 | t[tok->previous->strength] = 1; | |
762 | } else { | |
763 | t[tok->strength]++; | |
764 | } | |
765 | tok=tok->previous; | |
766 | tok->toInsert = t[tok->strength]; | |
b75a7d8f | 767 | } |
b75a7d8f | 768 | |
46f4442e A |
769 | tok->toInsert = t[tok->strength]; |
770 | ucol_inv_getGapPositions(src, lh, status); | |
b75a7d8f A |
771 | |
772 | #if UCOL_DEBUG | |
46f4442e A |
773 | fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE); |
774 | int32_t j = 2; | |
775 | for(j = 2; j >= 0; j--) { | |
776 | fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]); | |
777 | fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]); | |
778 | } | |
729e4ab9 | 779 | tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; |
b75a7d8f | 780 | |
46f4442e A |
781 | do { |
782 | fprintf(stderr,"%i", tok->strength); | |
783 | tok = tok->next; | |
784 | } while(tok != NULL); | |
785 | fprintf(stderr, "\n"); | |
b75a7d8f | 786 | |
729e4ab9 | 787 | tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; |
b75a7d8f | 788 | |
46f4442e A |
789 | do { |
790 | fprintf(stderr,"%i", tok->toInsert); | |
791 | tok = tok->next; | |
792 | } while(tok != NULL); | |
793 | #endif | |
b75a7d8f | 794 | |
46f4442e A |
795 | tok = lh->first; |
796 | uint32_t fStrength = UCOL_IDENTICAL; | |
797 | uint32_t initStrength = UCOL_IDENTICAL; | |
798 | ||
799 | ||
800 | CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16; | |
801 | CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8; | |
802 | CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16; | |
803 | ||
804 | while (tok != NULL && U_SUCCESS(*status)) { | |
805 | fStrength = tok->strength; | |
806 | if(fStrength < initStrength) { | |
807 | initStrength = fStrength; | |
808 | if(lh->pos[fStrength] == -1) { | |
809 | while(lh->pos[fStrength] == -1 && fStrength > 0) { | |
810 | fStrength--; | |
811 | } | |
812 | if(lh->pos[fStrength] == -1) { | |
813 | *status = U_INTERNAL_PROGRAM_ERROR; | |
814 | return; | |
815 | } | |
816 | } | |
817 | if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */ | |
818 | CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; | |
819 | CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1]; | |
820 | /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */ | |
821 | CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); | |
822 | } else if(initStrength == UCOL_SECONDARY) { /* secondaries */ | |
823 | CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; | |
824 | /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/ | |
825 | CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); | |
826 | CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); | |
827 | } else { /* primaries */ | |
828 | /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/ | |
829 | CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); | |
830 | CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); | |
831 | CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); | |
832 | } | |
833 | } else { | |
834 | if(tok->strength == UCOL_TERTIARY) { | |
835 | CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status); | |
836 | } else if(tok->strength == UCOL_SECONDARY) { | |
837 | CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status); | |
838 | CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); | |
839 | } else if(tok->strength == UCOL_PRIMARY) { | |
840 | CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status); | |
841 | CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); | |
842 | CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); | |
843 | } | |
b75a7d8f | 844 | } |
46f4442e A |
845 | ucol_doCE(src, CEparts, tok, status); |
846 | tok = tok->next; | |
b75a7d8f | 847 | } |
b75a7d8f A |
848 | } |
849 | ||
b75a7d8f | 850 | U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) { |
46f4442e A |
851 | UCAElements el; |
852 | UColToken *tok = lh->first; | |
853 | UColToken *expt = NULL; | |
854 | uint32_t i = 0, j = 0; | |
4388f060 | 855 | const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status); |
46f4442e A |
856 | |
857 | while(tok != NULL && U_SUCCESS(*status)) { | |
858 | /* first, check if there are any expansions */ | |
859 | /* if there are expansions, we need to do a little bit more processing */ | |
860 | /* since parts of expansion can be tailored, while others are not */ | |
861 | if(tok->expansion != 0) { | |
862 | uint32_t len = tok->expansion >> 24; | |
863 | uint32_t currentSequenceLen = len; | |
864 | uint32_t expOffset = tok->expansion & 0x00FFFFFF; | |
865 | //uint32_t exp = currentSequenceLen | expOffset; | |
866 | UColToken exp; | |
867 | exp.source = currentSequenceLen | expOffset; | |
729e4ab9 | 868 | exp.rulesToParseHdl = &(src->source); |
46f4442e A |
869 | |
870 | while(len > 0) { | |
871 | currentSequenceLen = len; | |
872 | while(currentSequenceLen > 0) { | |
873 | exp.source = (currentSequenceLen << 24) | expOffset; | |
874 | if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */ | |
875 | uint32_t noOfCEsToCopy = expt->noOfCEs; | |
876 | for(j = 0; j<noOfCEsToCopy; j++) { | |
877 | tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j]; | |
878 | } | |
879 | tok->noOfExpCEs += noOfCEsToCopy; | |
880 | // Smart people never try to add codepoints and CEs. | |
881 | // For some odd reason, it won't work. | |
882 | expOffset += currentSequenceLen; //noOfCEsToCopy; | |
883 | len -= currentSequenceLen; //noOfCEsToCopy; | |
884 | break; | |
885 | } else { | |
886 | currentSequenceLen--; | |
887 | } | |
888 | } | |
889 | if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */ | |
890 | /* will have to get one from UCA */ | |
891 | /* first, get the UChars from the rules */ | |
892 | /* then pick CEs out until there is no more and stuff them into expansion */ | |
893 | collIterate s; | |
894 | uint32_t order = 0; | |
729e4ab9 | 895 | uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status); |
46f4442e A |
896 | |
897 | for(;;) { | |
898 | order = ucol_getNextCE(src->UCA, &s, status); | |
899 | if(order == UCOL_NO_MORE_CES) { | |
900 | break; | |
901 | } | |
902 | tok->expCEs[tok->noOfExpCEs++] = order; | |
903 | } | |
904 | expOffset++; | |
905 | len--; | |
906 | } | |
b75a7d8f | 907 | } |
46f4442e A |
908 | } else { |
909 | tok->noOfExpCEs = 0; | |
b75a7d8f | 910 | } |
b75a7d8f | 911 | |
46f4442e A |
912 | /* set the ucaelement with obtained values */ |
913 | el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs; | |
914 | /* copy CEs */ | |
915 | for(i = 0; i<tok->noOfCEs; i++) { | |
916 | el.CEs[i] = tok->CEs[i]; | |
917 | } | |
918 | for(i = 0; i<tok->noOfExpCEs; i++) { | |
919 | el.CEs[i+tok->noOfCEs] = tok->expCEs[i]; | |
920 | } | |
b75a7d8f | 921 | |
46f4442e A |
922 | /* copy UChars */ |
923 | // We kept prefix and source kind of together, as it is a kind of a contraction. | |
924 | // However, now we have to slice the prefix off the main thing - | |
925 | el.prefix = el.prefixChars; | |
926 | el.cPoints = el.uchars; | |
927 | if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the | |
928 | // addPrefix function in ucol_elm. The reason is that we need to add both composed AND | |
929 | // decomposed elements to the unsaf table. | |
930 | el.prefixSize = tok->prefix>>24; | |
931 | uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar)); | |
932 | ||
933 | el.cSize = (tok->source >> 24)-(tok->prefix>>24); | |
934 | uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar)); | |
935 | } else { | |
936 | el.prefixSize = 0; | |
937 | *el.prefix = 0; | |
b75a7d8f | 938 | |
46f4442e A |
939 | el.cSize = (tok->source >> 24); |
940 | uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar)); | |
941 | } | |
942 | if(src->UCA != NULL) { | |
46f4442e A |
943 | for(i = 0; i<el.cSize; i++) { |
944 | if(UCOL_ISJAMO(el.cPoints[i])) { | |
945 | t->image->jamoSpecial = TRUE; | |
946 | } | |
46f4442e | 947 | } |
729e4ab9 A |
948 | if (!src->buildCCTabFlag && el.cSize > 0) { |
949 | // Check the trailing canonical combining class (tccc) of the last character. | |
950 | const UChar *s = el.cPoints + el.cSize; | |
4388f060 | 951 | uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s); |
729e4ab9 A |
952 | if ((fcd & 0xff) != 0) { |
953 | src->buildCCTabFlag = TRUE; | |
954 | } | |
46f4442e | 955 | } |
b75a7d8f | 956 | } |
b75a7d8f | 957 | |
46f4442e | 958 | /* and then, add it */ |
b75a7d8f | 959 | #if UCOL_DEBUG==2 |
46f4442e | 960 | fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]); |
b75a7d8f | 961 | #endif |
46f4442e | 962 | uprv_uca_addAnElement(t, &el, status); |
b75a7d8f | 963 | |
b75a7d8f | 964 | #if UCOL_DEBUG_DUPLICATES |
46f4442e A |
965 | if(*status != U_ZERO_ERROR) { |
966 | fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource); | |
967 | *status = U_ZERO_ERROR; | |
968 | } | |
b75a7d8f A |
969 | #endif |
970 | ||
46f4442e A |
971 | tok = tok->next; |
972 | } | |
b75a7d8f A |
973 | } |
974 | ||
975 | U_CDECL_BEGIN | |
976 | static UBool U_CALLCONV | |
977 | _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) { | |
46f4442e A |
978 | UErrorCode status = U_ZERO_ERROR; |
979 | tempUCATable *t = (tempUCATable *)context; | |
980 | if(value == 0) { | |
981 | while(start < limit) { | |
982 | uint32_t CE = utrie_get32(t->mapping, start, NULL); | |
983 | if(CE == UCOL_NOT_FOUND) { | |
984 | UCAElements el; | |
985 | el.isThai = FALSE; | |
986 | el.prefixSize = 0; | |
987 | el.prefixChars[0] = 0; | |
988 | el.prefix = el.prefixChars; | |
989 | el.cPoints = el.uchars; | |
990 | ||
991 | el.cSize = 0; | |
4388f060 | 992 | U16_APPEND_UNSAFE(el.uchars, el.cSize, start); |
46f4442e A |
993 | |
994 | el.noOfCEs = 1; | |
995 | el.CEs[0] = 0; | |
996 | uprv_uca_addAnElement(t, &el, &status); | |
b75a7d8f | 997 | |
46f4442e A |
998 | } |
999 | start++; | |
1000 | } | |
1001 | } | |
1002 | if(U_FAILURE(status)) { | |
1003 | return FALSE; | |
1004 | } else { | |
1005 | return TRUE; | |
b75a7d8f | 1006 | } |
b75a7d8f A |
1007 | } |
1008 | U_CDECL_END | |
1009 | ||
46f4442e | 1010 | static void |
b75a7d8f | 1011 | ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t, |
46f4442e A |
1012 | UChar32 start, UChar32 end, |
1013 | UErrorCode *status) | |
1014 | { | |
1015 | //UChar decomp[256]; | |
1016 | uint32_t CE = UCOL_NOT_FOUND; | |
1017 | UChar32 u = 0; | |
1018 | UCAElements el; | |
1019 | el.isThai = FALSE; | |
1020 | el.prefixSize = 0; | |
1021 | el.prefixChars[0] = 0; | |
1022 | collIterate colIt; | |
1023 | ||
1024 | if(U_SUCCESS(*status)) { | |
1025 | for(u = start; u<=end; u++) { | |
1026 | if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND | |
1027 | /* this test is for contractions that are missing the starting element. */ | |
1028 | || ((isCntTableElement(CE)) && | |
1029 | (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND)) | |
1030 | ) | |
1031 | { | |
1032 | el.cSize = 0; | |
1033 | U16_APPEND_UNSAFE(el.uchars, el.cSize, u); | |
1034 | //decomp[0] = (UChar)u; | |
1035 | //el.uchars[0] = (UChar)u; | |
1036 | el.cPoints = el.uchars; | |
1037 | //el.cSize = 1; | |
1038 | el.noOfCEs = 0; | |
1039 | el.prefix = el.prefixChars; | |
1040 | el.prefixSize = 0; | |
1041 | //uprv_init_collIterate(src->UCA, decomp, 1, &colIt); | |
1042 | // We actually want to check whether this element is a special | |
1043 | // If it is an implicit element (hangul, CJK - we want to copy the | |
1044 | // special, not the resolved CEs) - for hangul, copying resolved | |
1045 | // would just make things the same (there is an expansion and it | |
1046 | // takes approximately the same amount of time to resolve as | |
1047 | // falling back to the UCA). | |
1048 | /* | |
1049 | UTRIE_GET32(src->UCA->mapping, u, CE); | |
1050 | tag = getCETag(CE); | |
1051 | if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG | |
1052 | || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG | |
1053 | || tag == LEAD_SURROGATE_TAG) { | |
1054 | el.CEs[el.noOfCEs++] = CE; | |
1055 | } else { | |
1056 | */ | |
1057 | // It turns out that it does not make sense to keep implicits | |
1058 | // unresolved. The cost of resolving them is big enough so that | |
1059 | // it doesn't make any difference whether we have to go to the UCA | |
1060 | // or not. | |
1061 | { | |
729e4ab9 | 1062 | uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status); |
46f4442e A |
1063 | while(CE != UCOL_NO_MORE_CES) { |
1064 | CE = ucol_getNextCE(src->UCA, &colIt, status); | |
1065 | if(CE != UCOL_NO_MORE_CES) { | |
1066 | el.CEs[el.noOfCEs++] = CE; | |
1067 | } | |
1068 | } | |
1069 | } | |
1070 | uprv_uca_addAnElement(t, &el, status); | |
b75a7d8f | 1071 | } |
b75a7d8f | 1072 | } |
b75a7d8f | 1073 | } |
b75a7d8f A |
1074 | } |
1075 | ||
729e4ab9 A |
1076 | U_NAMESPACE_END |
1077 | ||
1078 | U_CFUNC UCATableHeader * | |
1079 | ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) { | |
46f4442e | 1080 | U_NAMESPACE_USE |
b75a7d8f | 1081 | |
46f4442e | 1082 | uint32_t i = 0; |
b75a7d8f | 1083 | if(U_FAILURE(*status)) { |
46f4442e | 1084 | return NULL; |
b75a7d8f | 1085 | } |
46f4442e A |
1086 | /* |
1087 | 2. Eliminate the negative lists by doing the following for each non-null negative list: | |
1088 | o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE, | |
1089 | create new ListHeader X | |
1090 | o reverse the list, add to the end of X's positive list. Reset the strength of the | |
1091 | first item you add, based on the stronger strength levels of the two lists. | |
1092 | */ | |
1093 | /* | |
1094 | 3. For each ListHeader with a non-null positive list: | |
1095 | */ | |
1096 | /* | |
1097 | o Find all character strings with CEs between the baseCE and the | |
1098 | next/previous CE, at the strength of the first token. Add these to the | |
1099 | tailoring. | |
1100 | ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the | |
1101 | tailoring has & x < z... | |
1102 | ? Then we change the tailoring to & x <<< X << x' <<< X' < z ... | |
1103 | */ | |
1104 | /* It is possible that this part should be done even while constructing list */ | |
1105 | /* The problem is that it is unknown what is going to be the strongest weight */ | |
1106 | /* So we might as well do it here */ | |
1107 | ||
1108 | /* | |
1109 | o Allocate CEs for each token in the list, based on the total number N of the | |
1110 | largest level difference, and the gap G between baseCE and nextCE at that | |
1111 | level. The relation * between the last item and nextCE is the same as the | |
1112 | strongest strength. | |
1113 | o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) | |
1114 | ? There are 3 primary items: a, d, e. Fit them into the primary gap. | |
1115 | Then fit b and c into the secondary gap between a and d, then fit q | |
1116 | into the tertiary gap between b and c. | |
1117 | ||
1118 | o Example: baseCE << b <<< q << c * nextCE(X,2) | |
1119 | ? There are 2 secondary items: b, c. Fit them into the secondary gap. | |
1120 | Then fit q into the tertiary gap between b and c. | |
1121 | o When incrementing primary values, we will not cross high byte | |
1122 | boundaries except where there is only a single-byte primary. That is to | |
1123 | ensure that the script reordering will continue to work. | |
1124 | */ | |
1125 | UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader)); | |
1126 | /* test for NULL */ | |
1127 | if (image == NULL) { | |
1128 | *status = U_MEMORY_ALLOCATION_ERROR; | |
1129 | return NULL; | |
b75a7d8f | 1130 | } |
46f4442e A |
1131 | uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader)); |
1132 | ||
1133 | for(i = 0; i<src->resultLen; i++) { | |
1134 | /* now we need to generate the CEs */ | |
1135 | /* We stuff the initial value in the buffers, and increase the appropriate buffer */ | |
1136 | /* According to strength */ | |
1137 | if(U_SUCCESS(*status)) { | |
1138 | if(src->lh[i].first) { // if there are any elements | |
1139 | // due to the way parser works, subsequent tailorings | |
1140 | // may remove all the elements from a sequence, therefore | |
1141 | // leaving an empty tailoring sequence. | |
1142 | ucol_initBuffers(src, &src->lh[i], status); | |
1143 | } | |
1144 | } | |
1145 | if(U_FAILURE(*status)) { | |
1146 | uprv_free(image); | |
1147 | return NULL; | |
1148 | } | |
b75a7d8f | 1149 | } |
46f4442e A |
1150 | |
1151 | if(src->varTop != NULL) { /* stuff the variable top value */ | |
1152 | src->opts->variableTopValue = (*(src->varTop->CEs))>>16; | |
1153 | /* remove it from the list */ | |
1154 | if(src->varTop->listHeader->first == src->varTop) { /* first in list */ | |
1155 | src->varTop->listHeader->first = src->varTop->next; | |
1156 | } | |
1157 | if(src->varTop->listHeader->last == src->varTop) { /* first in list */ | |
1158 | src->varTop->listHeader->last = src->varTop->previous; | |
1159 | } | |
1160 | if(src->varTop->next != NULL) { | |
1161 | src->varTop->next->previous = src->varTop->previous; | |
1162 | } | |
1163 | if(src->varTop->previous != NULL) { | |
1164 | src->varTop->previous->next = src->varTop->next; | |
1165 | } | |
b75a7d8f | 1166 | } |
b75a7d8f A |
1167 | |
1168 | ||
46f4442e A |
1169 | tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status); |
1170 | if(U_FAILURE(*status)) { | |
1171 | uprv_free(image); | |
1172 | return NULL; | |
1173 | } | |
b75a7d8f A |
1174 | |
1175 | ||
46f4442e A |
1176 | /* After this, we have assigned CE values to all regular CEs */ |
1177 | /* now we will go through list once more and resolve expansions, */ | |
1178 | /* make UCAElements structs and add them to table */ | |
1179 | for(i = 0; i<src->resultLen; i++) { | |
1180 | /* now we need to generate the CEs */ | |
1181 | /* We stuff the initial value in the buffers, and increase the appropriate buffer */ | |
1182 | /* According to strength */ | |
1183 | if(U_SUCCESS(*status)) { | |
1184 | ucol_createElements(src, t, &src->lh[i], status); | |
1185 | } | |
b75a7d8f | 1186 | } |
b75a7d8f | 1187 | |
46f4442e A |
1188 | UCAElements el; |
1189 | el.isThai = FALSE; | |
1190 | el.prefixSize = 0; | |
1191 | el.prefixChars[0] = 0; | |
b75a7d8f | 1192 | |
46f4442e A |
1193 | /* add latin-1 stuff */ |
1194 | ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status); | |
b75a7d8f | 1195 | |
46f4442e A |
1196 | /* add stuff for copying */ |
1197 | if(src->copySet != NULL) { | |
1198 | int32_t i = 0; | |
1199 | UnicodeSet *set = (UnicodeSet *)src->copySet; | |
1200 | for(i = 0; i < set->getRangeCount(); i++) { | |
1201 | ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status); | |
b75a7d8f | 1202 | } |
46f4442e | 1203 | } |
b75a7d8f | 1204 | |
46f4442e A |
1205 | if(U_SUCCESS(*status)) { |
1206 | /* copy contractions from the UCA - this is felt mostly for cyrillic*/ | |
1207 | ||
1208 | uint32_t tailoredCE = UCOL_NOT_FOUND; | |
46f4442e | 1209 | UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos); |
4388f060 | 1210 | int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth; |
46f4442e A |
1211 | UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status); |
1212 | // Check for null pointer | |
1213 | if (ucaEl == NULL) { | |
4388f060 A |
1214 | *status = U_MEMORY_ALLOCATION_ERROR; |
1215 | return NULL; | |
b75a7d8f | 1216 | } |
46f4442e | 1217 | while(*conts != 0) { |
4388f060 A |
1218 | // A continuation is NUL-terminated and NUL-padded |
1219 | // except if it has the maximum length. | |
1220 | int32_t contractionLength = maxUCAContractionLength; | |
1221 | while(contractionLength > 0 && conts[contractionLength - 1] == 0) { | |
1222 | --contractionLength; | |
1223 | } | |
1224 | UChar32 first; | |
1225 | int32_t firstLength = 0; | |
1226 | U16_NEXT(conts, firstLength, contractionLength, first); | |
1227 | tailoredCE = utrie_get32(t->mapping, first, NULL); | |
46f4442e A |
1228 | if(tailoredCE != UCOL_NOT_FOUND) { |
1229 | UBool needToAdd = TRUE; | |
1230 | if(isCntTableElement(tailoredCE)) { | |
4388f060 | 1231 | if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) { |
46f4442e A |
1232 | needToAdd = FALSE; |
1233 | } | |
1234 | } | |
1235 | if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) { | |
1236 | UCAElements elm; | |
1237 | elm.cPoints = el.uchars; | |
1238 | elm.noOfCEs = 0; | |
1239 | elm.uchars[0] = *conts; | |
1240 | elm.uchars[1] = 0; | |
1241 | elm.cSize = 1; | |
1242 | elm.prefixChars[0] = *(conts+2); | |
1243 | elm.isThai = FALSE; | |
1244 | elm.prefix = elm.prefixChars; | |
1245 | elm.prefixSize = 1; | |
1246 | UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm); | |
1247 | if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) { | |
1248 | needToAdd = TRUE; | |
1249 | } | |
1250 | } | |
4388f060 | 1251 | if(src->removeSet != NULL && uset_contains(src->removeSet, first)) { |
46f4442e A |
1252 | needToAdd = FALSE; |
1253 | } | |
1254 | ||
1255 | if(needToAdd == TRUE) { // we need to add if this contraction is not tailored. | |
1256 | if (*(conts+1) != 0) { // contractions | |
1257 | el.prefix = el.prefixChars; | |
1258 | el.prefixSize = 0; | |
1259 | el.cPoints = el.uchars; | |
1260 | el.noOfCEs = 0; | |
4388f060 A |
1261 | u_memcpy(el.uchars, conts, contractionLength); |
1262 | el.cSize = contractionLength; | |
46f4442e A |
1263 | ucol_setText(ucaEl, el.uchars, el.cSize, status); |
1264 | } | |
1265 | else { // pre-context character | |
1266 | UChar str[4] = { 0 }; | |
1267 | int32_t len=0; | |
1268 | int32_t preKeyLen=0; | |
1269 | ||
1270 | el.cPoints = el.uchars; | |
1271 | el.noOfCEs = 0; | |
1272 | el.uchars[0] = *conts; | |
1273 | el.uchars[1] = 0; | |
1274 | el.cSize = 1; | |
1275 | el.prefixChars[0] = *(conts+2); | |
1276 | el.prefix = el.prefixChars; | |
1277 | el.prefixSize = 1; | |
1278 | if (el.prefixChars[0]!=0) { | |
1279 | // get CE of prefix character first | |
1280 | str[0]=el.prefixChars[0]; | |
1281 | str[1]=0; | |
1282 | ucol_setText(ucaEl, str, 1, status); | |
1283 | while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) | |
1284 | != UCOL_NULLORDER) { | |
1285 | preKeyLen++; // count number of keys for prefix character | |
1286 | } | |
1287 | str[len++] = el.prefixChars[0]; | |
1288 | } | |
1289 | ||
1290 | str[len++] = el.uchars[0]; | |
1291 | str[len]=0; | |
1292 | ucol_setText(ucaEl, str, len, status); | |
1293 | // Skip the keys for prefix character, then copy the rest to el. | |
1294 | while ((preKeyLen-->0) && | |
1295 | (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { | |
1296 | continue; | |
1297 | } | |
1298 | ||
1299 | } | |
1300 | while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { | |
1301 | el.noOfCEs++; | |
1302 | } | |
1303 | uprv_uca_addAnElement(t, &el, status); | |
1304 | } | |
1305 | ||
4388f060 A |
1306 | } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) { |
1307 | ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status); | |
46f4442e | 1308 | } |
4388f060 | 1309 | conts+=maxUCAContractionLength; |
46f4442e A |
1310 | } |
1311 | ucol_closeElements(ucaEl); | |
b75a7d8f | 1312 | } |
b75a7d8f | 1313 | |
46f4442e A |
1314 | // Add completely ignorable elements |
1315 | utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t); | |
b75a7d8f | 1316 | |
46f4442e | 1317 | // add tailoring characters related canonical closures |
729e4ab9 | 1318 | uprv_uca_canonicalClosure(t, src, NULL, status); |
b75a7d8f A |
1319 | |
1320 | /* still need to produce compatibility closure */ | |
1321 | ||
46f4442e | 1322 | UCATableHeader *myData = uprv_uca_assembleTable(t, status); |
b75a7d8f | 1323 | |
46f4442e A |
1324 | uprv_uca_closeTempTable(t); |
1325 | uprv_free(image); | |
b75a7d8f | 1326 | |
46f4442e | 1327 | return myData; |
b75a7d8f A |
1328 | } |
1329 | ||
374ca955 A |
1330 | U_CDECL_BEGIN |
1331 | static UBool U_CALLCONV | |
b75a7d8f A |
1332 | ucol_bld_cleanup(void) |
1333 | { | |
1334 | udata_close(invUCA_DATA_MEM); | |
1335 | invUCA_DATA_MEM = NULL; | |
374ca955 | 1336 | _staticInvUCA = NULL; |
b75a7d8f A |
1337 | return TRUE; |
1338 | } | |
374ca955 | 1339 | U_CDECL_END |
b75a7d8f A |
1340 | |
1341 | U_CAPI const InverseUCATableHeader * U_EXPORT2 | |
1342 | ucol_initInverseUCA(UErrorCode *status) | |
1343 | { | |
1344 | if(U_FAILURE(*status)) return NULL; | |
1345 | ||
46f4442e A |
1346 | UBool needsInit; |
1347 | UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit); | |
1348 | ||
1349 | if(needsInit) { | |
b75a7d8f | 1350 | InverseUCATableHeader *newInvUCA = NULL; |
729e4ab9 | 1351 | UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status); |
46f4442e | 1352 | |
b75a7d8f A |
1353 | if(U_FAILURE(*status)) { |
1354 | if (result) { | |
1355 | udata_close(result); | |
1356 | } | |
1357 | // This is not needed, as we are talking about | |
1358 | // memory we got from UData | |
1359 | //uprv_free(newInvUCA); | |
1360 | } | |
46f4442e | 1361 | |
b75a7d8f A |
1362 | if(result != NULL) { /* It looks like sometimes we can fail to find the data file */ |
1363 | newInvUCA = (InverseUCATableHeader *)udata_getMemory(result); | |
1364 | UCollator *UCA = ucol_initUCA(status); | |
1365 | // UCA versions of UCA and inverse UCA should match | |
1366 | if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) { | |
46f4442e A |
1367 | *status = U_INVALID_FORMAT_ERROR; |
1368 | udata_close(result); | |
1369 | return NULL; | |
b75a7d8f | 1370 | } |
46f4442e | 1371 | |
b75a7d8f | 1372 | umtx_lock(NULL); |
374ca955 | 1373 | if(_staticInvUCA == NULL) { |
b75a7d8f | 1374 | invUCA_DATA_MEM = result; |
729e4ab9 | 1375 | _staticInvUCA = newInvUCA; |
b75a7d8f A |
1376 | result = NULL; |
1377 | newInvUCA = NULL; | |
1378 | } | |
1379 | umtx_unlock(NULL); | |
46f4442e | 1380 | |
b75a7d8f A |
1381 | if(newInvUCA != NULL) { |
1382 | udata_close(result); | |
1383 | // This is not needed, as we are talking about | |
1384 | // memory we got from UData | |
1385 | //uprv_free(newInvUCA); | |
1386 | } | |
1387 | else { | |
374ca955 | 1388 | ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup); |
b75a7d8f A |
1389 | } |
1390 | } | |
1391 | } | |
374ca955 | 1392 | return _staticInvUCA; |
b75a7d8f A |
1393 | } |
1394 | ||
729e4ab9 A |
/* Names of the non-script reordering codes.
 *
 * ucol_findReorderingEntry() returns (index + UCOL_REORDER_CODE_FIRST) for a
 * matching name, so the entries _must_ stay in the order in which they are
 * applied as defaults and in sync with the UColReorderCode enum.
 */
static const char * const ReorderingTokenNames[] = {
    "SPACE",
    "PUNCT",
    "SYMBOL",
    "CURRENCY",
    "DIGIT"
};
1405 | ||
1406 | static void toUpper(const char* src, char* dst, uint32_t length) { | |
1407 | for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) { | |
4388f060 | 1408 | *dst = uprv_toupper(*src); |
729e4ab9 A |
1409 | } |
1410 | *dst = '\0'; | |
1411 | } | |
1412 | ||
1413 | U_INTERNAL int32_t U_EXPORT2 | |
1414 | ucol_findReorderingEntry(const char* name) { | |
1415 | char buffer[32]; | |
1416 | toUpper(name, buffer, 32); | |
51004dcb | 1417 | for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) { |
729e4ab9 A |
1418 | if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) { |
1419 | return entry + UCOL_REORDER_CODE_FIRST; | |
1420 | } | |
1421 | } | |
1422 | return USCRIPT_INVALID_CODE; | |
1423 | } | |
1424 | ||
b75a7d8f | 1425 | #endif /* #if !UCONFIG_NO_COLLATION */ |