]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ******************************************************************************* | |
374ca955 | 3 | * Copyright (C) 1996-2004, International Business Machines |
b75a7d8f A |
4 | * Corporation and others. All Rights Reserved. |
5 | ******************************************************************************* | |
6 | * file name: ucol.cpp | |
7 | * encoding: US-ASCII | |
8 | * tab size: 8 (not used) | |
9 | * indentation:4 | |
10 | * | |
11 | * Modification history | |
12 | * Date Name Comments | |
13 | * 1996-1999 various members of ICU team maintained C API for collation framework | |
14 | * 02/16/2001 synwee Added internal method getPrevSpecialCE | |
15 | * 03/01/2001 synwee Added maxexpansion functionality. | |
16 | * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant | |
17 | */ | |
18 | ||
19 | #include "unicode/utypes.h" | |
374ca955 | 20 | #include "ustrenum.h" |
b75a7d8f A |
21 | #include "uassert.h" |
22 | ||
23 | #if !UCONFIG_NO_COLLATION | |
24 | ||
25 | #include "unicode/uloc.h" | |
26 | #include "unicode/coll.h" | |
27 | #include "unicode/tblcoll.h" | |
28 | #include "unicode/coleitr.h" | |
29 | #include "unicode/unorm.h" | |
30 | #include "unicode/udata.h" | |
31 | #include "unicode/uchar.h" | |
32 | #include "unicode/caniter.h" | |
33 | ||
34 | #include "ucol_bld.h" | |
35 | #include "ucol_imp.h" | |
36 | #include "ucol_tok.h" | |
37 | #include "ucol_elm.h" | |
38 | #include "bocsu.h" | |
39 | ||
40 | #include "unormimp.h" | |
41 | #include "unorm_it.h" | |
42 | #include "uresimp.h" | |
43 | #include "umutex.h" | |
44 | #include "uhash.h" | |
374ca955 | 45 | #include "ucln_in.h" |
b75a7d8f | 46 | #include "cstring.h" |
374ca955 A |
47 | #include "utracimp.h" |
48 | #include "putilimp.h" | |
b75a7d8f A |
49 | |
50 | #ifdef UCOL_DEBUG | |
51 | #include <stdio.h> | |
52 | #endif | |
53 | ||
54 | U_NAMESPACE_USE | |
55 | ||
56 | /* added by synwee for trie manipulation*/ | |
57 | #define STAGE_1_SHIFT_ 10 | |
58 | #define STAGE_2_SHIFT_ 4 | |
59 | #define STAGE_2_MASK_AFTER_SHIFT_ 0x3F | |
60 | #define STAGE_3_MASK_ 0xF | |
61 | #define LAST_BYTE_MASK_ 0xFF | |
62 | #define SECOND_LAST_BYTE_SHIFT_ 8 | |
63 | ||
64 | #define ZERO_CC_LIMIT_ 0xC0 | |
65 | ||
374ca955 A |
66 | // static UCA. There is only one. Collators don't use it. |
67 | // It is referenced only in ucol_initUCA and ucol_cleanup | |
68 | static UCollator* _staticUCA = NULL; | |
69 | // static pointer to udata memory. Inited in ucol_initUCA | |
70 | // used for cleanup in ucol_cleanup | |
b75a7d8f A |
71 | static UDataMemory* UCA_DATA_MEM = NULL; |
72 | ||
374ca955 A |
73 | // this is static pointer to the normalizer fcdTrieIndex |
74 | // it is always the same between calls to u_cleanup | |
75 | // and therefore writing to it is not synchronized. | |
76 | // It is cleaned in ucol_cleanup | |
77 | static const uint16_t *fcdTrieIndex=NULL; | |
b75a7d8f A |
78 | |
79 | U_CDECL_BEGIN | |
80 | static UBool U_CALLCONV | |
81 | isAcceptableUCA(void * /*context*/, | |
82 | const char * /*type*/, const char * /*name*/, | |
83 | const UDataInfo *pInfo){ | |
84 | /* context, type & name are intentionally not used */ | |
85 | if( pInfo->size>=20 && | |
86 | pInfo->isBigEndian==U_IS_BIG_ENDIAN && | |
87 | pInfo->charsetFamily==U_CHARSET_FAMILY && | |
374ca955 A |
88 | pInfo->dataFormat[0]==UCA_DATA_FORMAT_0 && /* dataFormat="UCol" */ |
89 | pInfo->dataFormat[1]==UCA_DATA_FORMAT_1 && | |
90 | pInfo->dataFormat[2]==UCA_DATA_FORMAT_2 && | |
91 | pInfo->dataFormat[3]==UCA_DATA_FORMAT_3 && | |
92 | pInfo->formatVersion[0]==UCA_FORMAT_VERSION_0 && | |
93 | pInfo->formatVersion[1]>=UCA_FORMAT_VERSION_1// && | |
94 | //pInfo->formatVersion[1]==UCA_FORMAT_VERSION_1 && | |
95 | //pInfo->formatVersion[2]==UCA_FORMAT_VERSION_2 && // Too harsh | |
96 | //pInfo->formatVersion[3]==UCA_FORMAT_VERSION_3 && // Too harsh | |
b75a7d8f A |
97 | ) { |
98 | UVersionInfo UCDVersion; | |
99 | u_getUnicodeVersion(UCDVersion); | |
100 | if(pInfo->dataVersion[0]==UCDVersion[0] && | |
101 | pInfo->dataVersion[1]==UCDVersion[1]) { // && | |
102 | //pInfo->dataVersion[2]==ucaDataInfo.dataVersion[2] && | |
103 | //pInfo->dataVersion[3]==ucaDataInfo.dataVersion[3]) { | |
104 | return TRUE; | |
105 | } else { | |
106 | return FALSE; | |
107 | } | |
108 | } else { | |
109 | return FALSE; | |
110 | } | |
111 | } | |
112 | ||
113 | ||
114 | static int32_t U_CALLCONV | |
115 | _getFoldingOffset(uint32_t data) { | |
116 | return (int32_t)(data&0xFFFFFF); | |
117 | } | |
118 | ||
119 | U_CDECL_END | |
120 | ||
121 | static | |
122 | inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString, | |
123 | int32_t sourceLen, collIterate *s) { | |
124 | (s)->string = (s)->pos = (UChar *)(sourceString); | |
125 | (s)->origFlags = 0; | |
126 | (s)->flags = 0; | |
127 | if (sourceLen >= 0) { | |
128 | s->flags |= UCOL_ITER_HASLEN; | |
129 | (s)->endp = (UChar *)sourceString+sourceLen; | |
130 | } | |
131 | else { | |
132 | /* change to enable easier checking for end of string for fcdpositon */ | |
133 | (s)->endp = NULL; | |
134 | } | |
135 | (s)->CEpos = (s)->toReturn = (s)->CEs; | |
136 | (s)->writableBuffer = (s)->stackWritableBuffer; | |
137 | (s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE; | |
138 | (s)->coll = (collator); | |
139 | (s)->fcdPosition = 0; | |
140 | if(collator->normalizationMode == UCOL_ON) { | |
374ca955 | 141 | (s)->flags |= UCOL_ITER_NORM; |
b75a7d8f A |
142 | } |
143 | if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) { | |
144 | (s)->flags |= UCOL_HIRAGANA_Q; | |
145 | } | |
146 | (s)->iterator = NULL; | |
147 | //(s)->iteratorIndex = 0; | |
148 | } | |
149 | ||
150 | U_CAPI void U_EXPORT2 | |
151 | uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, | |
152 | int32_t sourceLen, collIterate *s){ | |
153 | /* Out-of-line version for use from other files. */ | |
154 | IInit_collIterate(collator, sourceString, sourceLen, s); | |
155 | } | |
156 | ||
157 | ||
158 | /** | |
159 | * Backup the state of the collIterate struct data | |
160 | * @param data collIterate to backup | |
161 | * @param backup storage | |
162 | */ | |
163 | static | |
164 | inline void backupState(const collIterate *data, collIterateState *backup) | |
165 | { | |
166 | backup->fcdPosition = data->fcdPosition; | |
167 | backup->flags = data->flags; | |
168 | backup->origFlags = data->origFlags; | |
169 | backup->pos = data->pos; | |
170 | backup->bufferaddress = data->writableBuffer; | |
171 | backup->buffersize = data->writableBufSize; | |
172 | if(data->iterator != NULL) { | |
173 | //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT); | |
174 | backup->iteratorIndex = data->iterator->getState(data->iterator); | |
175 | // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE | |
176 | backup->iteratorMove = 0; | |
177 | if(backup->iteratorIndex == UITER_NO_STATE) { | |
178 | while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) { | |
179 | backup->iteratorMove++; | |
180 | data->iterator->move(data->iterator, -1, UITER_CURRENT); | |
181 | } | |
182 | data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); | |
183 | } | |
184 | } | |
185 | } | |
186 | ||
187 | /** | |
188 | * Loads the state into the collIterate struct data | |
189 | * @param data collIterate to backup | |
190 | * @param backup storage | |
191 | * @param forwards boolean to indicate if forwards iteration is used, | |
192 | * false indicates backwards iteration | |
193 | */ | |
194 | static | |
195 | inline void loadState(collIterate *data, const collIterateState *backup, | |
196 | UBool forwards) | |
197 | { | |
198 | UErrorCode status = U_ZERO_ERROR; | |
199 | data->flags = backup->flags; | |
200 | data->origFlags = backup->origFlags; | |
201 | if(data->iterator != NULL) { | |
202 | //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO); | |
203 | data->iterator->setState(data->iterator, backup->iteratorIndex, &status); | |
204 | if(backup->iteratorMove != 0) { | |
205 | data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); | |
206 | } | |
207 | } | |
208 | data->pos = backup->pos; | |
209 | if ((data->flags & UCOL_ITER_INNORMBUF) && | |
210 | data->writableBuffer != backup->bufferaddress) { | |
211 | /* | |
212 | this is when a new buffer has been reallocated and we'll have to | |
213 | calculate the new position. | |
214 | note the new buffer has to contain the contents of the old buffer. | |
215 | */ | |
216 | if (forwards) { | |
217 | data->pos = data->writableBuffer + | |
218 | (data->pos - backup->bufferaddress); | |
219 | } | |
220 | else { | |
221 | /* backwards direction */ | |
222 | uint32_t temp = backup->buffersize - | |
223 | (data->pos - backup->bufferaddress); | |
224 | data->pos = data->writableBuffer + (data->writableBufSize - temp); | |
225 | } | |
226 | } | |
227 | if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { | |
228 | /* | |
229 | this is alittle tricky. | |
230 | if we are initially not in the normalization buffer, even if we | |
231 | normalize in the later stage, the data in the buffer will be | |
232 | ignored, since we skip back up to the data string. | |
233 | however if we are already in the normalization buffer, any | |
234 | further normalization will pull data into the normalization | |
235 | buffer and modify the fcdPosition. | |
236 | since we are keeping the data in the buffer for use, the | |
237 | fcdPosition can not be reverted back. | |
238 | arrgghh.... | |
239 | */ | |
240 | data->fcdPosition = backup->fcdPosition; | |
241 | } | |
242 | } | |
243 | ||
244 | ||
245 | /* | |
246 | * collIter_eos() | |
247 | * Checks for a collIterate being positioned at the end of | |
248 | * its source string. | |
249 | * | |
250 | */ | |
251 | static | |
252 | inline UBool collIter_eos(collIterate *s) { | |
253 | if(s->flags & UCOL_USE_ITERATOR) { | |
254 | return !(s->iterator->hasNext(s->iterator)); | |
255 | } | |
256 | if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) { | |
257 | // Null terminated string, but not at null, so not at end. | |
258 | // Whether in main or normalization buffer doesn't matter. | |
259 | return FALSE; | |
260 | } | |
261 | ||
262 | // String with length. Can't be in normalization buffer, which is always | |
263 | // null termintated. | |
264 | if (s->flags & UCOL_ITER_HASLEN) { | |
265 | return (s->pos == s->endp); | |
266 | } | |
267 | ||
268 | // We are at a null termination, could be either normalization buffer or main string. | |
269 | if ((s->flags & UCOL_ITER_INNORMBUF) == 0) { | |
270 | // At null at end of main string. | |
271 | return TRUE; | |
272 | } | |
273 | ||
274 | // At null at end of normalization buffer. Need to check whether there there are | |
275 | // any characters left in the main buffer. | |
276 | if(s->origFlags & UCOL_USE_ITERATOR) { | |
277 | return !(s->iterator->hasNext(s->iterator)); | |
278 | } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) { | |
279 | // Null terminated main string. fcdPosition is the 'return' position into main buf. | |
280 | return (*s->fcdPosition == 0); | |
281 | } | |
282 | else { | |
283 | // Main string with an end pointer. | |
284 | return s->fcdPosition == s->endp; | |
285 | } | |
286 | } | |
287 | ||
288 | /* | |
289 | * collIter_bos() | |
290 | * Checks for a collIterate being positioned at the start of | |
291 | * its source string. | |
292 | * | |
293 | */ | |
294 | static | |
295 | inline UBool collIter_bos(collIterate *source) { | |
296 | // if we're going backwards, we need to know whether there is more in the | |
297 | // iterator, even if we are in the side buffer | |
298 | if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { | |
299 | return !source->iterator->hasPrevious(source->iterator); | |
300 | } | |
301 | if (source->pos <= source->string || | |
302 | ((source->flags & UCOL_ITER_INNORMBUF) && | |
303 | *(source->pos - 1) == 0 && source->fcdPosition == NULL)) { | |
304 | return TRUE; | |
305 | } | |
306 | return FALSE; | |
307 | } | |
308 | ||
309 | static | |
310 | inline UBool collIter_SimpleBos(collIterate *source) { | |
311 | // if we're going backwards, we need to know whether there is more in the | |
312 | // iterator, even if we are in the side buffer | |
313 | if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { | |
314 | return !source->iterator->hasPrevious(source->iterator); | |
315 | } | |
316 | if (source->pos == source->string) { | |
317 | return TRUE; | |
318 | } | |
319 | return FALSE; | |
320 | } | |
321 | //return (data->pos == data->string) || | |
322 | ||
323 | ||
324 | /** | |
325 | * Checks and free writable buffer if it is not the original stack buffer | |
326 | * in collIterate. This function does not reassign the writable buffer. | |
327 | * @param data collIterate struct to determine and free the writable buffer | |
328 | */ | |
329 | static | |
330 | inline void freeHeapWritableBuffer(collIterate *data) | |
331 | { | |
332 | if (data->writableBuffer != data->stackWritableBuffer) { | |
333 | uprv_free(data->writableBuffer); | |
334 | } | |
335 | } | |
336 | ||
337 | ||
338 | /****************************************************************************/ | |
339 | /* Following are the open/close functions */ | |
340 | /* */ | |
341 | /****************************************************************************/ | |
342 | static UCollator* | |
343 | tryOpeningFromRules(UResourceBundle *collElem, UErrorCode *status) { | |
344 | int32_t rulesLen = 0; | |
345 | const UChar *rules = ures_getStringByKey(collElem, "Sequence", &rulesLen, status); | |
346 | return ucol_openRules(rules, rulesLen, UCOL_DEFAULT, UCOL_DEFAULT, NULL, status); | |
347 | ||
348 | } | |
349 | ||
350 | ||
351 | U_CAPI UCollator* | |
352 | ucol_open(const char *loc, | |
374ca955 | 353 | UErrorCode *status) |
b75a7d8f | 354 | { |
374ca955 A |
355 | UTRACE_ENTRY_OC(UTRACE_UCOL_OPEN); |
356 | UTRACE_DATA1(UTRACE_INFO, "locale = \"%s\"", loc); | |
b75a7d8f | 357 | UCollator *result = NULL; |
374ca955 A |
358 | |
359 | u_init(status); | |
360 | #if !UCONFIG_NO_SERVICE | |
361 | result = Collator::createUCollator(loc, status); | |
362 | if (result == NULL) | |
363 | #endif | |
364 | { | |
365 | result = ucol_open_internal(loc, status); | |
b75a7d8f | 366 | } |
374ca955 A |
367 | UTRACE_EXIT_PTR_STATUS(result, *status); |
368 | return result; | |
b75a7d8f A |
369 | } |
370 | ||
371 | // API in ucol_imp.h | |
372 | ||
373 | U_CFUNC UCollator* | |
374 | ucol_open_internal(const char *loc, | |
374ca955 | 375 | UErrorCode *status) |
b75a7d8f | 376 | { |
374ca955 | 377 | const UCollator* UCA = ucol_initUCA(status); |
b75a7d8f A |
378 | |
379 | /* New version */ | |
380 | if(U_FAILURE(*status)) return 0; | |
381 | ||
374ca955 A |
382 | |
383 | ||
b75a7d8f | 384 | UCollator *result = NULL; |
374ca955 A |
385 | UResourceBundle *b = ures_open(U_ICUDATA_COLL, loc, status); |
386 | ||
387 | /* we try to find stuff from keyword */ | |
388 | UResourceBundle *collations = ures_getByKey(b, "collations", NULL, status); | |
389 | UResourceBundle *collElem = NULL; | |
390 | char keyBuffer[256]; | |
391 | // if there is a keyword, we pick it up and try to get elements | |
392 | if(!uloc_getKeywordValue(loc, "collation", keyBuffer, 256, status)) { | |
393 | // no keyword. we try to find the default setting, which will give us the keyword value | |
394 | UResourceBundle *defaultColl = ures_getByKeyWithFallback(collations, "default", NULL, status); | |
395 | if(U_SUCCESS(*status)) { | |
396 | int32_t defaultKeyLen = 0; | |
397 | const UChar *defaultKey = ures_getString(defaultColl, &defaultKeyLen, status); | |
398 | u_UCharsToChars(defaultKey, keyBuffer, defaultKeyLen); | |
399 | keyBuffer[defaultKeyLen] = 0; | |
400 | } else { | |
401 | *status = U_INTERNAL_PROGRAM_ERROR; | |
402 | return NULL; | |
403 | } | |
404 | ures_close(defaultColl); | |
405 | } | |
406 | collElem = ures_getByKeyWithFallback(collations, keyBuffer, collElem, status); | |
407 | ||
b75a7d8f | 408 | UResourceBundle *binary = NULL; |
374ca955 | 409 | UErrorCode binaryStatus = U_ZERO_ERROR; |
b75a7d8f A |
410 | |
411 | if(*status == U_MISSING_RESOURCE_ERROR) { /* We didn't find the tailoring data, we fallback to the UCA */ | |
412 | *status = U_USING_DEFAULT_WARNING; | |
374ca955 | 413 | result = ucol_initCollator(UCA->image, result, UCA, status); |
b75a7d8f | 414 | // if we use UCA, real locale is root |
374ca955 A |
415 | result->rb = ures_open(U_ICUDATA_COLL, "", status); |
416 | result->elements = ures_open(U_ICUDATA_COLL, "", status); | |
b75a7d8f A |
417 | if(U_FAILURE(*status)) { |
418 | goto clean; | |
419 | } | |
420 | ures_close(b); | |
421 | result->hasRealData = FALSE; | |
374ca955 | 422 | } else if(U_SUCCESS(*status)) { |
b75a7d8f A |
423 | binary = ures_getByKey(collElem, "%%CollationBin", NULL, &binaryStatus); |
424 | ||
425 | if(binaryStatus == U_MISSING_RESOURCE_ERROR) { /* we didn't find the binary image, we should use the rules */ | |
426 | binary = NULL; | |
427 | result = tryOpeningFromRules(collElem, status); | |
428 | if(U_FAILURE(*status)) { | |
429 | goto clean; | |
430 | } | |
431 | } else if(U_SUCCESS(*status)) { /* otherwise, we'll pick a collation data that exists */ | |
432 | int32_t len = 0; | |
433 | const uint8_t *inData = ures_getBinary(binary, &len, status); | |
434 | UCATableHeader *colData = (UCATableHeader *)inData; | |
435 | if(uprv_memcmp(colData->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0 || | |
436 | uprv_memcmp(colData->UCDVersion, UCA->image->UCDVersion, sizeof(UVersionInfo)) != 0 || | |
437 | colData->version[0] != UCOL_BUILDER_VERSION) { | |
438 | *status = U_DIFFERENT_UCA_VERSION; | |
439 | result = tryOpeningFromRules(collElem, status); | |
440 | } else { | |
441 | if(U_FAILURE(*status)){ | |
442 | goto clean; | |
443 | } | |
444 | if((uint32_t)len > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) { | |
374ca955 | 445 | result = ucol_initCollator((const UCATableHeader *)inData, result, UCA, status); |
b75a7d8f A |
446 | if(U_FAILURE(*status)){ |
447 | goto clean; | |
448 | } | |
449 | result->hasRealData = TRUE; | |
450 | } else { | |
374ca955 | 451 | result = ucol_initCollator(UCA->image, result, UCA, status); |
b75a7d8f A |
452 | ucol_setOptionsFromHeader(result, (UColOptionSet *)(inData+((const UCATableHeader *)inData)->options), status); |
453 | if(U_FAILURE(*status)){ | |
454 | goto clean; | |
455 | } | |
456 | result->hasRealData = FALSE; | |
457 | } | |
458 | result->freeImageOnClose = FALSE; | |
459 | } | |
460 | } | |
461 | result->rb = b; | |
462 | result->elements = collElem; | |
463 | } else { /* There is another error, and we're just gonna clean up */ | |
464 | clean: | |
465 | ures_close(b); | |
466 | ures_close(collElem); | |
374ca955 | 467 | ures_close(collations); |
b75a7d8f A |
468 | ures_close(binary); |
469 | return NULL; | |
470 | } | |
471 | ||
472 | result->validLocale = NULL; // default is to use rb info | |
473 | ||
474 | if(loc == NULL) { | |
475 | loc = ures_getLocale(result->rb, status); | |
476 | } | |
477 | result->requestedLocale = (char *)uprv_malloc((uprv_strlen(loc)+1)*sizeof(char)); | |
478 | /* test for NULL */ | |
479 | if (result->requestedLocale == NULL) { | |
374ca955 A |
480 | *status = U_MEMORY_ALLOCATION_ERROR; |
481 | ures_close(b); // ??? appears needed | |
b75a7d8f | 482 | ures_close(collElem); |
374ca955 | 483 | ures_close(collations); |
b75a7d8f | 484 | ures_close(binary); // ??? appears needed |
374ca955 | 485 | return NULL; |
b75a7d8f A |
486 | } |
487 | uprv_strcpy(result->requestedLocale, loc); | |
488 | ||
489 | ures_close(binary); | |
374ca955 | 490 | ures_close(collations); //??? we have to decide on that. Probably affects something :) |
b75a7d8f A |
491 | return result; |
492 | } | |
493 | ||
374ca955 | 494 | |
b75a7d8f A |
495 | U_CAPI void U_EXPORT2 |
496 | ucol_setReqValidLocales(UCollator *coll, char *requestedLocaleToAdopt, char *validLocaleToAdopt) | |
497 | { | |
498 | if (coll) { | |
499 | if (coll->validLocale) { | |
500 | uprv_free(coll->validLocale); | |
374ca955 | 501 | } |
b75a7d8f A |
502 | coll->validLocale = validLocaleToAdopt; |
503 | if (coll->requestedLocale) { // should always have | |
504 | uprv_free(coll->requestedLocale); | |
374ca955 | 505 | } |
b75a7d8f A |
506 | coll->requestedLocale = requestedLocaleToAdopt; |
507 | } | |
508 | } | |
509 | ||
510 | U_CAPI void U_EXPORT2 | |
511 | ucol_close(UCollator *coll) | |
512 | { | |
374ca955 A |
513 | UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); |
514 | UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); | |
b75a7d8f | 515 | if(coll != NULL) { |
374ca955 A |
516 | // these are always owned by each UCollator struct, |
517 | // so we always free them | |
518 | if(coll->validLocale != NULL) { | |
519 | uprv_free(coll->validLocale); | |
520 | } | |
521 | if(coll->requestedLocale != NULL) { | |
522 | uprv_free(coll->requestedLocale); | |
523 | } | |
b75a7d8f | 524 | |
374ca955 A |
525 | /* Here, it would be advisable to close: */ |
526 | /* - UData for UCA (unless we stuff it in the root resb */ | |
527 | /* Again, do we need additional housekeeping... HMMM! */ | |
528 | UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose); | |
529 | if(coll->freeOnClose){ | |
530 | /* for safeClone, if freeOnClose is FALSE, | |
531 | don't free the other instance data */ | |
532 | if(coll->freeOptionsOnClose != FALSE) { | |
533 | if(coll->options != NULL) { | |
534 | uprv_free(coll->options); | |
535 | } | |
536 | } | |
537 | if(coll->mapping != NULL) { | |
538 | /*ucmpe32_close(coll->mapping);*/ | |
539 | uprv_free(coll->mapping); | |
540 | } | |
541 | if(coll->rules != NULL && coll->freeRulesOnClose) { | |
542 | uprv_free((UChar *)coll->rules); | |
543 | } | |
544 | if(coll->rb != NULL) { /* pointing to read-only memory */ | |
545 | ures_close(coll->rb); | |
546 | } | |
547 | if(coll->freeImageOnClose == TRUE) { | |
548 | uprv_free((UCATableHeader *)coll->image); | |
549 | } | |
550 | if(coll->elements != NULL) { | |
551 | ures_close(coll->elements); | |
552 | } | |
553 | if(coll->latinOneCEs != NULL) { | |
554 | uprv_free(coll->latinOneCEs); | |
555 | } | |
556 | uprv_free(coll); | |
b75a7d8f | 557 | } |
b75a7d8f | 558 | } |
374ca955 | 559 | UTRACE_EXIT(); |
b75a7d8f A |
560 | } |
561 | ||
562 | U_CAPI UCollator* U_EXPORT2 | |
563 | ucol_openRules( const UChar *rules, | |
564 | int32_t rulesLength, | |
565 | UColAttributeValue normalizationMode, | |
566 | UCollationStrength strength, | |
567 | UParseError *parseError, | |
568 | UErrorCode *status) | |
569 | { | |
570 | uint32_t listLen = 0; | |
571 | UColTokenParser src; | |
572 | UColAttributeValue norm; | |
573 | UParseError tErr; | |
374ca955 | 574 | |
b75a7d8f A |
575 | if(status == NULL || U_FAILURE(*status)){ |
576 | return 0; | |
577 | } | |
578 | ||
374ca955 A |
579 | u_init(status); |
580 | if (U_FAILURE(*status)) { | |
581 | return NULL; | |
582 | } | |
583 | ||
b75a7d8f A |
584 | if(rulesLength < -1 || (rules == NULL && rulesLength != 0)) { |
585 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
586 | return 0; | |
587 | } | |
588 | ||
589 | if(rulesLength == -1) { | |
590 | rulesLength = u_strlen(rules); | |
591 | } | |
592 | ||
593 | if(parseError == NULL){ | |
594 | parseError = &tErr; | |
595 | } | |
374ca955 | 596 | |
b75a7d8f A |
597 | switch(normalizationMode) { |
598 | case UCOL_OFF: | |
599 | case UCOL_ON: | |
600 | case UCOL_DEFAULT: | |
601 | norm = normalizationMode; | |
602 | break; | |
603 | default: | |
604 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
605 | return 0; | |
606 | } | |
607 | ||
374ca955 | 608 | UCollator *UCA = ucol_initUCA(status); |
b75a7d8f A |
609 | |
610 | if(U_FAILURE(*status)){ | |
611 | return NULL; | |
612 | } | |
613 | ||
614 | ucol_tok_initTokenList(&src, rules, rulesLength, UCA, status); | |
615 | listLen = ucol_tok_assembleTokenList(&src,parseError, status); | |
616 | ||
617 | if(U_FAILURE(*status)) { | |
618 | /* if status is U_ILLEGAL_ARGUMENT_ERROR, src->current points at the offending option */ | |
619 | /* if status is U_INVALID_FORMAT_ERROR, src->current points after the problematic part of the rules */ | |
620 | /* so something might be done here... or on lower level */ | |
621 | #ifdef UCOL_DEBUG | |
622 | if(*status == U_ILLEGAL_ARGUMENT_ERROR) { | |
623 | fprintf(stderr, "bad option starting at offset %i\n", src.current-src.source); | |
624 | } else { | |
625 | fprintf(stderr, "invalid rule just before offset %i\n", src.current-src.source); | |
626 | } | |
627 | #endif | |
628 | ucol_tok_closeTokenList(&src); | |
629 | return NULL; | |
630 | } | |
631 | UCollator *result = NULL; | |
632 | UCATableHeader *table = NULL; | |
633 | ||
634 | if(src.resultLen > 0 || src.removeSet != NULL) { /* we have a set of rules, let's make something of it */ | |
635 | /* also, if we wanted to remove some contractions, we should make a tailoring */ | |
636 | table = ucol_assembleTailoringTable(&src, status); | |
637 | if(U_SUCCESS(*status)) { | |
638 | // builder version | |
639 | table->version[0] = UCOL_BUILDER_VERSION; | |
640 | // no tailoring information on this level | |
641 | table->version[1] = table->version[2] = table->version[3] = 0; | |
642 | // set UCD version | |
643 | u_getUnicodeVersion(table->UCDVersion); | |
644 | // set UCA version | |
645 | uprv_memcpy(table->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)); | |
374ca955 | 646 | result = ucol_initCollator(table, 0, UCA, status); |
b75a7d8f A |
647 | result->hasRealData = TRUE; |
648 | result->freeImageOnClose = TRUE; | |
649 | } | |
650 | } else { /* no rules, but no error either */ | |
651 | // must be only options | |
374ca955 A |
652 | // We will init the collator from UCA |
653 | result = ucol_initCollator(UCA->image, 0, UCA, status); | |
b75a7d8f A |
654 | // And set only the options |
655 | UColOptionSet *opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet)); | |
656 | /* test for NULL */ | |
657 | if (opts == NULL) { | |
658 | *status = U_MEMORY_ALLOCATION_ERROR; | |
659 | goto cleanup; | |
660 | } | |
661 | uprv_memcpy(opts, src.opts, sizeof(UColOptionSet)); | |
662 | ucol_setOptionsFromHeader(result, opts, status); | |
663 | result->freeOptionsOnClose = TRUE; | |
664 | result->hasRealData = FALSE; | |
665 | result->freeImageOnClose = FALSE; | |
666 | } | |
667 | ||
668 | if(U_SUCCESS(*status)) { | |
669 | UChar *newRules; | |
670 | result->dataInfo.dataVersion[0] = UCOL_BUILDER_VERSION; | |
671 | if(rulesLength > 0) { | |
672 | newRules = (UChar *)uprv_malloc((rulesLength+1)*U_SIZEOF_UCHAR); | |
673 | /* test for NULL */ | |
674 | if (newRules == NULL) { | |
675 | *status = U_MEMORY_ALLOCATION_ERROR; | |
676 | goto cleanup; | |
677 | } | |
678 | uprv_memcpy(newRules, rules, rulesLength*U_SIZEOF_UCHAR); | |
679 | newRules[rulesLength]=0; | |
680 | result->rules = newRules; | |
681 | result->rulesLength = rulesLength; | |
682 | result->freeRulesOnClose = TRUE; | |
683 | } | |
684 | result->rb = NULL; | |
685 | result->elements = NULL; | |
686 | result->validLocale = NULL; | |
687 | result->requestedLocale = NULL; | |
688 | ucol_setAttribute(result, UCOL_STRENGTH, strength, status); | |
689 | ucol_setAttribute(result, UCOL_NORMALIZATION_MODE, norm, status); | |
690 | } else { | |
691 | cleanup: | |
692 | if(result != NULL) { | |
693 | ucol_close(result); | |
694 | } else { | |
695 | if(table != NULL) { | |
696 | uprv_free(table); | |
697 | } | |
698 | } | |
699 | result = NULL; | |
700 | } | |
701 | ||
702 | ucol_tok_closeTokenList(&src); | |
703 | ||
704 | return result; | |
705 | } | |
706 | ||
707 | /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/ | |
708 | /* you should be able to get the binary chunk to write out... Doesn't look very full now */ | |
709 | U_CAPI uint8_t* U_EXPORT2 | |
710 | ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status) | |
711 | { | |
712 | uint8_t *result = NULL; | |
713 | if(U_FAILURE(*status)) { | |
714 | return NULL; | |
715 | } | |
716 | if(coll->hasRealData == TRUE) { | |
717 | *length = coll->image->size; | |
718 | result = (uint8_t *)uprv_malloc(*length); | |
719 | /* test for NULL */ | |
720 | if (result == NULL) { | |
721 | *status = U_MEMORY_ALLOCATION_ERROR; | |
722 | return NULL; | |
723 | } | |
724 | uprv_memcpy(result, coll->image, *length); | |
725 | } else { | |
726 | *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); | |
727 | result = (uint8_t *)uprv_malloc(*length); | |
728 | /* test for NULL */ | |
729 | if (result == NULL) { | |
730 | *status = U_MEMORY_ALLOCATION_ERROR; | |
731 | return NULL; | |
732 | } | |
374ca955 A |
733 | |
734 | /* build the UCATableHeader with minimal entries */ | |
735 | /* do not copy the header from the UCA file because its values are wrong! */ | |
736 | /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ | |
737 | ||
738 | /* reset everything */ | |
739 | uprv_memset(result, 0, *length); | |
740 | ||
741 | /* set the tailoring-specific values */ | |
742 | UCATableHeader *myData = (UCATableHeader *)result; | |
743 | myData->size = *length; | |
744 | ||
745 | /* offset for the options, the only part of the data that is present after the header */ | |
746 | myData->options = sizeof(UCATableHeader); | |
747 | ||
748 | /* need to always set the expansion value for an upper bound of the options */ | |
749 | myData->expansion = myData->options + sizeof(UColOptionSet); | |
750 | ||
751 | myData->magic = UCOL_HEADER_MAGIC; | |
752 | myData->isBigEndian = U_IS_BIG_ENDIAN; | |
753 | myData->charSetFamily = U_CHARSET_FAMILY; | |
754 | ||
755 | /* copy UCA's version; genrb will override all but the builder version with tailoring data */ | |
756 | uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); | |
757 | ||
758 | uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); | |
759 | uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); | |
760 | uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); | |
761 | myData->jamoSpecial = coll->image->jamoSpecial; | |
762 | ||
763 | /* copy the collator options */ | |
b75a7d8f A |
764 | uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); |
765 | } | |
766 | return result; | |
767 | } | |
768 | ||
769 | void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) { | |
770 | if(U_FAILURE(*status)) { | |
771 | return; | |
772 | } | |
773 | result->caseFirst = (UColAttributeValue)opts->caseFirst; | |
774 | result->caseLevel = (UColAttributeValue)opts->caseLevel; | |
775 | result->frenchCollation = (UColAttributeValue)opts->frenchCollation; | |
776 | result->normalizationMode = (UColAttributeValue)opts->normalizationMode; | |
777 | result->strength = (UColAttributeValue)opts->strength; | |
778 | result->variableTopValue = opts->variableTopValue; | |
779 | result->alternateHandling = (UColAttributeValue)opts->alternateHandling; | |
780 | result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ; | |
781 | result->numericCollation = (UColAttributeValue)opts->numericCollation; | |
782 | ||
783 | result->caseFirstisDefault = TRUE; | |
784 | result->caseLevelisDefault = TRUE; | |
785 | result->frenchCollationisDefault = TRUE; | |
786 | result->normalizationModeisDefault = TRUE; | |
787 | result->strengthisDefault = TRUE; | |
788 | result->variableTopValueisDefault = TRUE; | |
789 | result->hiraganaQisDefault = TRUE; | |
790 | result->numericCollationisDefault = TRUE; | |
791 | ||
792 | ucol_updateInternalState(result, status); | |
793 | ||
794 | result->options = opts; | |
795 | } | |
796 | ||
#if 0
// Disabled: doesn't look like anybody is using this.
// Copies the attribute values of a collator into an option set
// (the reverse direction of ucol_setOptionsFromHeader).
void ucol_putOptionsToHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return;
    }
    opts->strength          = result->strength;
    opts->normalizationMode = result->normalizationMode;
    opts->frenchCollation   = result->frenchCollation;
    opts->caseFirst         = result->caseFirst;
    opts->caseLevel         = result->caseLevel;
    opts->alternateHandling = result->alternateHandling;
    opts->hiraganaQ         = result->hiraganaQ;
    opts->numericCollation  = result->numericCollation;
    opts->variableTopValue  = result->variableTopValue;
}
#endif
814 | ||
b75a7d8f A |
815 | |
816 | /** | |
817 | * Approximate determination if a character is at a contraction end. | |
818 | * Guaranteed to be TRUE if a character is at the end of a contraction, | |
819 | * otherwise it is not deterministic. | |
820 | * @param c character to be determined | |
821 | * @param coll collator | |
822 | */ | |
823 | static | |
824 | inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) { | |
825 | if (UTF_IS_TRAIL(c)) { | |
826 | return TRUE; | |
827 | } | |
828 | ||
829 | if (c < coll->minContrEndCP) { | |
830 | return FALSE; | |
831 | } | |
832 | ||
833 | int32_t hash = c; | |
834 | uint8_t htbyte; | |
835 | if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { | |
836 | hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; | |
837 | } | |
838 | htbyte = coll->contrEndCP[hash>>3]; | |
839 | return (((htbyte >> (hash & 7)) & 1) == 1); | |
840 | } | |
841 | ||
842 | ||
843 | ||
844 | /* | |
845 | * i_getCombiningClass() | |
846 | * A fast, at least partly inline version of u_getCombiningClass() | |
847 | * This is a candidate for further optimization. Used heavily | |
848 | * in contraction processing. | |
849 | */ | |
850 | static | |
851 | inline uint8_t i_getCombiningClass(UChar c, const UCollator *coll) { | |
852 | uint8_t sCC = 0; | |
853 | if (c >= 0x300 && ucol_unsafeCP(c, coll)) { | |
854 | sCC = u_getCombiningClass(c); | |
855 | } | |
856 | return sCC; | |
857 | } | |
858 | ||
859 | ||
374ca955 | 860 | UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) { |
b75a7d8f A |
861 | UChar c; |
862 | UCollator *result = fillIn; | |
863 | if(U_FAILURE(*status) || image == NULL) { | |
864 | return NULL; | |
865 | } | |
866 | ||
867 | if(result == NULL) { | |
868 | result = (UCollator *)uprv_malloc(sizeof(UCollator)); | |
869 | if(result == NULL) { | |
870 | *status = U_MEMORY_ALLOCATION_ERROR; | |
871 | return result; | |
872 | } | |
873 | result->freeOnClose = TRUE; | |
874 | } else { | |
875 | result->freeOnClose = FALSE; | |
876 | } | |
877 | ||
878 | result->image = image; | |
879 | const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition; | |
880 | /*CompactEIntArray *newUCAmapping = ucmpe32_openFromData(&mapping, status);*/ | |
374ca955 | 881 | UTrie *newUCAmapping = (UTrie *)uprv_malloc(sizeof(UTrie)); |
b75a7d8f A |
882 | if(newUCAmapping != NULL) { |
883 | utrie_unserialize(newUCAmapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status); | |
884 | } else { | |
885 | *status = U_MEMORY_ALLOCATION_ERROR; | |
886 | if(result->freeOnClose == TRUE) { | |
887 | uprv_free(result); | |
888 | result = NULL; | |
889 | } | |
890 | return result; | |
891 | } | |
892 | if(U_SUCCESS(*status)) { | |
893 | result->mapping = newUCAmapping; | |
894 | } else { | |
895 | if(result->freeOnClose == TRUE) { | |
896 | uprv_free(result); | |
897 | result = NULL; | |
898 | } | |
899 | uprv_free(newUCAmapping); | |
900 | return result; | |
901 | } | |
902 | ||
903 | /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/ | |
904 | result->latinOneMapping = UTRIE_GET32_LATIN1(result->mapping); | |
905 | result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs); | |
906 | result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex); | |
907 | result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion); | |
908 | ||
909 | result->options = (UColOptionSet*)((uint8_t*)result->image+result->image->options); | |
910 | result->freeOptionsOnClose = FALSE; | |
911 | ||
912 | /* set attributes */ | |
913 | result->caseFirst = (UColAttributeValue)result->options->caseFirst; | |
914 | result->caseLevel = (UColAttributeValue)result->options->caseLevel; | |
915 | result->frenchCollation = (UColAttributeValue)result->options->frenchCollation; | |
916 | result->normalizationMode = (UColAttributeValue)result->options->normalizationMode; | |
917 | result->strength = (UColAttributeValue)result->options->strength; | |
918 | result->variableTopValue = result->options->variableTopValue; | |
919 | result->alternateHandling = (UColAttributeValue)result->options->alternateHandling; | |
920 | result->hiraganaQ = (UColAttributeValue)result->options->hiraganaQ; | |
921 | result->numericCollation = (UColAttributeValue)result->options->numericCollation; | |
922 | ||
923 | result->caseFirstisDefault = TRUE; | |
924 | result->caseLevelisDefault = TRUE; | |
925 | result->frenchCollationisDefault = TRUE; | |
926 | result->normalizationModeisDefault = TRUE; | |
927 | result->strengthisDefault = TRUE; | |
928 | result->variableTopValueisDefault = TRUE; | |
929 | result->alternateHandlingisDefault = TRUE; | |
930 | result->hiraganaQisDefault = TRUE; | |
931 | result->numericCollationisDefault = TRUE; | |
932 | ||
933 | result->scriptOrder = NULL; | |
934 | ||
935 | result->rules = NULL; | |
936 | result->rulesLength = 0; | |
937 | ||
938 | /* get the version info from UCATableHeader and populate the Collator struct*/ | |
939 | result->dataInfo.dataVersion[0] = result->image->version[0]; /* UCA Builder version*/ | |
940 | result->dataInfo.dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/ | |
941 | ||
942 | result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP; | |
943 | result->minUnsafeCP = 0; | |
944 | for (c=0; c<0x300; c++) { // Find the smallest unsafe char. | |
945 | if (ucol_unsafeCP(c, result)) break; | |
946 | } | |
947 | result->minUnsafeCP = c; | |
948 | ||
949 | result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP; | |
950 | result->minContrEndCP = 0; | |
951 | for (c=0; c<0x300; c++) { // Find the Contraction-ending char. | |
952 | if (ucol_contractionEndCP(c, result)) break; | |
953 | } | |
954 | result->minContrEndCP = c; | |
955 | ||
956 | /* max expansion tables */ | |
957 | result->endExpansionCE = (uint32_t*)((uint8_t*)result->image + | |
958 | result->image->endExpansionCE); | |
959 | result->lastEndExpansionCE = result->endExpansionCE + | |
960 | result->image->endExpansionCECount - 1; | |
961 | result->expansionCESize = (uint8_t*)result->image + | |
962 | result->image->expansionCESize; | |
963 | ||
b75a7d8f A |
964 | |
965 | //result->errorCode = *status; | |
966 | ||
967 | result->latinOneCEs = NULL; | |
968 | ||
969 | result->latinOneRegenTable = FALSE; | |
970 | result->latinOneFailed = FALSE; | |
374ca955 | 971 | result->UCA = UCA; |
b75a7d8f A |
972 | |
973 | ucol_updateInternalState(result, status); | |
974 | ||
975 | ||
976 | return result; | |
977 | } | |
978 | ||
374ca955 | 979 | /* new Mark's code */ |
b75a7d8f | 980 | |
374ca955 A |
981 | /** |
982 | * For generation of Implicit CEs | |
983 | * @author Davis | |
984 | * | |
985 | * Cleaned up so that changes can be made more easily. | |
986 | * Old values: | |
987 | # First Implicit: E26A792D | |
988 | # Last Implicit: E3DC70C0 | |
989 | # First CJK: E0030300 | |
990 | # Last CJK: E0A9DD00 | |
991 | # First CJK_A: E0A9DF00 | |
992 | # Last CJK_A: E0DE3100 | |
993 | */ | |
b75a7d8f | 994 | /* Following is a port of Mark's code for new treatment of implicits. |
374ca955 | 995 | * It is positioned here, since ucol_initUCA needs to initialize the |
b75a7d8f A |
996 | * variables below according to the data in the fractional UCA. |
997 | */ | |
374ca955 | 998 | |
b75a7d8f | 999 | /** |
374ca955 A |
1000 | * Function used to: |
1001 | * a) collapse the 2 different Han ranges from UCA into one (in the right order), and | |
1002 | * b) bump any non-CJK characters by 10FFFF. | |
1003 | * The relevant blocks are: | |
1004 | * A: 4E00..9FFF; CJK Unified Ideographs | |
1005 | * F900..FAFF; CJK Compatibility Ideographs | |
1006 | * B: 3400..4DBF; CJK Unified Ideographs Extension A | |
1007 | * 20000..XX; CJK Unified Ideographs Extension B (and others later on) | |
1008 | * As long as | |
1009 | * no new B characters are allocated between 4E00 and FAFF, and | |
1010 | * no new A characters are outside of this range, | |
1011 | * (very high probability) this simple code will work. | |
1012 | * The reordered blocks are: | |
1013 | * Block1 is CJK | |
1014 | * Block2 is CJK_COMPAT_USED | |
1015 | * Block3 is CJK_A | |
1016 | * (all contiguous) | |
1017 | * Any other CJK gets its normal code point | |
1018 | * Any non-CJK gets +10FFFF | |
1019 | * When we reorder Block1, we make sure that it is at the very start, | |
1020 | * so that it will use a 3-byte form. | |
1021 | * Warning: we only pick up the compatibility characters that are | |
1022 | * NOT decomposed, so that block is smaller! | |
1023 | */ | |
b75a7d8f A |
1024 | |
1025 | // CONSTANTS | |
374ca955 | 1026 | static const UChar32 |
b75a7d8f | 1027 | NON_CJK_OFFSET = 0x110000, |
374ca955 A |
1028 | UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2 |
1029 | ||
1030 | /** | |
1031 | * Precomputed by constructor | |
1032 | */ | |
1033 | static int32_t | |
1034 | final3Multiplier = 0, | |
1035 | final4Multiplier = 0, | |
1036 | final3Count = 0, | |
1037 | final4Count = 0, | |
1038 | medialCount = 0, | |
1039 | min3Primary = 0, | |
1040 | min4Primary = 0, | |
1041 | max4Primary = 0, | |
1042 | minTrail = 0, | |
1043 | maxTrail = 0, | |
1044 | max3Trail = 0, | |
1045 | max4Trail = 0, | |
1046 | min4Boundary = 0; | |
b75a7d8f A |
1047 | |
1048 | static const UChar32 | |
1049 | CJK_BASE = 0x4E00, | |
1050 | CJK_LIMIT = 0x9FFF+1, | |
1051 | CJK_COMPAT_USED_BASE = 0xFA0E, | |
1052 | CJK_COMPAT_USED_LIMIT = 0xFA2F+1, | |
1053 | CJK_A_BASE = 0x3400, | |
1054 | CJK_A_LIMIT = 0x4DBF+1, | |
1055 | CJK_B_BASE = 0x20000, | |
1056 | CJK_B_LIMIT = 0x2A6DF+1; | |
1057 | ||
374ca955 A |
1058 | static UChar32 swapCJK(UChar32 i) { |
1059 | ||
1060 | if (i >= CJK_BASE) { | |
1061 | if (i < CJK_LIMIT) return i - CJK_BASE; | |
1062 | ||
1063 | if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET; | |
1064 | ||
1065 | if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE | |
1066 | + (CJK_LIMIT - CJK_BASE); | |
1067 | if (i < CJK_B_BASE) return i + NON_CJK_OFFSET; | |
1068 | ||
1069 | if (i < CJK_B_LIMIT) return i; // non-BMP-CJK | |
1070 | ||
1071 | return i + NON_CJK_OFFSET; // non-CJK | |
1072 | } | |
1073 | if (i < CJK_A_BASE) return i + NON_CJK_OFFSET; | |
1074 | ||
1075 | if (i < CJK_A_LIMIT) return i - CJK_A_BASE | |
1076 | + (CJK_LIMIT - CJK_BASE) | |
1077 | + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); | |
1078 | return i + NON_CJK_OFFSET; // non-CJK | |
1079 | } | |
1080 | ||
1081 | U_CAPI UChar32 U_EXPORT2 | |
1082 | uprv_uca_getRawFromCodePoint(UChar32 i) { | |
1083 | return swapCJK(i)+1; | |
1084 | } | |
1085 | ||
1086 | U_CAPI UChar32 U_EXPORT2 | |
1087 | uprv_uca_getCodePointFromRaw(UChar32 i) { | |
1088 | i--; | |
1089 | UChar32 result = 0; | |
1090 | if(i >= NON_CJK_OFFSET) { | |
1091 | result = i - NON_CJK_OFFSET; | |
1092 | } else if(i >= CJK_B_BASE) { | |
1093 | result = i; | |
1094 | } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted | |
1095 | if(i < CJK_LIMIT - CJK_BASE) { | |
1096 | result = i + CJK_BASE; | |
1097 | } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { | |
1098 | result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE); | |
1099 | } else { | |
1100 | result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); | |
1101 | } | |
1102 | } else { | |
1103 | result = -1; | |
1104 | } | |
1105 | return result; | |
b75a7d8f | 1106 | } |
b75a7d8f A |
1107 | |
1108 | // GET IMPLICIT PRIMARY WEIGHTS | |
1109 | // Return value is left justified primary key | |
374ca955 A |
1110 | U_CAPI uint32_t U_EXPORT2 |
1111 | uprv_uca_getImplicitFromRaw(UChar32 cp) { | |
1112 | /* | |
1113 | if (cp < 0 || cp > UCOL_MAX_INPUT) { | |
1114 | throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp)); | |
1115 | } | |
1116 | */ | |
1117 | int32_t last0 = cp - min4Boundary; | |
1118 | if (last0 < 0) { | |
1119 | int32_t last1 = cp / final3Count; | |
1120 | last0 = cp % final3Count; | |
b75a7d8f | 1121 | |
374ca955 A |
1122 | int32_t last2 = last1 / medialCount; |
1123 | last1 %= medialCount; | |
b75a7d8f | 1124 | |
374ca955 A |
1125 | last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start |
1126 | last1 = minTrail + last1; // offset | |
1127 | last2 = min3Primary + last2; // offset | |
b75a7d8f | 1128 | /* |
374ca955 A |
1129 | if (last2 >= min4Primary) { |
1130 | throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2)); | |
1131 | } | |
b75a7d8f | 1132 | */ |
374ca955 | 1133 | return (last2 << 24) + (last1 << 16) + (last0 << 8); |
b75a7d8f | 1134 | } else { |
374ca955 A |
1135 | int32_t last1 = last0 / final4Count; |
1136 | last0 %= final4Count; | |
b75a7d8f | 1137 | |
374ca955 A |
1138 | int32_t last2 = last1 / medialCount; |
1139 | last1 %= medialCount; | |
1140 | ||
1141 | int32_t last3 = last2 / medialCount; | |
1142 | last2 %= medialCount; | |
1143 | ||
1144 | last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start | |
1145 | last1 = minTrail + last1; // offset | |
1146 | last2 = minTrail + last2; // offset | |
1147 | last3 = min4Primary + last3; // offset | |
b75a7d8f | 1148 | /* |
374ca955 A |
1149 | if (last3 > max4Primary) { |
1150 | throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3)); | |
1151 | } | |
b75a7d8f | 1152 | */ |
374ca955 | 1153 | return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0; |
b75a7d8f A |
1154 | } |
1155 | } | |
1156 | ||
374ca955 A |
1157 | U_CAPI uint32_t U_EXPORT2 |
1158 | uprv_uca_getImplicitPrimary(UChar32 cp) { | |
1159 | //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp)); | |
1160 | ||
1161 | cp = swapCJK(cp); | |
1162 | cp++; | |
1163 | // we now have a range of numbers from 0 to 21FFFF. | |
1164 | ||
1165 | //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); | |
1166 | ||
1167 | return uprv_uca_getImplicitFromRaw(cp); | |
1168 | } | |
1169 | ||
1170 | /** | |
1171 | * Converts implicit CE into raw integer ("code point") | |
1172 | * @param implicit | |
1173 | * @return -1 if illegal format | |
1174 | */ | |
1175 | U_CAPI UChar32 U_EXPORT2 | |
1176 | uprv_uca_getRawFromImplicit(uint32_t implicit) { | |
1177 | UChar32 result; | |
1178 | UChar32 b3 = implicit & 0xFF; | |
1179 | implicit >>= 8; | |
1180 | UChar32 b2 = implicit & 0xFF; | |
1181 | implicit >>= 8; | |
1182 | UChar32 b1 = implicit & 0xFF; | |
1183 | implicit >>= 8; | |
1184 | UChar32 b0 = implicit & 0xFF; | |
1185 | ||
1186 | // simple parameter checks | |
1187 | if (b0 < min3Primary || b0 > max4Primary | |
1188 | || b1 < minTrail || b1 > maxTrail) return -1; | |
1189 | // normal offsets | |
1190 | b1 -= minTrail; | |
1191 | ||
1192 | // take care of the final values, and compose | |
1193 | if (b0 < min4Primary) { | |
1194 | if (b2 < minTrail || b2 > max3Trail || b3 != 0) return -1; | |
1195 | b2 -= minTrail; | |
1196 | UChar32 remainder = b2 % final3Multiplier; | |
1197 | if (remainder != 0) return -1; | |
1198 | b0 -= min3Primary; | |
1199 | b2 /= final3Multiplier; | |
1200 | result = ((b0 * medialCount) + b1) * final3Count + b2; | |
1201 | } else { | |
1202 | if (b2 < minTrail || b2 > maxTrail | |
1203 | || b3 < minTrail || b3 > max4Trail) return -1; | |
1204 | b2 -= minTrail; | |
1205 | b3 -= minTrail; | |
1206 | UChar32 remainder = b3 % final4Multiplier; | |
1207 | if (remainder != 0) return -1; | |
1208 | b3 /= final4Multiplier; | |
1209 | b0 -= min4Primary; | |
1210 | result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary; | |
1211 | } | |
1212 | // final check | |
1213 | if (result < 0 || result > UCOL_MAX_INPUT) return -1; | |
1214 | return result; | |
1215 | } | |
1216 | ||
1217 | ||
/**
 * Integer division rounding up, computed as ((a - 1) / b) + 1.
 * This is the ceiling of a/b for a >= 1 and b >= 1; other inputs just follow
 * the formula (for example a == 0 with b > 1 yields 1).
 */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int stepsBelow = (a - 1) / b;
    return stepsBelow + 1;
}
1221 | ||
b75a7d8f A |
1222 | /* this function is either called from initUCA or from genUCA before |
1223 | * doing canonical closure for the UCA. | |
1224 | */ | |
374ca955 A |
1225 | |
1226 | /** | |
1227 | * Set up to generate implicits. | |
1228 | * @param minPrimary | |
1229 | * @param maxPrimary | |
1230 | * @param minTrail final byte | |
1231 | * @param maxTrail final byte | |
1232 | * @param gap3 the gap we leave for tailoring for 3-byte forms | |
1233 | * @param gap4 the gap we leave for tailoring for 4-byte forms | |
1234 | */ | |
1235 | static void initImplicitConstants(int minPrimary, int maxPrimary, | |
1236 | int minTrailIn, int maxTrailIn, | |
1237 | int gap3, int primaries3count, | |
1238 | UErrorCode *status) { | |
1239 | // some simple parameter checks | |
1240 | if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) { | |
1241 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
1242 | return; | |
1243 | }; | |
1244 | if (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) { | |
1245 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
1246 | return; | |
1247 | }; | |
1248 | if (primaries3count < 1) { | |
1249 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
1250 | return; | |
1251 | }; | |
1252 | ||
1253 | minTrail = minTrailIn; | |
1254 | maxTrail = maxTrailIn; | |
1255 | ||
1256 | min3Primary = minPrimary; | |
1257 | max4Primary = maxPrimary; | |
1258 | // compute constants for use later. | |
1259 | // number of values we can use in trailing bytes | |
1260 | // leave room for empty values between AND above, e.g. if gap = 2 | |
1261 | // range 3..7 => +3 -4 -5 -6 -7: so 1 value | |
1262 | // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values | |
1263 | // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values | |
1264 | final3Multiplier = gap3 + 1; | |
1265 | final3Count = (maxTrail - minTrail + 1) / final3Multiplier; | |
1266 | max3Trail = minTrail + (final3Count - 1) * final3Multiplier; | |
1267 | ||
1268 | // medials can use full range | |
1269 | medialCount = (maxTrail - minTrail + 1); | |
1270 | // find out how many values fit in each form | |
1271 | int32_t threeByteCount = medialCount * final3Count; | |
1272 | // now determine where the 3/4 boundary is. | |
1273 | // we use 3 bytes below the boundary, and 4 above | |
1274 | int32_t primariesAvailable = maxPrimary - minPrimary + 1; | |
1275 | int32_t primaries4count = primariesAvailable - primaries3count; | |
1276 | ||
1277 | ||
1278 | int32_t min3ByteCoverage = primaries3count * threeByteCount; | |
1279 | min4Primary = minPrimary + primaries3count; | |
1280 | min4Boundary = min3ByteCoverage; | |
1281 | // Now expand out the multiplier for the 4 bytes, and redo. | |
1282 | ||
1283 | int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary; | |
1284 | int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count); | |
1285 | //if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte); | |
1286 | int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount); | |
1287 | //if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte); | |
1288 | int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte; | |
1289 | //if (DEBUG) System.out.println("expandedGap: " + gap4); | |
1290 | if (gap4 < 1) { | |
1291 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
1292 | return; | |
1293 | } | |
1294 | final4Multiplier = gap4 + 1; | |
1295 | final4Count = neededPerFinalByte; | |
1296 | max4Trail = minTrail + (final4Count - 1) * final4Multiplier; | |
1297 | /* | |
1298 | if (DEBUG) { | |
1299 | System.out.println("final4Count: " + final4Count); | |
1300 | for (int counter = 0; counter <= final4Count; ++counter) { | |
1301 | int value = minTrail + (1 + counter)*final4Multiplier; | |
1302 | System.out.println(counter + "\t" + value + "\t" + Utility.hex(value)); | |
1303 | } | |
1304 | } | |
1305 | */ | |
1306 | } | |
1307 | ||
1308 | /** | |
1309 | * Supply parameters for generating implicit CEs | |
1310 | */ | |
1311 | U_CAPI void U_EXPORT2 | |
1312 | uprv_uca_initImplicitConstants(int32_t minPrimary, int32_t maxPrimary, UErrorCode *status) { | |
1313 | // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms. | |
1314 | initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status); | |
1315 | } | |
1316 | ||
1317 | U_CDECL_BEGIN | |
1318 | static UBool U_CALLCONV | |
1319 | ucol_cleanup(void) | |
b75a7d8f | 1320 | { |
374ca955 A |
1321 | if (UCA_DATA_MEM) { |
1322 | udata_close(UCA_DATA_MEM); | |
1323 | UCA_DATA_MEM = NULL; | |
1324 | } | |
1325 | if (_staticUCA) { | |
1326 | ucol_close(_staticUCA); | |
1327 | _staticUCA = NULL; | |
1328 | } | |
1329 | fcdTrieIndex = NULL; | |
1330 | return TRUE; | |
b75a7d8f | 1331 | } |
374ca955 A |
1332 | U_CDECL_END |
1333 | ||
b75a7d8f A |
1334 | /* do not close UCA returned by ucol_initUCA! */ |
1335 | UCollator * | |
1336 | ucol_initUCA(UErrorCode *status) { | |
1337 | if(U_FAILURE(*status)) { | |
1338 | return NULL; | |
1339 | } | |
1340 | umtx_lock(NULL); | |
374ca955 | 1341 | UBool f = (_staticUCA == NULL); |
b75a7d8f | 1342 | umtx_unlock(NULL); |
374ca955 | 1343 | |
b75a7d8f A |
1344 | if(f) { |
1345 | UCollator *newUCA = NULL; | |
1346 | UDataMemory *result = udata_openChoice(NULL, UCA_DATA_TYPE, UCA_DATA_NAME, isAcceptableUCA, NULL, status); | |
374ca955 | 1347 | |
b75a7d8f A |
1348 | if(U_FAILURE(*status)) { |
1349 | if (result) { | |
1350 | udata_close(result); | |
1351 | } | |
1352 | uprv_free(newUCA); | |
1353 | } | |
374ca955 A |
1354 | |
1355 | // init FCD data | |
1356 | if (fcdTrieIndex == NULL) { | |
1357 | fcdTrieIndex = unorm_getFCDTrie(status); | |
1358 | ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); | |
1359 | } | |
1360 | ||
b75a7d8f | 1361 | if(result != NULL) { /* It looks like sometimes we can fail to find the data file */ |
374ca955 | 1362 | newUCA = ucol_initCollator((const UCATableHeader *)udata_getMemory(result), newUCA, newUCA, status); |
b75a7d8f A |
1363 | if(U_SUCCESS(*status)){ |
1364 | newUCA->rb = NULL; | |
374ca955 A |
1365 | newUCA->elements = NULL; |
1366 | newUCA->validLocale = NULL; | |
1367 | newUCA->requestedLocale = NULL; | |
1368 | newUCA->hasRealData = FALSE; // real data lives in .dat file... | |
b75a7d8f A |
1369 | newUCA->freeImageOnClose = FALSE; |
1370 | umtx_lock(NULL); | |
374ca955 A |
1371 | if(_staticUCA == NULL) { |
1372 | _staticUCA = newUCA; | |
b75a7d8f A |
1373 | UCA_DATA_MEM = result; |
1374 | result = NULL; | |
1375 | newUCA = NULL; | |
1376 | } | |
1377 | umtx_unlock(NULL); | |
374ca955 | 1378 | |
b75a7d8f A |
1379 | if(newUCA != NULL) { |
1380 | udata_close(result); | |
1381 | uprv_free(newUCA); | |
1382 | } | |
1383 | else { | |
374ca955 | 1384 | ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); |
b75a7d8f A |
1385 | } |
1386 | // Initalize variables for implicit generation | |
374ca955 A |
1387 | const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)_staticUCA->image + _staticUCA->image->UCAConsts); |
1388 | uprv_uca_initImplicitConstants(UCAconsts->UCA_PRIMARY_IMPLICIT_MIN, UCAconsts->UCA_PRIMARY_IMPLICIT_MAX, status); | |
1389 | _staticUCA->mapping->getFoldingOffset = _getFoldingOffset; | |
b75a7d8f A |
1390 | }else{ |
1391 | udata_close(result); | |
1392 | uprv_free(newUCA); | |
374ca955 | 1393 | _staticUCA= NULL; |
b75a7d8f A |
1394 | } |
1395 | } | |
1396 | } | |
374ca955 | 1397 | return _staticUCA; |
b75a7d8f A |
1398 | } |
1399 | ||
1400 | ||
1401 | /* collIterNormalize Incremental Normalization happens here. */ | |
1402 | /* pick up the range of chars identifed by FCD, */ | |
1403 | /* normalize it into the collIterate's writable buffer, */ | |
1404 | /* switch the collIterate's state to use the writable buffer. */ | |
1405 | /* */ | |
1406 | static | |
1407 | void collIterNormalize(collIterate *collationSource) | |
1408 | { | |
1409 | UErrorCode status = U_ZERO_ERROR; | |
1410 | ||
1411 | int32_t normLen; | |
1412 | UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */ | |
1413 | UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */ | |
1414 | ||
1415 | normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize, | |
1416 | srcP, (int32_t)(endP - srcP), | |
1417 | FALSE, 0, | |
1418 | &status); | |
1419 | if(status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) { | |
1420 | // reallocate and terminate | |
1421 | if(!u_growBufferFromStatic(collationSource->stackWritableBuffer, | |
1422 | &collationSource->writableBuffer, | |
1423 | (int32_t *)&collationSource->writableBufSize, normLen + 1, | |
1424 | 0) | |
1425 | ) { | |
1426 | #ifdef UCOL_DEBUG | |
1427 | fprintf(stderr, "collIterNormalize(), out of memory\n"); | |
1428 | #endif | |
1429 | return; | |
1430 | } | |
1431 | status = U_ZERO_ERROR; | |
1432 | normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize, | |
1433 | srcP, (int32_t)(endP - srcP), | |
1434 | FALSE, 0, | |
1435 | &status); | |
1436 | } | |
1437 | if (U_FAILURE(status)) { | |
1438 | #ifdef UCOL_DEBUG | |
1439 | fprintf(stderr, "collIterNormalize(), unorm_decompose() failed, status = %s\n", u_errorName(status)); | |
1440 | #endif | |
1441 | return; | |
1442 | } | |
1443 | ||
1444 | if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { | |
1445 | collationSource->flags |= UCOL_ITER_ALLOCATED; | |
1446 | } | |
1447 | collationSource->pos = collationSource->writableBuffer; | |
1448 | collationSource->origFlags = collationSource->flags; | |
1449 | collationSource->flags |= UCOL_ITER_INNORMBUF; | |
1450 | collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); | |
1451 | } | |
1452 | ||
1453 | ||
1454 | // This function takes the iterator and extracts normalized stuff up to the next boundary | |
1455 | // It is similar in the end results to the collIterNormalize, but for the cases when we | |
1456 | // use an iterator | |
1457 | static | |
1458 | inline void normalizeIterator(collIterate *collationSource) { | |
1459 | UErrorCode status = U_ZERO_ERROR; | |
1460 | UBool wasNormalized = FALSE; | |
1461 | //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT); | |
1462 | uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator); | |
374ca955 | 1463 | int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, |
b75a7d8f A |
1464 | (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); |
1465 | if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) { | |
1466 | // reallocate and terminate | |
1467 | if(!u_growBufferFromStatic(collationSource->stackWritableBuffer, | |
1468 | &collationSource->writableBuffer, | |
1469 | (int32_t *)&collationSource->writableBufSize, normLen + 1, | |
1470 | 0) | |
1471 | ) { | |
1472 | #ifdef UCOL_DEBUG | |
1473 | fprintf(stderr, "normalizeIterator(), out of memory\n"); | |
1474 | #endif | |
1475 | return; | |
1476 | } | |
1477 | status = U_ZERO_ERROR; | |
1478 | //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO); | |
1479 | collationSource->iterator->setState(collationSource->iterator, iterIndex, &status); | |
374ca955 | 1480 | normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, |
b75a7d8f A |
1481 | (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); |
1482 | } | |
1483 | // Terminate the buffer - we already checked that it is big enough | |
374ca955 | 1484 | collationSource->writableBuffer[normLen] = 0; |
b75a7d8f A |
1485 | if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { |
1486 | collationSource->flags |= UCOL_ITER_ALLOCATED; | |
1487 | } | |
1488 | collationSource->pos = collationSource->writableBuffer; | |
1489 | collationSource->origFlags = collationSource->flags; | |
1490 | collationSource->flags |= UCOL_ITER_INNORMBUF; | |
1491 | collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); | |
1492 | } | |
1493 | ||
1494 | ||
1495 | /* Incremental FCD check and normalize */ | |
1496 | /* Called from getNextCE when normalization state is suspect. */ | |
1497 | /* When entering, the state is known to be this: */ | |
1498 | /* o We are working in the main buffer of the collIterate, not the side */ | |
1499 | /* writable buffer. When in the side buffer, normalization mode is always off, */ | |
1500 | /* so we won't get here. */ | |
1501 | /* o The leading combining class from the current character is 0 or */ | |
1502 | /* the trailing combining class of the previous char was zero. */ | |
1503 | /* True because the previous call to this function will have always exited */ | |
1504 | /* that way, and we get called for every char where cc might be non-zero. */ | |
1505 | static | |
1506 | inline UBool collIterFCD(collIterate *collationSource) { | |
1507 | UChar c, c2; | |
1508 | const UChar *srcP, *endP; | |
1509 | uint8_t leadingCC; | |
1510 | uint8_t prevTrailingCC = 0; | |
1511 | uint16_t fcd; | |
1512 | UBool needNormalize = FALSE; | |
1513 | ||
1514 | srcP = collationSource->pos-1; | |
1515 | ||
1516 | if (collationSource->flags & UCOL_ITER_HASLEN) { | |
1517 | endP = collationSource->endp; | |
1518 | } else { | |
1519 | endP = NULL; | |
1520 | } | |
1521 | ||
1522 | // Get the trailing combining class of the current character. If it's zero, | |
1523 | // we are OK. | |
1524 | c = *srcP++; | |
1525 | /* trie access */ | |
1526 | fcd = unorm_getFCD16(fcdTrieIndex, c); | |
1527 | if (fcd != 0) { | |
1528 | if (UTF_IS_FIRST_SURROGATE(c)) { | |
1529 | if ((endP == NULL || srcP != endP) && UTF_IS_SECOND_SURROGATE(c2=*srcP)) { | |
1530 | ++srcP; | |
1531 | fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2); | |
1532 | } else { | |
1533 | fcd = 0; | |
1534 | } | |
1535 | } | |
1536 | ||
1537 | prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); | |
1538 | ||
1539 | if (prevTrailingCC != 0) { | |
1540 | // The current char has a non-zero trailing CC. Scan forward until we find | |
1541 | // a char with a leading cc of zero. | |
1542 | while (endP == NULL || srcP != endP) | |
1543 | { | |
1544 | const UChar *savedSrcP = srcP; | |
1545 | ||
1546 | c = *srcP++; | |
1547 | /* trie access */ | |
1548 | fcd = unorm_getFCD16(fcdTrieIndex, c); | |
1549 | if (fcd != 0 && UTF_IS_FIRST_SURROGATE(c)) { | |
1550 | if ((endP == NULL || srcP != endP) && UTF_IS_SECOND_SURROGATE(c2=*srcP)) { | |
1551 | ++srcP; | |
1552 | fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2); | |
1553 | } else { | |
1554 | fcd = 0; | |
1555 | } | |
1556 | } | |
1557 | leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); | |
1558 | if (leadingCC == 0) { | |
1559 | srcP = savedSrcP; // Hit char that is not part of combining sequence. | |
1560 | // back up over it. (Could be surrogate pair!) | |
1561 | break; | |
1562 | } | |
1563 | ||
1564 | if (leadingCC < prevTrailingCC) { | |
1565 | needNormalize = TRUE; | |
1566 | } | |
1567 | ||
1568 | prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); | |
1569 | } | |
1570 | } | |
1571 | } | |
1572 | ||
1573 | collationSource->fcdPosition = (UChar *)srcP; | |
1574 | ||
1575 | return needNormalize; | |
1576 | } | |
1577 | ||
1578 | /****************************************************************************/ | |
1579 | /* Following are the CE retrieval functions */ | |
1580 | /* */ | |
1581 | /****************************************************************************/ | |
1582 | ||
1583 | /* there should be a macro version of this function in the header file */ | |
1584 | /* This is the first function that tries to fetch a collation element */ | |
1585 | /* If it's not succesfull or it encounters a more difficult situation */ | |
1586 | /* some more sofisticated and slower functions are invoked */ | |
1587 | static | |
1588 | inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) { | |
1589 | uint32_t order = 0; | |
1590 | if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */ | |
1591 | order = *(collationSource->toReturn++); /* if so, return them */ | |
1592 | if(collationSource->CEpos == collationSource->toReturn) { | |
1593 | collationSource->CEpos = collationSource->toReturn = collationSource->CEs; | |
1594 | } | |
1595 | return order; | |
1596 | } | |
1597 | ||
1598 | UChar ch = 0; | |
1599 | ||
1600 | for (;;) /* Loop handles case when incremental normalize switches */ | |
1601 | { /* to or from the side buffer / original string, and we */ | |
1602 | /* need to start again to get the next character. */ | |
1603 | ||
1604 | if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0) | |
1605 | { | |
1606 | // The source string is null terminated and we're not working from the side buffer, | |
1607 | // and we're not normalizing. This is the fast path. | |
1608 | // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.) | |
1609 | ch = *collationSource->pos++; | |
1610 | if (ch != 0) { | |
1611 | break; | |
1612 | } | |
1613 | else { | |
1614 | return UCOL_NO_MORE_CES; | |
1615 | } | |
1616 | } | |
1617 | ||
1618 | if (collationSource->flags & UCOL_ITER_HASLEN) { | |
1619 | // Normal path for strings when length is specified. | |
1620 | // (We can't be in side buffer because it is always null terminated.) | |
1621 | if (collationSource->pos >= collationSource->endp) { | |
1622 | // Ran off of the end of the main source string. We're done. | |
1623 | return UCOL_NO_MORE_CES; | |
1624 | } | |
1625 | ch = *collationSource->pos++; | |
1626 | } | |
1627 | else if(collationSource->flags & UCOL_USE_ITERATOR) { | |
1628 | UChar32 iterCh = collationSource->iterator->next(collationSource->iterator); | |
1629 | if(iterCh == U_SENTINEL) { | |
1630 | return UCOL_NO_MORE_CES; | |
1631 | } | |
1632 | ch = (UChar)iterCh; | |
1633 | } | |
1634 | else | |
1635 | { | |
1636 | // Null terminated string. | |
1637 | ch = *collationSource->pos++; | |
1638 | if (ch == 0) { | |
1639 | // Ran off end of buffer. | |
1640 | if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { | |
1641 | // Ran off end of main string. backing up one character. | |
1642 | collationSource->pos--; | |
1643 | return UCOL_NO_MORE_CES; | |
1644 | } | |
1645 | else | |
1646 | { | |
1647 | // Hit null in the normalize side buffer. | |
1648 | // Usually this means the end of the normalized data, | |
1649 | // except for one odd case: a null followed by combining chars, | |
1650 | // which is the case if we are at the start of the buffer. | |
1651 | if (collationSource->pos == collationSource->writableBuffer+1) { | |
1652 | break; | |
1653 | } | |
1654 | ||
1655 | // Null marked end of side buffer. | |
1656 | // Revert to the main string and | |
1657 | // loop back to top to try again to get a character. | |
1658 | collationSource->pos = collationSource->fcdPosition; | |
1659 | collationSource->flags = collationSource->origFlags; | |
1660 | continue; | |
1661 | } | |
1662 | } | |
1663 | } | |
1664 | ||
1665 | if(collationSource->flags&UCOL_HIRAGANA_Q) { | |
1666 | if((ch>=0x3040 && ch<=0x3094) || ch == 0x309d || ch == 0x309e) { | |
1667 | collationSource->flags |= UCOL_WAS_HIRAGANA; | |
1668 | } else { | |
1669 | collationSource->flags &= ~UCOL_WAS_HIRAGANA; | |
1670 | } | |
1671 | } | |
1672 | ||
1673 | // We've got a character. See if there's any fcd and/or normalization stuff to do. | |
1674 | // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer. | |
1675 | if ((collationSource->flags & UCOL_ITER_NORM) == 0) { | |
1676 | break; | |
1677 | } | |
1678 | ||
1679 | if (collationSource->fcdPosition >= collationSource->pos) { | |
1680 | // An earlier FCD check has already covered the current character. | |
1681 | // We can go ahead and process this char. | |
1682 | break; | |
1683 | } | |
1684 | ||
1685 | if (ch < ZERO_CC_LIMIT_ ) { | |
1686 | // Fast fcd safe path. Trailing combining class == 0. This char is OK. | |
1687 | break; | |
1688 | } | |
1689 | ||
1690 | if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
1691 | // We need to peek at the next character in order to tell if we are FCD | |
1692 | if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) { | |
1693 | // We are at the last char of source string. | |
1694 | // It is always OK for FCD check. | |
1695 | break; | |
1696 | } | |
1697 | ||
1698 | // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test | |
1699 | if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
1700 | break; | |
1701 | } | |
1702 | } | |
1703 | ||
1704 | ||
1705 | // Need a more complete FCD check and possible normalization. | |
1706 | if (collIterFCD(collationSource)) { | |
1707 | collIterNormalize(collationSource); | |
1708 | } | |
1709 | if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { | |
1710 | // No normalization was needed. Go ahead and process the char we already had. | |
1711 | break; | |
1712 | } | |
1713 | ||
1714 | // Some normalization happened. Next loop iteration will pick up a char | |
1715 | // from the normalization buffer. | |
1716 | ||
1717 | } // end for (;;) | |
1718 | ||
1719 | ||
1720 | if (ch <= 0xFF) { | |
1721 | /* For latin-1 characters we never need to fall back to the UCA table */ | |
1722 | /* because all of the UCA data is replicated in the latinOneMapping array */ | |
1723 | order = coll->latinOneMapping[ch]; | |
1724 | if (order > UCOL_NOT_FOUND) { | |
1725 | order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); | |
1726 | } | |
1727 | } | |
1728 | else | |
1729 | { | |
1730 | order = UTRIE_GET32_FROM_LEAD(coll->mapping, ch); | |
1731 | if(order > UCOL_NOT_FOUND) { /* if a CE is special */ | |
1732 | order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */ | |
1733 | } | |
374ca955 | 1734 | if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */ |
b75a7d8f | 1735 | /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */ |
374ca955 | 1736 | order = UTRIE_GET32_FROM_LEAD(coll->UCA->mapping, ch); |
b75a7d8f A |
1737 | |
1738 | if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */ | |
374ca955 | 1739 | order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status); |
b75a7d8f A |
1740 | } |
1741 | } | |
1742 | } | |
1743 | return order; /* return the CE */ | |
1744 | } | |
1745 | ||
1746 | /* ucol_getNextCE, out-of-line version for use from other files. */ | |
1747 | U_CAPI uint32_t U_EXPORT2 | |
1748 | ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) { | |
1749 | return ucol_IGetNextCE(coll, collationSource, status); | |
374ca955 | 1750 | } |
b75a7d8f A |
1751 | |
1752 | ||
1753 | /** | |
1754 | * Incremental previous normalization happens here. Pick up the range of chars | |
1755 | * identifed by FCD, normalize it into the collIterate's writable buffer, | |
1756 | * switch the collIterate's state to use the writable buffer. | |
1757 | * @param data collation iterator data | |
1758 | */ | |
1759 | static | |
1760 | void collPrevIterNormalize(collIterate *data) | |
1761 | { | |
1762 | UErrorCode status = U_ZERO_ERROR; | |
1763 | UChar *pEnd = data->pos; /* End normalize + 1 */ | |
1764 | UChar *pStart; | |
1765 | uint32_t normLen; | |
1766 | UChar *pStartNorm; | |
1767 | ||
1768 | /* Start normalize */ | |
1769 | if (data->fcdPosition == NULL) { | |
1770 | pStart = data->string; | |
1771 | } | |
1772 | else { | |
1773 | pStart = data->fcdPosition + 1; | |
1774 | } | |
1775 | ||
1776 | normLen = unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, | |
1777 | data->writableBuffer, 0, &status); | |
1778 | ||
1779 | if (data->writableBufSize <= normLen) { | |
1780 | freeHeapWritableBuffer(data); | |
1781 | data->writableBuffer = (UChar *)uprv_malloc((normLen + 1) * | |
1782 | sizeof(UChar)); | |
1783 | if(data->writableBuffer == NULL) { // something is wrong here, return | |
1784 | return; | |
1785 | } | |
1786 | data->flags |= UCOL_ITER_ALLOCATED; | |
1787 | /* to handle the zero termination */ | |
1788 | data->writableBufSize = normLen + 1; | |
1789 | } | |
1790 | status = U_ZERO_ERROR; | |
1791 | /* | |
1792 | this puts the null termination infront of the normalized string instead | |
1793 | of the end | |
1794 | */ | |
1795 | pStartNorm = data->writableBuffer + (data->writableBufSize - normLen); | |
1796 | *(pStartNorm - 1) = 0; | |
1797 | unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm, | |
1798 | normLen, &status); | |
1799 | ||
1800 | data->pos = data->writableBuffer + data->writableBufSize; | |
1801 | data->origFlags = data->flags; | |
1802 | data->flags |= UCOL_ITER_INNORMBUF; | |
1803 | data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); | |
1804 | } | |
1805 | ||
1806 | ||
1807 | /** | |
1808 | * Incremental FCD check for previous iteration and normalize. Called from | |
1809 | * getPrevCE when normalization state is suspect. | |
1810 | * When entering, the state is known to be this: | |
1811 | * o We are working in the main buffer of the collIterate, not the side | |
1812 | * writable buffer. When in the side buffer, normalization mode is always | |
1813 | * off, so we won't get here. | |
1814 | * o The leading combining class from the current character is 0 or the | |
1815 | * trailing combining class of the previous char was zero. | |
1816 | * True because the previous call to this function will have always exited | |
1817 | * that way, and we get called for every char where cc might be non-zero. | |
1818 | * @param data collation iterate struct | |
1819 | * @return normalization status, TRUE for normalization to be done, FALSE | |
1820 | * otherwise | |
1821 | */ | |
1822 | static | |
1823 | inline UBool collPrevIterFCD(collIterate *data) | |
1824 | { | |
1825 | const UChar *src, *start; | |
1826 | UChar c, c2; | |
1827 | uint8_t leadingCC; | |
1828 | uint8_t trailingCC = 0; | |
1829 | uint16_t fcd; | |
1830 | UBool result = FALSE; | |
1831 | ||
1832 | start = data->string; | |
1833 | src = data->pos + 1; | |
1834 | ||
1835 | /* Get the trailing combining class of the current character. */ | |
1836 | c = *--src; | |
1837 | if (!UTF_IS_SURROGATE(c)) { | |
1838 | fcd = unorm_getFCD16(fcdTrieIndex, c); | |
1839 | } else if (UTF_IS_SECOND_SURROGATE(c) && start < src && UTF_IS_FIRST_SURROGATE(c2 = *(src - 1))) { | |
1840 | --src; | |
1841 | fcd = unorm_getFCD16(fcdTrieIndex, c2); | |
1842 | if (fcd != 0) { | |
1843 | fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c); | |
1844 | } | |
1845 | } else /* unpaired surrogate */ { | |
1846 | fcd = 0; | |
1847 | } | |
1848 | ||
1849 | leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); | |
1850 | ||
1851 | if (leadingCC != 0) { | |
1852 | /* | |
1853 | The current char has a non-zero leading combining class. | |
1854 | Scan backward until we find a char with a trailing cc of zero. | |
1855 | */ | |
1856 | for (;;) | |
1857 | { | |
1858 | if (start == src) { | |
1859 | data->fcdPosition = NULL; | |
1860 | return result; | |
1861 | } | |
1862 | ||
1863 | c = *--src; | |
1864 | if (!UTF_IS_SURROGATE(c)) { | |
1865 | fcd = unorm_getFCD16(fcdTrieIndex, c); | |
1866 | } else if (UTF_IS_SECOND_SURROGATE(c) && start < src && UTF_IS_FIRST_SURROGATE(c2 = *(src - 1))) { | |
1867 | --src; | |
1868 | fcd = unorm_getFCD16(fcdTrieIndex, c2); | |
1869 | if (fcd != 0) { | |
1870 | fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c); | |
1871 | } | |
1872 | } else /* unpaired surrogate */ { | |
1873 | fcd = 0; | |
1874 | } | |
1875 | ||
1876 | trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); | |
1877 | ||
1878 | if (trailingCC == 0) { | |
1879 | break; | |
1880 | } | |
1881 | ||
1882 | if (leadingCC < trailingCC) { | |
1883 | result = TRUE; | |
1884 | } | |
1885 | ||
1886 | leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); | |
1887 | } | |
1888 | } | |
1889 | ||
1890 | data->fcdPosition = (UChar *)src; | |
1891 | ||
1892 | return result; | |
1893 | } | |
1894 | ||
1895 | /** gets a character from the string at a given offset | |
1896 | * Handles both normal and iterative cases. | |
1897 | * No error checking - caller beware! | |
1898 | */ | |
374ca955 | 1899 | inline static |
b75a7d8f A |
1900 | UChar peekCharacter(collIterate *source, int32_t offset) { |
1901 | if(source->pos != NULL) { | |
1902 | return *(source->pos + offset); | |
1903 | } else if(source->iterator != NULL) { | |
1904 | if(offset != 0) { | |
1905 | source->iterator->move(source->iterator, offset, UITER_CURRENT); | |
1906 | UChar toReturn = (UChar)source->iterator->next(source->iterator); | |
1907 | source->iterator->move(source->iterator, -offset-1, UITER_CURRENT); | |
1908 | return toReturn; | |
1909 | } else { | |
1910 | return (UChar)source->iterator->current(source->iterator); | |
1911 | } | |
1912 | } else { | |
1913 | return (UChar)U_SENTINEL; | |
1914 | } | |
1915 | } | |
1916 | ||
1917 | /** | |
1918 | * Determines if we are at the start of the data string in the backwards | |
1919 | * collation iterator | |
1920 | * @param data collation iterator | |
1921 | * @return TRUE if we are at the start | |
1922 | */ | |
1923 | static | |
1924 | inline UBool isAtStartPrevIterate(collIterate *data) { | |
1925 | if(data->pos == NULL && data->iterator != NULL) { | |
1926 | return !data->iterator->hasPrevious(data->iterator); | |
1927 | } | |
1928 | //return (collIter_bos(data)) || | |
1929 | return (data->pos == data->string) || | |
1930 | ((data->flags & UCOL_ITER_INNORMBUF) && | |
1931 | *(data->pos - 1) == 0 && data->fcdPosition == NULL); | |
1932 | } | |
1933 | ||
374ca955 A |
1934 | static |
1935 | inline void goBackOne(collIterate *data) { | |
1936 | # if 0 | |
1937 | // somehow, it looks like we need to keep iterator synced up | |
1938 | // at all times, as above. | |
1939 | if(data->pos) { | |
1940 | data->pos--; | |
1941 | } | |
1942 | if(data->iterator) { | |
1943 | data->iterator->previous(data->iterator); | |
1944 | } | |
1945 | #endif | |
1946 | if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) { | |
1947 | data->iterator->previous(data->iterator); | |
1948 | } | |
1949 | if(data->pos) { | |
1950 | data->pos --; | |
1951 | } | |
1952 | } | |
1953 | ||
b75a7d8f A |
1954 | /** |
1955 | * Inline function that gets a simple CE. | |
1956 | * So what it does is that it will first check the expansion buffer. If the | |
1957 | * expansion buffer is not empty, ie the end pointer to the expansion buffer | |
1958 | * is different from the string pointer, we return the collation element at the | |
1959 | * return pointer and decrement it. | |
1960 | * For more complicated CEs it resorts to getComplicatedCE. | |
1961 | * @param coll collator data | |
1962 | * @param data collation iterator struct | |
1963 | * @param status error status | |
1964 | */ | |
1965 | static | |
1966 | inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data, | |
1967 | UErrorCode *status) | |
1968 | { | |
374ca955 | 1969 | uint32_t result = (uint32_t)UCOL_NULLORDER; |
b75a7d8f A |
1970 | if (data->toReturn > data->CEs) { |
1971 | data->toReturn --; | |
1972 | result = *(data->toReturn); | |
1973 | if (data->CEs == data->toReturn) { | |
1974 | data->CEpos = data->toReturn; | |
1975 | } | |
1976 | } | |
1977 | else { | |
1978 | UChar ch = 0; | |
1979 | /* | |
1980 | Loop handles case when incremental normalize switches to or from the | |
1981 | side buffer / original string, and we need to start again to get the | |
1982 | next character. | |
1983 | */ | |
1984 | for (;;) { | |
1985 | if (data->flags & UCOL_ITER_HASLEN) { | |
1986 | /* | |
1987 | Normal path for strings when length is specified. | |
1988 | Not in side buffer because it is always null terminated. | |
1989 | */ | |
1990 | if (data->pos <= data->string) { | |
1991 | /* End of the main source string */ | |
1992 | return UCOL_NO_MORE_CES; | |
1993 | } | |
1994 | data->pos --; | |
1995 | ch = *data->pos; | |
1996 | } | |
1997 | // we are using an iterator to go back. Pray for us! | |
1998 | else if (data->flags & UCOL_USE_ITERATOR) { | |
1999 | UChar32 iterCh = data->iterator->previous(data->iterator); | |
2000 | if(iterCh == U_SENTINEL) { | |
2001 | return UCOL_NO_MORE_CES; | |
2002 | } else { | |
2003 | ch = (UChar)iterCh; | |
2004 | } | |
2005 | } | |
2006 | else { | |
2007 | data->pos --; | |
2008 | ch = *data->pos; | |
2009 | /* we are in the side buffer. */ | |
2010 | if (ch == 0) { | |
2011 | /* | |
2012 | At the start of the normalize side buffer. | |
2013 | Go back to string. | |
2014 | Because pointer points to the last accessed character, | |
2015 | hence we have to increment it by one here. | |
2016 | */ | |
2017 | if (data->fcdPosition == NULL) { | |
2018 | data->pos = data->string; | |
2019 | return UCOL_NO_MORE_CES; | |
2020 | } | |
2021 | else { | |
2022 | data->pos = data->fcdPosition + 1; | |
2023 | } | |
2024 | data->flags = data->origFlags; | |
2025 | continue; | |
2026 | } | |
2027 | } | |
2028 | ||
2029 | if(data->flags&UCOL_HIRAGANA_Q) { | |
2030 | if(ch>=0x3040 && ch<=0x309f) { | |
2031 | data->flags |= UCOL_WAS_HIRAGANA; | |
2032 | } else { | |
2033 | data->flags &= ~UCOL_WAS_HIRAGANA; | |
2034 | } | |
2035 | } | |
374ca955 | 2036 | |
b75a7d8f | 2037 | /* |
374ca955 | 2038 | * got a character to determine if there's fcd and/or normalization |
b75a7d8f A |
2039 | * stuff to do. |
2040 | * if the current character is not fcd. | |
2041 | * if current character is at the start of the string | |
2042 | * Trailing combining class == 0. | |
2043 | * Note if pos is in the writablebuffer, norm is always 0 | |
2044 | */ | |
374ca955 | 2045 | if (ch < ZERO_CC_LIMIT_ || |
b75a7d8f A |
2046 | // this should propel us out of the loop in the iterator case |
2047 | (data->flags & UCOL_ITER_NORM) == 0 || | |
374ca955 | 2048 | (data->fcdPosition != NULL && data->fcdPosition <= data->pos) |
b75a7d8f A |
2049 | || data->string == data->pos) { |
2050 | break; | |
2051 | } | |
2052 | ||
2053 | if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
2054 | /* if next character is FCD */ | |
2055 | if (data->pos == data->string) { | |
2056 | /* First char of string is always OK for FCD check */ | |
2057 | break; | |
2058 | } | |
2059 | ||
2060 | /* Not first char of string, do the FCD fast test */ | |
2061 | if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
2062 | break; | |
2063 | } | |
2064 | } | |
2065 | ||
2066 | /* Need a more complete FCD check and possible normalization. */ | |
2067 | if (collPrevIterFCD(data)) { | |
2068 | collPrevIterNormalize(data); | |
2069 | } | |
2070 | ||
2071 | if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { | |
2072 | /* No normalization. Go ahead and process the char. */ | |
2073 | break; | |
2074 | } | |
2075 | ||
2076 | /* | |
2077 | Some normalization happened. | |
2078 | Next loop picks up a char from the normalization buffer. | |
2079 | */ | |
2080 | } | |
2081 | ||
2082 | /* attempt to handle contractions, after removal of the backwards | |
2083 | contraction | |
2084 | */ | |
2085 | if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) { | |
2086 | result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status); | |
2087 | } | |
2088 | else { | |
b75a7d8f A |
2089 | // TODO: fix me for THAI - I reference *(data->pos-1) |
2090 | if ((data->flags & UCOL_ITER_INNORMBUF) == 0 && | |
2091 | /*UCOL_ISTHAIBASECONSONANT(ch) &&*/ // This is from the old specs - we now rearrange unconditionally | |
374ca955 A |
2092 | // makes sure that we're not at the beggining of the string |
2093 | //data->pos > data->string && | |
2094 | !collIter_bos(data) && | |
b75a7d8f A |
2095 | UCOL_ISTHAIPREVOWEL(peekCharacter(data, -1))) |
2096 | //UCOL_ISTHAIPREVOWEL(*(data->pos -1))) | |
2097 | { | |
374ca955 A |
2098 | collIterateState entryState; |
2099 | backupState(data, &entryState); | |
2100 | // we have to check if the previous character is also Thai | |
2101 | // if not, we can just set the result | |
2102 | goBackOne(data); | |
2103 | if(collIter_bos(data) || !UCOL_ISTHAIPREVOWEL(peekCharacter(data, -1))) { | |
2104 | loadState(data, &entryState, FALSE); | |
2105 | result = UCOL_THAI; | |
2106 | } else { // previous is also reordered | |
2107 | // we need to go back as long as they are being reordered | |
2108 | // count over the range of reorderable characters and see | |
2109 | // if there is an even or odd number of them | |
2110 | // if even, we should not reorder. If odd we should reorder. | |
2111 | int32_t noReordered = 1; // the one we already detected | |
2112 | while(!collIter_bos(data) && UCOL_ISTHAIPREVOWEL(peekCharacter(data, -1))) { | |
2113 | noReordered++; | |
2114 | goBackOne(data); | |
2115 | } | |
2116 | if(noReordered & 1) { // odd number of reorderables | |
2117 | result = UCOL_THAI; | |
2118 | } else { | |
2119 | result = UTRIE_GET32_FROM_LEAD(coll->mapping, ch); | |
2120 | } | |
2121 | loadState(data, &entryState, FALSE); | |
2122 | } | |
b75a7d8f | 2123 | } |
374ca955 A |
2124 | else if (ch <= 0xFF) { |
2125 | result = coll->latinOneMapping[ch]; | |
2126 | //if (result > UCOL_NOT_FOUND) { | |
2127 | //result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status); | |
2128 | //} | |
2129 | } | |
b75a7d8f A |
2130 | else { |
2131 | /*result = ucmpe32_get(coll->mapping, ch);*/ | |
2132 | result = UTRIE_GET32_FROM_LEAD(coll->mapping, ch); | |
2133 | } | |
374ca955 A |
2134 | if (result > UCOL_NOT_FOUND) { |
2135 | result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status); | |
2136 | } | |
b75a7d8f A |
2137 | if (result == UCOL_NOT_FOUND) { |
2138 | if (!isAtStartPrevIterate(data) && | |
2139 | ucol_contractionEndCP(ch, data->coll)) { | |
2140 | result = UCOL_CONTRACTION; | |
2141 | } | |
2142 | else { | |
2143 | /*result = ucmpe32_get(UCA->mapping, ch);*/ | |
374ca955 A |
2144 | if(coll->UCA) { |
2145 | result = UTRIE_GET32_FROM_LEAD(coll->UCA->mapping, ch); | |
2146 | } | |
b75a7d8f A |
2147 | } |
2148 | ||
374ca955 A |
2149 | if (result > UCOL_NOT_FOUND && coll->UCA) { |
2150 | result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status); | |
b75a7d8f A |
2151 | } |
2152 | } | |
2153 | } | |
2154 | } | |
b75a7d8f A |
2155 | return result; |
2156 | } | |
2157 | ||
2158 | ||
2159 | /* ucol_getPrevCE, out-of-line version for use from other files. */ | |
2160 | U_CAPI uint32_t U_EXPORT2 | |
2161 | ucol_getPrevCE(const UCollator *coll, collIterate *data, | |
2162 | UErrorCode *status) { | |
2163 | return ucol_IGetPrevCE(coll, data, status); | |
2164 | } | |
2165 | ||
2166 | ||
2167 | /* this should be connected to special Jamo handling */ | |
2168 | U_CAPI uint32_t U_EXPORT2 | |
2169 | ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) { | |
2170 | collIterate colIt; | |
2171 | uint32_t order; | |
2172 | IInit_collIterate(coll, &u, 1, &colIt); | |
2173 | order = ucol_IGetNextCE(coll, &colIt, status); | |
2174 | /*UCOL_GETNEXTCE(order, coll, colIt, status);*/ | |
2175 | return order; | |
2176 | } | |
2177 | ||
2178 | /** | |
2179 | * Inserts the argument character into the end of the buffer pushing back the | |
2180 | * null terminator. | |
2181 | * @param data collIterate struct data | |
2182 | * @param pNull pointer to the null termination | |
2183 | * @param ch character to be appended | |
2184 | * @return the position of the new addition | |
2185 | */ | |
2186 | static | |
2187 | inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch) | |
2188 | { | |
2189 | uint32_t size = data->writableBufSize; | |
2190 | UChar *newbuffer; | |
2191 | const uint32_t incsize = 5; | |
2192 | ||
2193 | if ((data->writableBuffer + size) > (pNull + 1)) { | |
2194 | *pNull = ch; | |
2195 | *(pNull + 1) = 0; | |
2196 | return pNull; | |
2197 | } | |
2198 | ||
2199 | /* | |
2200 | buffer will always be null terminated at the end. | |
2201 | giving extra space since it is likely that more characters will be added. | |
2202 | */ | |
2203 | size += incsize; | |
2204 | newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size); | |
2205 | if(newbuffer != NULL) { // something wrong, but no status | |
2206 | uprv_memcpy(newbuffer, data->writableBuffer, | |
2207 | data->writableBufSize * sizeof(UChar)); | |
2208 | ||
2209 | freeHeapWritableBuffer(data); | |
2210 | data->writableBufSize = size; | |
2211 | data->writableBuffer = newbuffer; | |
2212 | ||
2213 | newbuffer = newbuffer + data->writableBufSize; | |
2214 | *newbuffer = ch; | |
2215 | *(newbuffer + 1) = 0; | |
2216 | } | |
2217 | return newbuffer; | |
2218 | } | |
2219 | ||
2220 | /** | |
2221 | * Inserts the argument string into the end of the buffer pushing back the | |
2222 | * null terminator. | |
2223 | * @param data collIterate struct data | |
2224 | * @param pNull pointer to the null termination | |
2225 | * @param string to be appended | |
2226 | * @param length of the string to be appended | |
2227 | * @return the position of the new addition | |
2228 | */ | |
2229 | static | |
2230 | inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar *str, | |
2231 | int32_t length) | |
2232 | { | |
2233 | uint32_t size = pNull - data->writableBuffer; | |
2234 | UChar *newbuffer; | |
2235 | ||
2236 | if (data->writableBuffer + data->writableBufSize > pNull + length + 1) { | |
2237 | uprv_memcpy(pNull, str, length * sizeof(UChar)); | |
2238 | *(pNull + length) = 0; | |
2239 | return pNull; | |
2240 | } | |
2241 | ||
2242 | /* | |
2243 | buffer will always be null terminated at the end. | |
2244 | giving extra space since it is likely that more characters will be added. | |
2245 | */ | |
2246 | newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * (size + length + 1)); | |
2247 | if(newbuffer != NULL) { | |
2248 | uprv_memcpy(newbuffer, data->writableBuffer, size * sizeof(UChar)); | |
2249 | uprv_memcpy(newbuffer + size, str, length * sizeof(UChar)); | |
2250 | ||
2251 | freeHeapWritableBuffer(data); | |
2252 | data->writableBufSize = size + length + 1; | |
2253 | data->writableBuffer = newbuffer; | |
2254 | } | |
2255 | ||
2256 | return newbuffer; | |
2257 | } | |
2258 | ||
2259 | /** | |
2260 | * Special normalization function for contraction in the forwards iterator. | |
2261 | * This normalization sequence will place the current character at source->pos | |
2262 | * and its following normalized sequence into the buffer. | |
2263 | * The fcd position, pos will be changed. | |
2264 | * pos will now point to positions in the buffer. | |
2265 | * Flags will be changed accordingly. | |
2266 | * @param data collation iterator data | |
2267 | */ | |
2268 | static | |
2269 | inline void normalizeNextContraction(collIterate *data) | |
2270 | { | |
2271 | UChar *buffer = data->writableBuffer; | |
2272 | uint32_t buffersize = data->writableBufSize; | |
2273 | uint32_t strsize; | |
2274 | UErrorCode status = U_ZERO_ERROR; | |
2275 | /* because the pointer points to the next character */ | |
2276 | UChar *pStart = data->pos - 1; | |
2277 | UChar *pEnd; | |
2278 | uint32_t normLen; | |
2279 | UChar *pStartNorm; | |
2280 | ||
2281 | if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { | |
2282 | *data->writableBuffer = *(pStart - 1); | |
2283 | strsize = 1; | |
2284 | } | |
2285 | else { | |
2286 | strsize = u_strlen(data->writableBuffer); | |
2287 | } | |
2288 | ||
2289 | pEnd = data->fcdPosition; | |
2290 | ||
2291 | normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0, | |
2292 | &status); | |
2293 | ||
2294 | if (buffersize <= normLen + strsize) { | |
2295 | uint32_t size = strsize + normLen + 1; | |
2296 | UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar)); | |
2297 | if(temp != NULL) { | |
2298 | uprv_memcpy(temp, buffer, sizeof(UChar) * strsize); | |
2299 | freeHeapWritableBuffer(data); | |
2300 | data->writableBuffer = temp; | |
2301 | data->writableBufSize = size; | |
2302 | data->flags |= UCOL_ITER_ALLOCATED; | |
2303 | } | |
2304 | } | |
2305 | ||
2306 | status = U_ZERO_ERROR; | |
2307 | pStartNorm = buffer + strsize; | |
2308 | /* null-termination will be added here */ | |
2309 | unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, | |
2310 | normLen + 1, &status); | |
2311 | ||
2312 | data->pos = data->writableBuffer + strsize; | |
2313 | data->origFlags = data->flags; | |
2314 | data->flags |= UCOL_ITER_INNORMBUF; | |
2315 | data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); | |
2316 | } | |
2317 | ||
2318 | /** | |
2319 | * Contraction character management function that returns the next character | |
2320 | * for the forwards iterator. | |
2321 | * Does nothing if the next character is in buffer and not the first character | |
2322 | * in it. | |
2323 | * Else it checks next character in data string to see if it is normalizable. | |
2324 | * If it is not, the character is simply copied into the buffer, else | |
2325 | * the whole normalized substring is copied into the buffer, including the | |
2326 | * current character. | |
2327 | * @param data collation element iterator data | |
2328 | * @return next character | |
2329 | */ | |
2330 | static | |
2331 | inline UChar getNextNormalizedChar(collIterate *data) | |
2332 | { | |
2333 | UChar nextch; | |
2334 | UChar ch; | |
2335 | // Here we need to add the iterator code. One problem is the way | |
2336 | // end of string is handled. If we just return next char, it could | |
2337 | // be the sentinel. Most of the cases already check for this, but we | |
2338 | // need to be sure. | |
2339 | if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) { | |
2340 | /* if no normalization and not in buffer. */ | |
2341 | if(data->flags & UCOL_USE_ITERATOR) { | |
2342 | return (UChar)data->iterator->next(data->iterator); | |
2343 | } else { | |
2344 | return *(data->pos ++); | |
2345 | } | |
2346 | } | |
2347 | ||
2348 | //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) { | |
2349 | //normalizeIterator(data); | |
2350 | //} | |
2351 | ||
2352 | UChar *pEndWritableBuffer = NULL; | |
2353 | UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); | |
2354 | if ((innormbuf && *data->pos != 0) || | |
2355 | (data->fcdPosition != NULL && !innormbuf && | |
2356 | data->pos < data->fcdPosition)) { | |
2357 | /* | |
2358 | if next character is in normalized buffer, no further normalization | |
2359 | is required | |
2360 | */ | |
2361 | return *(data->pos ++); | |
2362 | } | |
2363 | ||
2364 | if (data->flags & UCOL_ITER_HASLEN) { | |
2365 | /* in data string */ | |
2366 | if (data->pos + 1 == data->endp) { | |
2367 | return *(data->pos ++); | |
2368 | } | |
2369 | } | |
2370 | else { | |
2371 | if (innormbuf) { | |
374ca955 A |
2372 | // inside the normalization buffer, but at the end |
2373 | // (since we encountered zero). This means, in the | |
2374 | // case we're using char iterator, that we need to | |
2375 | // do another round of normalization. | |
b75a7d8f A |
2376 | //if(data->origFlags & UCOL_USE_ITERATOR) { |
2377 | // we need to restore original flags, | |
2378 | // otherwise, we'll lose them | |
2379 | //data->flags = data->origFlags; | |
2380 | //normalizeIterator(data); | |
2381 | //return *(data->pos++); | |
2382 | //} else { | |
2383 | /* | |
2384 | in writable buffer, at this point fcdPosition can not be | |
2385 | pointing to the end of the data string. see contracting tag. | |
2386 | */ | |
2387 | if(data->fcdPosition) { | |
2388 | if (*(data->fcdPosition + 1) == 0 || | |
2389 | data->fcdPosition + 1 == data->endp) { | |
2390 | /* at the end of the string, dump it into the normalizer */ | |
2391 | data->pos = insertBufferEnd(data, data->pos, | |
2392 | *(data->fcdPosition)) + 1; | |
2393 | return *(data->fcdPosition ++); | |
2394 | } | |
2395 | pEndWritableBuffer = data->pos; | |
2396 | data->pos = data->fcdPosition; | |
2397 | } else if(data->origFlags & UCOL_USE_ITERATOR) { | |
2398 | // if we are here, we're using a normalizing iterator. | |
2399 | // we should just continue further. | |
2400 | data->flags = data->origFlags; | |
2401 | data->pos = NULL; | |
2402 | return (UChar)data->iterator->next(data->iterator); | |
2403 | } | |
2404 | //} | |
2405 | } | |
2406 | else { | |
2407 | if (*(data->pos + 1) == 0) { | |
2408 | return *(data->pos ++); | |
2409 | } | |
2410 | } | |
2411 | } | |
2412 | ||
2413 | ch = *data->pos ++; | |
2414 | nextch = *data->pos; | |
2415 | ||
2416 | /* | |
2417 | * if the current character is not fcd. | |
2418 | * Trailing combining class == 0. | |
2419 | */ | |
2420 | if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) && | |
2421 | (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ || | |
2422 | ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) { | |
2423 | /* | |
2424 | Need a more complete FCD check and possible normalization. | |
2425 | normalize substring will be appended to buffer | |
2426 | */ | |
2427 | if (collIterFCD(data)) { | |
2428 | normalizeNextContraction(data); | |
2429 | return *(data->pos ++); | |
2430 | } | |
2431 | else if (innormbuf) { | |
2432 | /* fcdposition shifted even when there's no normalization, if we | |
2433 | don't input the rest into this, we'll get the wrong position when | |
2434 | we reach the end of the writableBuffer */ | |
2435 | int32_t length = data->fcdPosition - data->pos + 1; | |
2436 | data->pos = insertBufferEnd(data, pEndWritableBuffer, | |
2437 | data->pos - 1, length); | |
2438 | return *(data->pos ++); | |
2439 | } | |
2440 | } | |
2441 | ||
2442 | if (innormbuf) { | |
2443 | /* | |
2444 | no normalization is to be done hence only one character will be | |
2445 | appended to the buffer. | |
2446 | */ | |
2447 | data->pos = insertBufferEnd(data, pEndWritableBuffer, ch) + 1; | |
2448 | } | |
2449 | ||
2450 | /* points back to the pos in string */ | |
2451 | return ch; | |
2452 | } | |
2453 | ||
b75a7d8f A |
2454 | |
2455 | ||
2456 | /** | |
2457 | * Function to copy the buffer into writableBuffer and sets the fcd position to | |
2458 | * the correct position | |
2459 | * @param source data string source | |
2460 | * @param buffer character buffer | |
2461 | * @param tempdb current position in buffer that has been used up | |
2462 | */ | |
2463 | static | |
2464 | inline void setDiscontiguosAttribute(collIterate *source, UChar *buffer, | |
2465 | UChar *tempdb) | |
2466 | { | |
2467 | /* okay confusing part here. to ensure that the skipped characters are | |
2468 | considered later, we need to place it in the appropriate position in the | |
2469 | normalization buffer and reassign the pos pointer. simple case if pos | |
2470 | reside in string, simply copy to normalization buffer and | |
2471 | fcdposition = pos, pos = start of normalization buffer. if pos in | |
2472 | normalization buffer, we'll insert the copy infront of pos and point pos | |
2473 | to the start of the normalization buffer. why am i doing these copies? | |
2474 | well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does | |
2475 | not require any changes, which be really painful. */ | |
2476 | uint32_t length = u_strlen(buffer);; | |
2477 | if (source->flags & UCOL_ITER_INNORMBUF) { | |
2478 | u_strcpy(tempdb, source->pos); | |
2479 | } | |
2480 | else { | |
2481 | source->fcdPosition = source->pos; | |
2482 | source->origFlags = source->flags; | |
2483 | source->flags |= UCOL_ITER_INNORMBUF; | |
2484 | source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); | |
2485 | } | |
2486 | ||
2487 | if (length >= source->writableBufSize) { | |
2488 | freeHeapWritableBuffer(source); | |
2489 | source->writableBuffer = | |
2490 | (UChar *)uprv_malloc((length + 1) * sizeof(UChar)); | |
2491 | if(source->writableBuffer == NULL) { | |
2492 | return; | |
2493 | } | |
2494 | source->writableBufSize = length; | |
2495 | } | |
2496 | ||
2497 | u_strcpy(source->writableBuffer, buffer); | |
2498 | source->pos = source->writableBuffer; | |
2499 | } | |
2500 | ||
2501 | /** | |
2502 | * Function to get the discontiguos collation element within the source. | |
2503 | * Note this function will set the position to the appropriate places. | |
2504 | * @param coll current collator used | |
2505 | * @param source data string source | |
2506 | * @param constart index to the start character in the contraction table | |
2507 | * @return discontiguos collation element offset | |
2508 | */ | |
2509 | static | |
2510 | uint32_t getDiscontiguous(const UCollator *coll, collIterate *source, | |
2511 | const UChar *constart) | |
2512 | { | |
2513 | /* source->pos currently points to the second combining character after | |
2514 | the start character */ | |
2515 | UChar *temppos = source->pos; | |
2516 | UChar buffer[4*UCOL_MAX_BUFFER]; | |
2517 | UChar *tempdb = buffer; | |
2518 | const UChar *tempconstart = constart; | |
2519 | uint8_t tempflags = source->flags; | |
2520 | UBool multicontraction = FALSE; | |
2521 | UChar *tempbufferpos = 0; | |
2522 | collIterateState discState; | |
2523 | ||
2524 | backupState(source, &discState); | |
2525 | ||
2526 | //*tempdb = *(source->pos - 1); | |
2527 | *tempdb = peekCharacter(source, -1); | |
2528 | tempdb ++; | |
2529 | while (TRUE) { | |
2530 | UChar *UCharOffset; | |
2531 | UChar schar, | |
2532 | tchar; | |
2533 | uint32_t result; | |
2534 | ||
2535 | if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp) | |
2536 | || (peekCharacter(source, 0) == 0 && | |
2537 | //|| (*source->pos == 0 && | |
2538 | ((source->flags & UCOL_ITER_INNORMBUF) == 0 || | |
2539 | source->fcdPosition == NULL || | |
2540 | source->fcdPosition == source->endp || | |
2541 | *(source->fcdPosition) == 0 || | |
2542 | u_getCombiningClass(*(source->fcdPosition)) == 0)) || | |
2543 | /* end of string in null terminated string or stopped by a | |
2544 | null character, note fcd does not always point to a base | |
2545 | character after the discontiguos change */ | |
2546 | u_getCombiningClass(peekCharacter(source, 0)) == 0) { | |
2547 | //u_getCombiningClass(*(source->pos)) == 0) { | |
2548 | //constart = (UChar *)coll->image + getContractOffset(CE); | |
2549 | if (multicontraction) { | |
2550 | *tempbufferpos = 0; | |
2551 | source->pos = temppos - 1; | |
2552 | setDiscontiguosAttribute(source, buffer, tempdb); | |
2553 | return *(coll->contractionCEs + | |
2554 | (tempconstart - coll->contractionIndex)); | |
2555 | } | |
2556 | constart = tempconstart; | |
2557 | break; | |
2558 | } | |
2559 | ||
2560 | UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/ | |
2561 | schar = getNextNormalizedChar(source); | |
2562 | ||
2563 | while (schar > (tchar = *UCharOffset)) { | |
2564 | UCharOffset++; | |
2565 | } | |
2566 | ||
2567 | if (schar != tchar) { | |
2568 | /* not the correct codepoint. we stuff the current codepoint into | |
2569 | the discontiguos buffer and try the next character */ | |
2570 | *tempdb = schar; | |
2571 | tempdb ++; | |
2572 | continue; | |
2573 | } | |
2574 | else { | |
2575 | if (u_getCombiningClass(schar) == | |
2576 | u_getCombiningClass(peekCharacter(source, -2))) { | |
2577 | //u_getCombiningClass(*(source->pos - 2))) { | |
2578 | *tempdb = schar; | |
2579 | tempdb ++; | |
2580 | continue; | |
2581 | } | |
2582 | result = *(coll->contractionCEs + | |
2583 | (UCharOffset - coll->contractionIndex)); | |
2584 | } | |
2585 | *tempdb = 0; | |
2586 | ||
2587 | if (result == UCOL_NOT_FOUND) { | |
2588 | break; | |
2589 | } else if (isContraction(result)) { | |
2590 | /* this is a multi-contraction*/ | |
2591 | tempconstart = (UChar *)coll->image + getContractOffset(result); | |
2592 | if (*(coll->contractionCEs + (constart - coll->contractionIndex)) | |
2593 | != UCOL_NOT_FOUND) { | |
2594 | multicontraction = TRUE; | |
2595 | temppos = source->pos + 1; | |
2596 | tempbufferpos = buffer + u_strlen(buffer); | |
2597 | } | |
2598 | } else { | |
2599 | setDiscontiguosAttribute(source, buffer, tempdb); | |
2600 | return result; | |
2601 | } | |
2602 | } | |
2603 | ||
2604 | /* no problems simply reverting just like that, | |
2605 | if we are in string before getting into this function, points back to | |
2606 | string hence no problem. | |
2607 | if we are in normalization buffer before getting into this function, | |
2608 | since we'll never use another normalization within this function, we | |
2609 | know that fcdposition points to a base character. the normalization buffer | |
2610 | never change, hence this revert works. */ | |
2611 | loadState(source, &discState, TRUE); | |
2612 | goBackOne(source); | |
2613 | ||
2614 | //source->pos = temppos - 1; | |
2615 | source->flags = tempflags; | |
2616 | return *(coll->contractionCEs + (constart - coll->contractionIndex)); | |
2617 | } | |
2618 | ||
2619 | static | |
2620 | inline UBool isNonChar(UChar32 cp) { | |
2621 | if ((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF)) { | |
2622 | return TRUE; | |
2623 | } | |
2624 | return FALSE; | |
2625 | } | |
2626 | ||
2627 | /* now uses Mark's getImplicitPrimary code */ | |
2628 | static | |
2629 | inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) { | |
2630 | if(isNonChar(cp)) { | |
2631 | return 0; | |
2632 | } | |
374ca955 | 2633 | uint32_t r = uprv_uca_getImplicitPrimary(cp); |
b75a7d8f A |
2634 | *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0; |
2635 | return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order' | |
2636 | } | |
2637 | ||
2638 | /** | |
2639 | * Inserts the argument character into the front of the buffer replacing the | |
2640 | * front null terminator. | |
2641 | * @param data collation element iterator data | |
2642 | * @param pNull pointer to the null terminator | |
2643 | * @param ch character to be appended | |
2644 | * @return positon of added character | |
2645 | */ | |
2646 | static | |
2647 | inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch) | |
2648 | { | |
2649 | uint32_t size = data->writableBufSize; | |
2650 | UChar *end; | |
2651 | UChar *newbuffer; | |
2652 | const uint32_t incsize = 5; | |
2653 | ||
2654 | if (pNull > data->writableBuffer + 1) { | |
2655 | *pNull = ch; | |
2656 | *(pNull - 1) = 0; | |
2657 | return pNull; | |
2658 | } | |
2659 | ||
2660 | /* | |
2661 | buffer will always be null terminated infront. | |
2662 | giving extra space since it is likely that more characters will be added. | |
2663 | */ | |
2664 | size += incsize; | |
2665 | newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size); | |
2666 | if(newbuffer == NULL) { | |
2667 | return NULL; | |
2668 | } | |
2669 | end = newbuffer + incsize; | |
2670 | uprv_memcpy(end, data->writableBuffer, | |
2671 | data->writableBufSize * sizeof(UChar)); | |
2672 | *end = ch; | |
2673 | *(end - 1) = 0; | |
2674 | ||
2675 | freeHeapWritableBuffer(data); | |
2676 | ||
2677 | data->writableBufSize = size; | |
2678 | data->writableBuffer = newbuffer; | |
2679 | return end; | |
2680 | } | |
2681 | ||
2682 | /** | |
2683 | * Special normalization function for contraction in the previous iterator. | |
2684 | * This normalization sequence will place the current character at source->pos | |
2685 | * and its following normalized sequence into the buffer. | |
2686 | * The fcd position, pos will be changed. | |
2687 | * pos will now point to positions in the buffer. | |
2688 | * Flags will be changed accordingly. | |
2689 | * @param data collation iterator data | |
2690 | */ | |
2691 | static | |
2692 | inline void normalizePrevContraction(collIterate *data) | |
2693 | { | |
2694 | UChar *buffer = data->writableBuffer; | |
2695 | uint32_t buffersize = data->writableBufSize; | |
2696 | uint32_t nulltermsize; | |
2697 | UErrorCode status = U_ZERO_ERROR; | |
2698 | UChar *pEnd = data->pos + 1; /* End normalize + 1 */ | |
2699 | UChar *pStart; | |
2700 | uint32_t normLen; | |
2701 | UChar *pStartNorm; | |
2702 | ||
2703 | if (data->flags & UCOL_ITER_HASLEN) { | |
2704 | /* | |
2705 | normalization buffer not used yet, we'll pull down the next | |
2706 | character into the end of the buffer | |
2707 | */ | |
2708 | *(buffer + (buffersize - 1)) = *(data->pos + 1); | |
2709 | nulltermsize = buffersize - 1; | |
2710 | } | |
2711 | else { | |
2712 | nulltermsize = buffersize; | |
2713 | UChar *temp = buffer + (nulltermsize - 1); | |
2714 | while (*(temp --) != 0) { | |
2715 | nulltermsize --; | |
2716 | } | |
2717 | } | |
2718 | ||
2719 | /* Start normalize */ | |
2720 | if (data->fcdPosition == NULL) { | |
2721 | pStart = data->string; | |
2722 | } | |
2723 | else { | |
2724 | pStart = data->fcdPosition + 1; | |
2725 | } | |
2726 | ||
2727 | normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0, | |
2728 | &status); | |
2729 | ||
2730 | if (nulltermsize <= normLen) { | |
2731 | uint32_t size = buffersize - nulltermsize + normLen + 1; | |
2732 | UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar)); | |
2733 | if(temp != NULL) { | |
2734 | nulltermsize = normLen + 1; | |
2735 | uprv_memcpy(temp + normLen, buffer, | |
2736 | sizeof(UChar) * (buffersize - nulltermsize)); | |
2737 | freeHeapWritableBuffer(data); | |
2738 | data->writableBuffer = temp; | |
2739 | data->writableBufSize = size; | |
2740 | } | |
2741 | } | |
2742 | ||
2743 | status = U_ZERO_ERROR; | |
2744 | /* | |
2745 | this puts the null termination infront of the normalized string instead | |
2746 | of the end | |
2747 | */ | |
2748 | pStartNorm = buffer + (nulltermsize - normLen); | |
2749 | *(pStartNorm - 1) = 0; | |
2750 | unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen, | |
2751 | &status); | |
2752 | ||
2753 | data->pos = data->writableBuffer + nulltermsize; | |
2754 | data->origFlags = data->flags; | |
2755 | data->flags |= UCOL_ITER_INNORMBUF; | |
2756 | data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); | |
2757 | } | |
2758 | ||
2759 | /** | |
2760 | * Contraction character management function that returns the previous character | |
2761 | * for the backwards iterator. | |
2762 | * Does nothing if the previous character is in buffer and not the first | |
2763 | * character in it. | |
2764 | * Else it checks previous character in data string to see if it is | |
2765 | * normalizable. | |
2766 | * If it is not, the character is simply copied into the buffer, else | |
2767 | * the whole normalized substring is copied into the buffer, including the | |
2768 | * current character. | |
2769 | * @param data collation element iterator data | |
2770 | * @return previous character | |
2771 | */ | |
2772 | static | |
2773 | inline UChar getPrevNormalizedChar(collIterate *data) | |
2774 | { | |
2775 | UChar prevch; | |
2776 | UChar ch; | |
2777 | UChar *start; | |
2778 | UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); | |
2779 | UChar *pNull = NULL; | |
2780 | if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 || | |
2781 | (innormbuf && *(data->pos - 1) != 0)) { | |
2782 | /* | |
2783 | if no normalization. | |
2784 | if previous character is in normalized buffer, no further normalization | |
2785 | is required | |
2786 | */ | |
2787 | if(data->flags & UCOL_USE_ITERATOR) { | |
2788 | data->iterator->move(data->iterator, -1, UITER_CURRENT); | |
2789 | return (UChar)data->iterator->next(data->iterator); | |
2790 | } else { | |
2791 | return *(data->pos - 1); | |
2792 | } | |
2793 | } | |
2794 | ||
2795 | start = data->pos; | |
2796 | if (data->flags & UCOL_ITER_HASLEN) { | |
2797 | /* in data string */ | |
2798 | if ((start - 1) == data->string) { | |
2799 | return *(start - 1); | |
2800 | } | |
2801 | start --; | |
2802 | ch = *start; | |
2803 | prevch = *(start - 1); | |
2804 | } | |
2805 | else { | |
2806 | /* | |
2807 | in writable buffer, at this point fcdPosition can not be NULL. | |
2808 | see contracting tag. | |
2809 | */ | |
2810 | if (data->fcdPosition == data->string) { | |
2811 | /* at the start of the string, just dump it into the normalizer */ | |
2812 | insertBufferFront(data, data->pos - 1, *(data->fcdPosition)); | |
2813 | data->fcdPosition = NULL; | |
2814 | return *(data->pos - 1); | |
2815 | } | |
2816 | pNull = data->pos - 1; | |
2817 | start = data->fcdPosition; | |
2818 | ch = *start; | |
2819 | prevch = *(start - 1); | |
2820 | } | |
2821 | /* | |
2822 | * if the current character is not fcd. | |
2823 | * Trailing combining class == 0. | |
2824 | */ | |
2825 | if (data->fcdPosition > start && | |
2826 | (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_)) | |
2827 | { | |
2828 | /* | |
2829 | Need a more complete FCD check and possible normalization. | |
2830 | normalize substring will be appended to buffer | |
2831 | */ | |
2832 | UChar *backuppos = data->pos; | |
2833 | data->pos = start; | |
2834 | if (collPrevIterFCD(data)) { | |
2835 | normalizePrevContraction(data); | |
2836 | return *(data->pos - 1); | |
2837 | } | |
2838 | data->pos = backuppos; | |
2839 | data->fcdPosition ++; | |
2840 | } | |
2841 | ||
2842 | if (innormbuf) { | |
2843 | /* | |
2844 | no normalization is to be done hence only one character will be | |
2845 | appended to the buffer. | |
2846 | */ | |
2847 | insertBufferFront(data, pNull, ch); | |
2848 | data->fcdPosition --; | |
2849 | } | |
2850 | ||
2851 | return ch; | |
2852 | } | |
2853 | ||
2854 | /* This function handles the special CEs like contractions, expansions, surrogates, Thai */ | |
2855 | /* It is called by getNextCE */ | |
2856 | ||
2857 | uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) { | |
2858 | collIterateState entryState; | |
2859 | backupState(source, &entryState); | |
2860 | UChar32 cp = ch; | |
2861 | ||
2862 | for (;;) { | |
2863 | // This loop will repeat only in the case of contractions, and only when a contraction | |
2864 | // is found and the first CE resulting from that contraction is itself a special | |
2865 | // (an expansion, for example.) All other special CE types are fully handled the | |
2866 | // first time through, and the loop exits. | |
2867 | ||
2868 | const uint32_t *CEOffset = NULL; | |
2869 | switch(getCETag(CE)) { | |
2870 | case NOT_FOUND_TAG: | |
2871 | /* This one is not found, and we'll let somebody else bother about it... no more games */ | |
2872 | return CE; | |
2873 | case SURROGATE_TAG: | |
2874 | /* we encountered a leading surrogate. We shall get the CE by using the following code unit */ | |
2875 | /* two things can happen here: next code point can be a trailing surrogate - we will use it */ | |
2876 | /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */ | |
2877 | /* we return 0 (completely ignorable - per UCA specification */ | |
2878 | { | |
2879 | UChar trail; | |
2880 | collIterateState state; | |
2881 | backupState(source, &state); | |
2882 | if (collIter_eos(source) || !(UTF16_IS_TRAIL((trail = getNextNormalizedChar(source))))) { | |
374ca955 | 2883 | // we chould have stepped one char forward and it might have turned that it |
b75a7d8f A |
2884 | // was not a trail surrogate. In that case, we have to backup. |
2885 | loadState(source, &state, TRUE); | |
2886 | return 0; | |
2887 | } else { | |
2888 | /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */ | |
2889 | CE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, CE&0xFFFFFF, trail); | |
2890 | if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one. | |
2891 | // We need to backup | |
2892 | loadState(source, &state, TRUE); | |
2893 | return CE; | |
374ca955 | 2894 | } |
b75a7d8f A |
2895 | // calculate the supplementary code point value, if surrogate was not tailored |
2896 | cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); | |
2897 | } | |
2898 | } | |
2899 | break; | |
2900 | case THAI_TAG: | |
2901 | /* Thai/Lao reordering */ | |
2902 | if (((source->flags) & UCOL_ITER_INNORMBUF) /* Already Swapped || */ | |
374ca955 | 2903 | || collIter_eos(source)) /* At end of string. No swap possible */ |
b75a7d8f A |
2904 | { |
2905 | // Treat Thai as a length one expansion */ | |
2906 | CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ | |
2907 | CE = *CEOffset++; | |
2908 | } | |
2909 | else | |
2910 | { | |
374ca955 A |
2911 | // Move the prevowel and the following base Consonant into the normalization buffer |
2912 | // with their order swapped | |
2913 | // Note: this operation might activate the normalization buffer. We have to check for | |
2914 | // that and act accordingly. | |
2915 | UChar thCh = getNextNormalizedChar(source); | |
2916 | UChar32 cp = 0; | |
2917 | if(U16_IS_LEAD(thCh)) { | |
2918 | if(!collIter_eos(source)) { | |
2919 | collIterateState thaiState; | |
2920 | backupState(source, &thaiState); | |
2921 | UChar trailCh = getNextNormalizedChar(source); | |
2922 | if(U16_IS_TRAIL(trailCh)) { | |
2923 | cp = U16_GET_SUPPLEMENTARY(thCh, trailCh); | |
2924 | } else { | |
2925 | loadState(source, &thaiState, TRUE); | |
2926 | cp = (UChar32)thCh; | |
2927 | } | |
2928 | } else { | |
2929 | cp = (UChar32)thCh; | |
2930 | } | |
2931 | } else { | |
2932 | cp = (UChar32)thCh; | |
2933 | } | |
2934 | // Now we have the character that needs to be decomposed | |
2935 | // if the normalizing buffer was not used, we can just use our structure and be happy. | |
2936 | if((source->flags & UCOL_ITER_INNORMBUF) == 0) { | |
2937 | // decompose into writable buffer | |
2938 | int32_t decompLen = unorm_getDecomposition(cp, FALSE, &(source->writableBuffer[1]), UCOL_WRITABLE_BUFFER_SIZE-1); | |
2939 | if(decompLen < 0) { | |
2940 | decompLen = -decompLen; | |
2941 | } | |
2942 | // reorder Thai and the character after it | |
2943 | if(decompLen >= 2 && U16_IS_LEAD(source->writableBuffer[1]) && U16_IS_TRAIL(source->writableBuffer[2])) { | |
2944 | source->writableBuffer[0] = source->writableBuffer[1]; | |
2945 | source->writableBuffer[1] = source->writableBuffer[2]; | |
2946 | source->writableBuffer[2] = ch; | |
2947 | } else { | |
2948 | source->writableBuffer[0] = source->writableBuffer[1]; | |
2949 | source->writableBuffer[1] = ch; | |
2950 | } | |
2951 | // zero terminate, since normalization buffer is always zero terminated | |
2952 | source->writableBuffer[decompLen+1] = 0; // we added the prevowel | |
b75a7d8f | 2953 | if(source->pos) { |
374ca955 | 2954 | source->fcdPosition = source->pos; // Indicate where to continue in main input string |
b75a7d8f | 2955 | // after exhausting the writableBuffer |
b75a7d8f | 2956 | } |
374ca955 | 2957 | source->pos = source->writableBuffer; |
b75a7d8f A |
2958 | source->origFlags = source->flags; |
2959 | source->flags |= UCOL_ITER_INNORMBUF; | |
2960 | source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); | |
374ca955 A |
2961 | } |
2962 | else { | |
2963 | // stuff is already normalized... what to do here??? | |
2964 | ||
2965 | // if we are in the normalization buffer, thCh must be in it | |
2966 | // prove by contradiction | |
2967 | // if thCh is not in the normalization buffer, | |
2968 | // that means that trailCh is the normalization buffer | |
2969 | // that means that trailCh is a trail surrogate by the above | |
2970 | // bounding if block, this is a contradiction because there | |
2971 | // are no characters at the moment that decomposes to an | |
2972 | // unmatched surrogate. qed. | |
2973 | if (cp >= 0x10000) { | |
2974 | source->writableBuffer[0] = source->writableBuffer[1]; | |
2975 | source->writableBuffer[1] = source->writableBuffer[2]; | |
2976 | source->writableBuffer[2] = ch; | |
2977 | } | |
2978 | else { | |
2979 | source->writableBuffer[0] = source->writableBuffer[1]; | |
2980 | source->writableBuffer[1] = ch; | |
2981 | } | |
2982 | source->pos = source->writableBuffer; | |
2983 | } | |
2984 | CE = UCOL_IGNORABLE; | |
b75a7d8f A |
2985 | } |
2986 | break; | |
2987 | case SPEC_PROC_TAG: | |
2988 | { | |
2989 | // Special processing is getting a CE that is preceded by a certain prefix | |
2990 | // Currently this is only needed for optimizing Japanese length and iteration marks. | |
374ca955 A |
2991 | // When we encouter a special processing tag, we go backwards and try to see if |
2992 | // we have a match. | |
b75a7d8f A |
2993 | // Contraction tables are used - so the whole process is not unlike contraction. |
2994 | // prefix data is stored backwards in the table. | |
2995 | const UChar *UCharOffset; | |
2996 | UChar schar, tchar; | |
2997 | collIterateState prefixState; | |
2998 | backupState(source, &prefixState); | |
2999 | loadState(source, &entryState, TRUE); | |
3000 | goBackOne(source); // We want to look at the point where we entered - actually one | |
3001 | // before that... | |
3002 | ||
3003 | for(;;) { | |
3004 | // This loop will run once per source string character, for as long as we | |
374ca955 | 3005 | // are matching a potential contraction sequence |
b75a7d8f | 3006 | |
374ca955 | 3007 | // First we position ourselves at the begining of contraction sequence |
b75a7d8f A |
3008 | const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); |
3009 | if (collIter_bos(source)) { | |
3010 | CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); | |
3011 | break; | |
3012 | } | |
3013 | schar = getPrevNormalizedChar(source); | |
3014 | goBackOne(source); | |
3015 | ||
3016 | while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ | |
3017 | UCharOffset++; | |
3018 | } | |
3019 | ||
3020 | if (schar == tchar) { | |
3021 | // Found the source string char in the table. | |
3022 | // Pick up the corresponding CE from the table. | |
3023 | CE = *(coll->contractionCEs + | |
3024 | (UCharOffset - coll->contractionIndex)); | |
3025 | } | |
3026 | else | |
3027 | { | |
374ca955 | 3028 | // if there is a completely ignorable code point in the middle of |
b75a7d8f A |
3029 | // a prefix, we need to act as if it's not there |
3030 | // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero) | |
3031 | // lone surrogates cannot be set to zero as it would break other processing | |
3032 | uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar); | |
3033 | // it's easy for BMP code points | |
3034 | if(isZeroCE == 0) { | |
3035 | continue; | |
3036 | } else if(UTF_IS_TRAIL(schar) || UTF_IS_LEAD(schar)) { | |
3037 | // for supplementary code points, we have to check the next one | |
3038 | // situations where we are going to ignore | |
3039 | // 1. beginning of the string: schar is a lone surrogate | |
3040 | // 2. schar is a lone surrogate | |
3041 | // 3. schar is a trail surrogate in a valid surrogate sequence | |
3042 | // that is explicitly set to zero. | |
3043 | if (!collIter_bos(source)) { | |
3044 | UChar lead; | |
3045 | if(UTF_IS_LEAD(lead = getPrevNormalizedChar(source))) { | |
3046 | isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, lead); | |
3047 | if(getCETag(isZeroCE) == SURROGATE_TAG) { | |
3048 | uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, schar); | |
3049 | if(finalCE == 0) { | |
3050 | // this is a real, assigned completely ignorable code point | |
3051 | goBackOne(source); | |
3052 | continue; | |
3053 | } | |
3054 | } | |
3055 | } else { | |
3056 | // lone surrogate, completely ignorable | |
3057 | continue; | |
3058 | } | |
3059 | } else { | |
3060 | // lone surrogate at the beggining, completely ignorable | |
3061 | continue; | |
3062 | } | |
3063 | } | |
3064 | // Source string char was not in the table. | |
3065 | // We have not found the prefix. | |
3066 | CE = *(coll->contractionCEs + | |
3067 | (ContractionStart - coll->contractionIndex)); | |
3068 | } | |
3069 | ||
3070 | if(!isPrefix(CE)) { | |
3071 | // The source string char was in the contraction table, and the corresponding | |
3072 | // CE is not a prefix CE. We found the prefix, break | |
3073 | // out of loop, this CE will end up being returned. This is the normal | |
3074 | // way out of prefix handling when the source actually contained | |
3075 | // the prefix. | |
3076 | break; | |
3077 | } | |
3078 | } | |
3079 | if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue | |
3080 | loadState(source, &prefixState, TRUE); | |
3081 | if(source->origFlags & UCOL_USE_ITERATOR) { | |
3082 | source->flags = source->origFlags; | |
3083 | } | |
3084 | } else { // prefix search was a failure, we have to backup all the way to the start | |
3085 | loadState(source, &entryState, TRUE); | |
3086 | } | |
3087 | break; | |
3088 | } | |
3089 | case CONTRACTION_TAG: | |
3090 | { | |
3091 | /* This should handle contractions */ | |
3092 | collIterateState state; | |
3093 | backupState(source, &state); | |
3094 | uint32_t firstCE = UCOL_NOT_FOUND; | |
3095 | const UChar *UCharOffset; | |
3096 | UChar schar, tchar; | |
3097 | ||
3098 | for (;;) { | |
3099 | /* This loop will run once per source string character, for as long as we */ | |
3100 | /* are matching a potential contraction sequence */ | |
3101 | ||
3102 | /* First we position ourselves at the begining of contraction sequence */ | |
3103 | const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); | |
3104 | ||
3105 | if (collIter_eos(source)) { | |
3106 | // Ran off the end of the source string. | |
3107 | CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); | |
3108 | // So we'll pick whatever we have at the point... | |
3109 | if (CE == UCOL_NOT_FOUND) { | |
3110 | // back up the source over all the chars we scanned going into this contraction. | |
374ca955 | 3111 | CE = firstCE; |
b75a7d8f A |
3112 | loadState(source, &state, TRUE); |
3113 | if(source->origFlags & UCOL_USE_ITERATOR) { | |
374ca955 | 3114 | source->flags = source->origFlags; |
b75a7d8f A |
3115 | } |
3116 | } | |
3117 | break; | |
3118 | } | |
3119 | ||
3120 | uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */ | |
3121 | uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8); | |
3122 | ||
3123 | schar = getNextNormalizedChar(source); | |
3124 | while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ | |
3125 | UCharOffset++; | |
3126 | } | |
3127 | ||
3128 | if (schar == tchar) { | |
3129 | // Found the source string char in the contraction table. | |
3130 | // Pick up the corresponding CE from the table. | |
3131 | CE = *(coll->contractionCEs + | |
3132 | (UCharOffset - coll->contractionIndex)); | |
3133 | } | |
3134 | else | |
3135 | { | |
374ca955 | 3136 | // if there is a completely ignorable code point in the middle of |
b75a7d8f A |
3137 | // contraction, we need to act as if it's not there |
3138 | uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar); | |
3139 | // it's easy for BMP code points | |
3140 | if(isZeroCE == 0) { | |
374ca955 | 3141 | continue; |
b75a7d8f A |
3142 | } else if(UTF_IS_LEAD(schar)) { |
3143 | if(!collIter_eos(source)) { | |
3144 | backupState(source, &state); | |
3145 | UChar trail = getNextNormalizedChar(source); | |
3146 | if(UTF_IS_TRAIL(trail)) { // do stuff with trail | |
3147 | if(getCETag(isZeroCE) == SURROGATE_TAG) { | |
3148 | uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, trail); | |
3149 | if(finalCE == 0) { | |
3150 | continue; | |
3151 | } | |
3152 | } | |
3153 | } else { | |
3154 | // broken surrogate sequence, thus completely ignorable | |
3155 | loadState(source, &state, TRUE); | |
3156 | continue; | |
3157 | } | |
3158 | loadState(source, &state, TRUE); | |
374ca955 | 3159 | } else { // no more characters, so broken surrogate pair... |
b75a7d8f | 3160 | // this contraction will ultimately fail, but not because of us |
374ca955 | 3161 | continue; |
b75a7d8f A |
3162 | } |
3163 | } // else if(UTF_IS_LEAD(schar)) | |
3164 | ||
3165 | // Source string char was not in contraction table. | |
3166 | // Unless we have a discontiguous contraction, we have finished | |
3167 | // with this contraction. | |
3168 | uint8_t sCC; | |
374ca955 | 3169 | if (schar < 0x300 || |
b75a7d8f A |
3170 | maxCC == 0 || |
3171 | (sCC = i_getCombiningClass(schar, coll)) == 0 || | |
374ca955 | 3172 | sCC>maxCC || |
b75a7d8f A |
3173 | (allSame != 0 && sCC == maxCC) || |
3174 | collIter_eos(source)) { | |
374ca955 A |
3175 | // Contraction can not be discontiguous. |
3176 | goBackOne(source); // back up the source string by one, | |
b75a7d8f A |
3177 | // because the character we just looked at was |
3178 | // not part of the contraction. */ | |
3179 | CE = *(coll->contractionCEs + | |
3180 | (ContractionStart - coll->contractionIndex)); | |
3181 | } else { | |
3182 | // | |
3183 | // Contraction is possibly discontiguous. | |
3184 | // Scan more of source string looking for a match | |
3185 | // | |
3186 | UChar tempchar; | |
3187 | /* find the next character if schar is not a base character | |
3188 | and we are not yet at the end of the string */ | |
3189 | tempchar = getNextNormalizedChar(source); | |
3190 | goBackOne(source); | |
3191 | if (i_getCombiningClass(tempchar, coll) == 0) { | |
3192 | goBackOne(source); | |
3193 | /* Spit out the last char of the string, wasn't tasty enough */ | |
3194 | CE = *(coll->contractionCEs + | |
3195 | (ContractionStart - coll->contractionIndex)); | |
3196 | } else { | |
3197 | CE = getDiscontiguous(coll, source, ContractionStart); | |
3198 | } | |
3199 | } | |
3200 | } // else after if(schar == tchar) | |
3201 | ||
3202 | if(CE == UCOL_NOT_FOUND) { | |
3203 | /* The Source string did not match the contraction that we were checking. */ | |
3204 | /* Back up the source position to undo the effects of having partially */ | |
3205 | /* scanned through what ultimately proved to not be a contraction. */ | |
3206 | loadState(source, &state, TRUE); | |
3207 | CE = firstCE; | |
b75a7d8f A |
3208 | break; |
3209 | } | |
374ca955 | 3210 | |
b75a7d8f A |
3211 | if(!isContraction(CE)) { |
3212 | // The source string char was in the contraction table, and the corresponding | |
3213 | // CE is not a contraction CE. We completed the contraction, break | |
3214 | // out of loop, this CE will end up being returned. This is the normal | |
3215 | // way out of contraction handling when the source actually contained | |
3216 | // the contraction. | |
3217 | break; | |
3218 | } | |
374ca955 | 3219 | |
b75a7d8f A |
3220 | |
3221 | // The source string char was in the contraction table, and the corresponding | |
3222 | // CE is IS a contraction CE. We will continue looping to check the source | |
3223 | // string for the remaining chars in the contraction. | |
3224 | uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex)); | |
3225 | if(tempCE != UCOL_NOT_FOUND) { | |
3226 | // We have scanned a a section of source string for which there is a | |
374ca955 | 3227 | // CE from the contraction table. Remember the CE and scan position, so |
b75a7d8f A |
3228 | // that we can return to this point if further scanning fails to |
3229 | // match a longer contraction sequence. | |
3230 | firstCE = tempCE; | |
3231 | ||
3232 | goBackOne(source); | |
3233 | backupState(source, &state); | |
3234 | getNextNormalizedChar(source); | |
3235 | ||
3236 | // Another way to do this is: | |
3237 | //collIterateState tempState; | |
3238 | //backupState(source, &tempState); | |
3239 | //goBackOne(source); | |
3240 | //backupState(source, &state); | |
3241 | //loadState(source, &tempState, TRUE); | |
3242 | ||
3243 | // The problem is that for incomplete contractions we have to remember the previous | |
374ca955 | 3244 | // position. Before, the only thing I needed to do was state.pos--; |
b75a7d8f | 3245 | // After iterator introduction and especially after introduction of normalizing |
374ca955 | 3246 | // iterators, it became much more difficult to decrease the saved state. |
b75a7d8f A |
3247 | // I'm not yet sure which of the two methods above is faster. |
3248 | } | |
3249 | } // for(;;) | |
3250 | break; | |
3251 | } // case CONTRACTION_TAG: | |
3252 | case LONG_PRIMARY_TAG: | |
3253 | { | |
3254 | *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; | |
3255 | CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; | |
3256 | return CE; | |
3257 | } | |
3258 | case EXPANSION_TAG: | |
3259 | { | |
3260 | /* This should handle expansion. */ | |
3261 | /* NOTE: we can encounter both continuations and expansions in an expansion! */ | |
3262 | /* I have to decide where continuations are going to be dealt with */ | |
3263 | uint32_t size; | |
3264 | uint32_t i; /* general counter */ | |
3265 | CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ | |
3266 | size = getExpansionCount(CE); | |
3267 | CE = *CEOffset++; | |
3268 | if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ | |
3269 | for(i = 1; i<size; i++) { | |
3270 | *(source->CEpos++) = *CEOffset++; | |
3271 | } | |
374ca955 A |
3272 | } else { /* else, we do */ |
3273 | while(*CEOffset != 0) { | |
3274 | *(source->CEpos++) = *CEOffset++; | |
3275 | } | |
3276 | } | |
3277 | return CE; | |
3278 | } | |
3279 | case DIGIT_TAG: | |
3280 | { | |
3281 | /* | |
3282 | We do a check to see if we want to collate digits as numbers; if so we generate | |
3283 | a custom collation key. Otherwise we pull out the value stored in the expansion table. | |
3284 | */ | |
3285 | uint32_t size; | |
3286 | uint32_t i; /* general counter */ | |
3287 | collIterateState digitState; | |
3288 | ||
3289 | if (source->coll->numericCollation == UCOL_ON){ | |
3290 | UChar32 char32 = 0; | |
3291 | ||
3292 | uint32_t digIndx = 0; | |
3293 | uint32_t endIndex = 0; | |
3294 | uint32_t trailingZeroIndex = 0; | |
3295 | ||
3296 | uint32_t primWeight = 0; | |
3297 | ||
3298 | int32_t digVal = 0; | |
3299 | uint8_t collateVal = 0; | |
3300 | ||
3301 | UBool nonZeroValReached = FALSE; | |
3302 | ||
3303 | uint8_t *numTempBuf; | |
3304 | uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs. | |
3305 | uint32_t numTempBufSize = UCOL_MAX_BUFFER; | |
3306 | ||
3307 | numTempBuf = stackNumTempBuf; | |
3308 | /* | |
3309 | We parse the source string until we hit a char that's NOT a digit. | |
3310 | Use this u_charDigitValue. This might be slow because we have to | |
3311 | handle surrogates... | |
3312 | */ | |
3313 | /* | |
3314 | if (U16_IS_LEAD(ch)){ | |
3315 | if (!collIter_eos(source)) { | |
3316 | backupState(source, &digitState); | |
3317 | UChar trail = getNextNormalizedChar(source); | |
3318 | if(U16_IS_TRAIL(trail)) { | |
3319 | char32 = U16_GET_SUPPLEMENTARY(ch, trail); | |
3320 | } else { | |
3321 | loadState(source, &digitState, TRUE); | |
3322 | char32 = ch; | |
3323 | } | |
3324 | } else { | |
3325 | char32 = ch; | |
3326 | } | |
3327 | } else { | |
3328 | char32 = ch; | |
3329 | } | |
3330 | digVal = u_charDigitValue(char32); | |
3331 | */ | |
3332 | digVal = u_charDigitValue(cp); // if we have arrived here, we have | |
3333 | // already processed possible supplementaries that trigered the digit tag - | |
3334 | // all supplementaries are marked in the UCA. | |
3335 | /* | |
3336 | We pad a zero in front of the first element anyways. This takes | |
3337 | care of the (probably) most common case where people are sorting things followed | |
3338 | by a single digit | |
3339 | */ | |
3340 | digIndx++; | |
3341 | for(;;){ | |
3342 | // Make sure we have enough space. | |
3343 | if (digIndx >= ((numTempBufSize - 2) * 2) + 1) | |
3344 | { | |
3345 | numTempBufSize *= 2; | |
3346 | if (numTempBuf == stackNumTempBuf){ | |
3347 | numTempBuf = (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize); | |
3348 | uprv_memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER); | |
3349 | }else | |
3350 | uprv_realloc(numTempBuf, numTempBufSize); | |
3351 | } | |
3352 | ||
3353 | // Skipping over leading zeroes. | |
3354 | if (digVal != 0 || nonZeroValReached){ | |
3355 | if (digVal != 0 && !nonZeroValReached) | |
3356 | nonZeroValReached = TRUE; | |
3357 | ||
3358 | /* | |
3359 | We parse the digit string into base 100 numbers (this fits into a byte). | |
3360 | We only add to the buffer in twos, thus if we are parsing an odd character, | |
3361 | that serves as the 'tens' digit while the if we are parsing an even one, that | |
3362 | is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into | |
3363 | a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid | |
3364 | overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less | |
3365 | than all the other bytes. | |
3366 | */ | |
3367 | ||
3368 | if (digIndx % 2 == 1){ | |
3369 | collateVal += (uint8_t)digVal; | |
3370 | ||
3371 | // We don't enter the low-order-digit case unless we've already seen | |
3372 | // the high order, or for the first digit, which is always non-zero. | |
3373 | if (collateVal != 0) | |
3374 | trailingZeroIndex = 0; | |
3375 | ||
3376 | numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; | |
3377 | collateVal = 0; | |
3378 | } | |
3379 | else{ | |
3380 | // We drop the collation value into the buffer so if we need to do | |
3381 | // a "front patch" we don't have to check to see if we're hitting the | |
3382 | // last element. | |
3383 | collateVal = (uint8_t)(digVal * 10); | |
3384 | ||
3385 | // Check for trailing zeroes. | |
3386 | if (collateVal == 0) | |
3387 | { | |
3388 | if (!trailingZeroIndex) | |
3389 | trailingZeroIndex = (digIndx/2) + 2; | |
3390 | } | |
3391 | else | |
3392 | trailingZeroIndex = 0; | |
3393 | ||
3394 | numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; | |
3395 | } | |
3396 | digIndx++; | |
3397 | } | |
3398 | ||
3399 | // Get next character. | |
3400 | if (!collIter_eos(source)){ | |
3401 | ch = getNextNormalizedChar(source); | |
3402 | if (U16_IS_LEAD(ch)){ | |
3403 | if (!collIter_eos(source)) { | |
3404 | backupState(source, &digitState); | |
3405 | UChar trail = getNextNormalizedChar(source); | |
3406 | if(U16_IS_TRAIL(trail)) { | |
3407 | char32 = U16_GET_SUPPLEMENTARY(ch, trail); | |
3408 | } else { | |
3409 | loadState(source, &digitState, TRUE); | |
3410 | char32 = ch; | |
3411 | } | |
3412 | } | |
3413 | } else { | |
3414 | char32 = ch; | |
3415 | } | |
3416 | ||
3417 | if ((digVal = u_charDigitValue(char32)) == -1){ | |
3418 | // Resetting position to point to the next unprocessed char. We | |
3419 | // overshot it when doing our test/set for numbers. | |
3420 | if (char32 > 0xFFFF) { // For surrogates. | |
3421 | loadState(source, &digitState, TRUE); | |
3422 | //goBackOne(source); | |
3423 | } | |
3424 | goBackOne(source); | |
3425 | break; | |
3426 | } | |
3427 | } else { | |
3428 | break; | |
3429 | } | |
3430 | } | |
3431 | ||
3432 | if (nonZeroValReached == FALSE){ | |
3433 | digIndx = 2; | |
3434 | numTempBuf[2] = 6; | |
b75a7d8f | 3435 | } |
374ca955 A |
3436 | |
3437 | endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ; | |
3438 | if (digIndx % 2 != 0){ | |
3439 | /* | |
3440 | We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what | |
3441 | we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward. | |
3442 | Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a | |
3443 | single pass and optimizes for strings with single digits. I'm just assuming that's the more common case. | |
3444 | */ | |
3445 | ||
3446 | for(i = 2; i < endIndex; i++){ | |
3447 | numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) + | |
3448 | (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6; | |
3449 | } | |
3450 | --digIndx; | |
3451 | } | |
3452 | ||
3453 | // Subtract one off of the last byte. | |
3454 | numTempBuf[endIndex-1] -= 1; | |
3455 | ||
3456 | /* | |
3457 | We want to skip over the first two slots in the buffer. The first slot | |
3458 | is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the | |
3459 | sign/exponent byte: 0x80 + (decimalPos/2) & 7f. | |
3460 | */ | |
3461 | numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; | |
3462 | numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F)); | |
3463 | ||
3464 | // Now transfer the collation key to our collIterate struct. | |
3465 | // The total size for our collation key is endIndx bumped up to the next largest even value divided by two. | |
3466 | size = ((endIndex+1) & ~1)/2; | |
3467 | CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight | |
3468 | (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight | |
3469 | UCOL_BYTE_COMMON; // Tertiary weight. | |
3470 | i = 2; // Reset the index into the buffer. | |
3471 | while(i < endIndex) | |
3472 | { | |
3473 | primWeight = numTempBuf[i++] << 8; | |
3474 | if ( i < endIndex) | |
3475 | primWeight |= numTempBuf[i++]; | |
3476 | *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; | |
3477 | } | |
3478 | ||
3479 | if (numTempBuf != stackNumTempBuf) | |
3480 | uprv_free(numTempBuf); | |
3481 | } else { | |
3482 | // no numeric mode, we'll just switch to whatever we stashed and continue | |
3483 | CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ | |
3484 | CE = *CEOffset++; | |
3485 | break; | |
3486 | #if 0 | |
3487 | CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ | |
3488 | size = getExpansionCount(CE); | |
3489 | CE = *CEOffset++; | |
3490 | if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ | |
3491 | for(i = 1; i<size; i++) { | |
3492 | *(source->CEpos++) = *CEOffset++; | |
3493 | } | |
3494 | } else { /* else, we do */ | |
3495 | while(*CEOffset != 0) { | |
3496 | *(source->CEpos++) = *CEOffset++; | |
3497 | } | |
3498 | } | |
3499 | #endif | |
b75a7d8f A |
3500 | } |
3501 | return CE; | |
3502 | } | |
b75a7d8f A |
3503 | /* various implicits optimization */ |
3504 | // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit | |
3505 | case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ | |
3506 | //return getImplicit(cp, source, 0x04000000); | |
3507 | return getImplicit(cp, source); | |
3508 | case IMPLICIT_TAG: /* everything that is not defined otherwise */ | |
3509 | /* UCA is filled with these. Tailorings are NOT_FOUND */ | |
3510 | //return getImplicit(cp, source, 0); | |
3511 | return getImplicit(cp, source); | |
3512 | case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ | |
3513 | return 0; /* broken surrogate sequence */ | |
3514 | case LEAD_SURROGATE_TAG: /* D800-DBFF*/ | |
3515 | UChar nextChar; | |
3516 | if( source->flags & UCOL_USE_ITERATOR) { | |
3517 | if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) { | |
3518 | cp = U16_GET_SUPPLEMENTARY(ch, nextChar); | |
3519 | source->iterator->next(source->iterator); | |
3520 | return getImplicit(cp, source); | |
3521 | } else { | |
3522 | return 0; | |
3523 | } | |
3524 | } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) && | |
3525 | U_IS_TRAIL((nextChar=*source->pos))) { | |
3526 | cp = U16_GET_SUPPLEMENTARY(ch, nextChar); | |
3527 | source->pos++; | |
3528 | return getImplicit(cp, source); | |
3529 | } else { | |
3530 | return 0; /* completely ignorable */ | |
3531 | } | |
3532 | case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ | |
3533 | { | |
3534 | const uint32_t | |
3535 | SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; | |
3536 | //const uint32_t LCount = 19; | |
374ca955 | 3537 | const uint32_t VCount = 21; |
b75a7d8f A |
3538 | const uint32_t TCount = 28; |
3539 | //const uint32_t NCount = VCount * TCount; // 588 | |
3540 | //const uint32_t SCount = LCount * NCount; // 11172 | |
3541 | uint32_t L = ch - SBase; | |
3542 | ||
3543 | // divide into pieces | |
3544 | ||
3545 | uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation | |
3546 | L /= TCount; | |
3547 | uint32_t V = L % VCount; | |
3548 | L /= VCount; | |
3549 | ||
3550 | // offset them | |
3551 | ||
3552 | L += LBase; | |
3553 | V += VBase; | |
3554 | T += TBase; | |
3555 | ||
3556 | // return the first CE, but first put the rest into the expansion buffer | |
3557 | if (!source->coll->image->jamoSpecial) { // FAST PATH | |
3558 | ||
3559 | /**(source->CEpos++) = ucmpe32_get(UCA->mapping, V);*/ | |
3560 | /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/ | |
3561 | *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, V); | |
3562 | if (T != TBase) { | |
3563 | /**(source->CEpos++) = ucmpe32_get(UCA->mapping, T);*/ | |
3564 | /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/ | |
3565 | *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, T); | |
3566 | } | |
3567 | ||
3568 | /*return ucmpe32_get(UCA->mapping, L);*/ // return first one | |
3569 | /*return UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/ | |
3570 | return UTRIE_GET32_FROM_LEAD(coll->mapping, L); | |
3571 | ||
3572 | } else { // Jamo is Special | |
374ca955 | 3573 | // Since Hanguls pass the FCD check, it is |
b75a7d8f A |
3574 | // guaranteed that we won't be in |
3575 | // the normalization buffer if something like this happens | |
3576 | // However, if we are using a uchar iterator and normalization | |
3577 | // is ON, the Hangul that lead us here is going to be in that | |
374ca955 | 3578 | // normalization buffer. Here we want to restore the uchar |
b75a7d8f A |
3579 | // iterator state and pull out of the normalization buffer |
3580 | if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) { | |
3581 | source->flags = source->origFlags; // restore the iterator | |
3582 | source->pos = NULL; | |
3583 | } | |
3584 | // Move Jamos into normalization buffer | |
3585 | source->writableBuffer[0] = (UChar)L; | |
3586 | source->writableBuffer[1] = (UChar)V; | |
3587 | if (T != TBase) { | |
3588 | source->writableBuffer[2] = (UChar)T; | |
3589 | source->writableBuffer[3] = 0; | |
3590 | } else { | |
3591 | source->writableBuffer[2] = 0; | |
3592 | } | |
3593 | ||
3594 | source->fcdPosition = source->pos; // Indicate where to continue in main input string | |
3595 | // after exhausting the writableBuffer | |
3596 | source->pos = source->writableBuffer; | |
3597 | source->origFlags = source->flags; | |
3598 | source->flags |= UCOL_ITER_INNORMBUF; | |
3599 | source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); | |
3600 | ||
3601 | return(UCOL_IGNORABLE); | |
3602 | } | |
3603 | } | |
3604 | case CHARSET_TAG: | |
3605 | /* not yet implemented */ | |
3606 | /* probably after 1.8 */ | |
3607 | return UCOL_NOT_FOUND; | |
3608 | default: | |
3609 | *status = U_INTERNAL_PROGRAM_ERROR; | |
3610 | CE=0; | |
3611 | break; | |
3612 | } | |
3613 | if (CE <= UCOL_NOT_FOUND) break; | |
3614 | } | |
3615 | return CE; | |
3616 | } | |
3617 | ||
3618 | ||
3619 | /* now uses Mark's getImplicitPrimary code */ | |
3620 | static | |
3621 | inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) { | |
3622 | if(isNonChar(cp)) { | |
3623 | return 0; | |
3624 | } | |
3625 | ||
374ca955 | 3626 | uint32_t r = uprv_uca_getImplicitPrimary(cp); |
b75a7d8f A |
3627 | |
3628 | *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505; | |
3629 | collationSource->toReturn = collationSource->CEpos; | |
3630 | return ((r & 0x0000FFFF)<<16) | 0x000000C0; | |
3631 | } | |
3632 | ||
3633 | /** | |
3634 | * This function handles the special CEs like contractions, expansions, | |
3635 | * surrogates, Thai. | |
3636 | * It is called by getPrevCE. | |
3637 | */ | |
3638 | uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, | |
3639 | collIterate *source, | |
3640 | UErrorCode *status) | |
3641 | { | |
3642 | const uint32_t *CEOffset = NULL; | |
3643 | UChar *UCharOffset = NULL; | |
3644 | UChar schar; | |
3645 | const UChar *constart = NULL; | |
3646 | uint32_t size; | |
3647 | UChar buffer[UCOL_MAX_BUFFER]; | |
3648 | uint32_t *endCEBuffer; | |
3649 | UChar *strbuffer; | |
3650 | int32_t noChars = 0; | |
3651 | ||
3652 | for(;;) | |
3653 | { | |
3654 | /* the only ces that loops are thai and contractions */ | |
3655 | switch (getCETag(CE)) | |
3656 | { | |
3657 | case NOT_FOUND_TAG: /* this tag always returns */ | |
3658 | return CE; | |
3659 | case SURROGATE_TAG: /* This is a surrogate pair */ | |
3660 | /* essentialy an engaged lead surrogate. */ | |
3661 | /* if you have encountered it here, it means that a */ | |
3662 | /* broken sequence was encountered and this is an error */ | |
3663 | return 0; | |
3664 | case THAI_TAG: | |
3665 | if ((source->flags & UCOL_ITER_INNORMBUF) || /* Already Swapped || */ | |
3666 | source->string == source->pos || /* At start of string.|| */ | |
3667 | /* previous char not Thai prevowel */ | |
3668 | /*UCOL_ISTHAIBASECONSONANT(*(source->pos)) == FALSE ||*/ // This is from the old specs - we now rearrange unconditionally | |
3669 | UCOL_ISTHAIPREVOWEL(peekCharacter(source, -1)) == FALSE) | |
3670 | //UCOL_ISTHAIPREVOWEL(*(source->pos - 1)) == FALSE) | |
3671 | { | |
3672 | /* Treat Thai as a length one expansion */ | |
3673 | /* find the offset to expansion table */ | |
3674 | CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); | |
3675 | CE = *CEOffset ++; | |
3676 | } | |
3677 | else | |
3678 | { | |
3679 | /* | |
3680 | Move the prevowel and the following base Consonant into the | |
3681 | normalization buffer with their order swapped | |
3682 | */ | |
374ca955 A |
3683 | UChar32 cp = (UChar32)peekCharacter(source, 0); |
3684 | UBool reorder = TRUE; | |
3685 | ||
3686 | int32_t decompLen = unorm_getDecomposition(cp, FALSE, source->writableBuffer, UCOL_WRITABLE_BUFFER_SIZE-1); | |
3687 | if(decompLen < 0) { | |
3688 | decompLen = -decompLen; // there was no decomposition | |
3689 | } else { // we need to check if we will hit a contraction trigger because of decomposition | |
3690 | int32_t i = decompLen; | |
3691 | for(i = 0; i < decompLen; i++) { | |
3692 | if(ucol_contractionEndCP(source->writableBuffer[i], coll)) { | |
3693 | reorder = FALSE; | |
3694 | } | |
3695 | } | |
3696 | } | |
3697 | ||
3698 | UChar *tempbuffer = source->writableBuffer + | |
3699 | (source->writableBufSize - 1); | |
3700 | uprv_memcpy(tempbuffer-decompLen + 1, source->writableBuffer, sizeof(UChar)*decompLen); | |
3701 | if(reorder) { | |
3702 | *(tempbuffer - decompLen) = *(tempbuffer - decompLen + 1); | |
3703 | *(tempbuffer - decompLen + 1) = peekCharacter(source, -1); | |
3704 | } else { | |
3705 | *(tempbuffer - decompLen) = peekCharacter(source, -1); | |
3706 | } | |
3707 | *(tempbuffer - decompLen - 1) = 0; | |
3708 | ||
3709 | ||
3710 | /* | |
b75a7d8f A |
3711 | UChar *tempbuffer = source->writableBuffer + |
3712 | (source->writableBufSize - 1); | |
3713 | *(tempbuffer - 2) = 0; | |
3714 | *(tempbuffer - 1) = peekCharacter(source, 0); | |
3715 | *(tempbuffer) = peekCharacter(source, -1); | |
374ca955 | 3716 | */ |
b75a7d8f A |
3717 | /* |
3718 | Indicate where to continue in main input string after exhausting | |
3719 | the writableBuffer | |
3720 | */ | |
3721 | if (source->pos - 1 == source->string) { | |
3722 | source->fcdPosition = NULL; | |
3723 | } else { | |
3724 | source->fcdPosition = source->pos-2; | |
3725 | } | |
3726 | ||
374ca955 | 3727 | source->pos = tempbuffer+1; // we're doing predecrement, right? |
b75a7d8f A |
3728 | source->origFlags = source->flags; |
3729 | source->flags |= UCOL_ITER_INNORMBUF; | |
3730 | source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); | |
3731 | ||
3732 | //CE = UCOL_IGNORABLE; | |
3733 | return(UCOL_IGNORABLE); | |
3734 | } | |
3735 | break; | |
3736 | case SPEC_PROC_TAG: | |
3737 | { | |
3738 | // Special processing is getting a CE that is preceded by a certain prefix | |
3739 | // Currently this is only needed for optimizing Japanese length and iteration marks. | |
374ca955 A |
3740 | // When we encouter a special processing tag, we go backwards and try to see if |
3741 | // we have a match. | |
b75a7d8f A |
3742 | // Contraction tables are used - so the whole process is not unlike contraction. |
3743 | // prefix data is stored backwards in the table. | |
3744 | const UChar *UCharOffset; | |
3745 | UChar schar, tchar; | |
3746 | collIterateState prefixState; | |
3747 | backupState(source, &prefixState); | |
3748 | for(;;) { | |
3749 | // This loop will run once per source string character, for as long as we | |
374ca955 | 3750 | // are matching a potential contraction sequence |
b75a7d8f | 3751 | |
374ca955 | 3752 | // First we position ourselves at the begining of contraction sequence |
b75a7d8f A |
3753 | const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); |
3754 | ||
3755 | if (collIter_bos(source)) { | |
3756 | CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); | |
3757 | break; | |
3758 | } | |
3759 | schar = getPrevNormalizedChar(source); | |
3760 | goBackOne(source); | |
3761 | ||
3762 | while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ | |
3763 | UCharOffset++; | |
3764 | } | |
3765 | ||
3766 | if (schar == tchar) { | |
3767 | // Found the source string char in the table. | |
3768 | // Pick up the corresponding CE from the table. | |
3769 | CE = *(coll->contractionCEs + | |
3770 | (UCharOffset - coll->contractionIndex)); | |
3771 | } | |
3772 | else | |
374ca955 A |
3773 | { |
3774 | // if there is a completely ignorable code point in the middle of | |
b75a7d8f A |
3775 | // a prefix, we need to act as if it's not there |
3776 | // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero) | |
3777 | // lone surrogates cannot be set to zero as it would break other processing | |
3778 | uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar); | |
3779 | // it's easy for BMP code points | |
3780 | if(isZeroCE == 0) { | |
3781 | continue; | |
3782 | } else if(UTF_IS_TRAIL(schar) || UTF_IS_LEAD(schar)) { | |
3783 | // for supplementary code points, we have to check the next one | |
3784 | // situations where we are going to ignore | |
3785 | // 1. beginning of the string: schar is a lone surrogate | |
3786 | // 2. schar is a lone surrogate | |
3787 | // 3. schar is a trail surrogate in a valid surrogate sequence | |
3788 | // that is explicitly set to zero. | |
3789 | if (!collIter_bos(source)) { | |
3790 | UChar lead; | |
3791 | if(UTF_IS_LEAD(lead = getPrevNormalizedChar(source))) { | |
3792 | isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, lead); | |
3793 | if(getCETag(isZeroCE) == SURROGATE_TAG) { | |
3794 | uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, schar); | |
3795 | if(finalCE == 0) { | |
3796 | // this is a real, assigned completely ignorable code point | |
3797 | goBackOne(source); | |
3798 | continue; | |
3799 | } | |
3800 | } | |
3801 | } else { | |
3802 | // lone surrogate, completely ignorable | |
3803 | continue; | |
3804 | } | |
3805 | } else { | |
3806 | // lone surrogate at the beggining, completely ignorable | |
3807 | continue; | |
3808 | } | |
3809 | } | |
3810 | // Source string char was not in the table. | |
3811 | // We have not found the prefix. | |
3812 | CE = *(coll->contractionCEs + | |
3813 | (ContractionStart - coll->contractionIndex)); | |
3814 | } | |
3815 | ||
3816 | if(!isPrefix(CE)) { | |
3817 | // The source string char was in the contraction table, and the corresponding | |
3818 | // CE is not a prefix CE. We found the prefix, break | |
3819 | // out of loop, this CE will end up being returned. This is the normal | |
3820 | // way out of prefix handling when the source actually contained | |
3821 | // the prefix. | |
3822 | break; | |
3823 | } | |
3824 | } | |
3825 | loadState(source, &prefixState, TRUE); | |
3826 | break; | |
3827 | } | |
3828 | ||
3829 | case CONTRACTION_TAG: | |
3830 | /* to ensure that the backwards and forwards iteration matches, we | |
3831 | take the current region of most possible match and pass it through | |
3832 | the forward iteration. this will ensure that the obstinate problem of | |
3833 | overlapping contractions will not occur. | |
3834 | */ | |
3835 | schar = peekCharacter(source, 0); | |
3836 | constart = (UChar *)coll->image + getContractOffset(CE); | |
3837 | if (isAtStartPrevIterate(source) | |
3838 | /* commented away contraction end checks after adding the checks | |
3839 | in getPrevCE */) { | |
3840 | /* start of string or this is not the end of any contraction */ | |
3841 | CE = *(coll->contractionCEs + | |
3842 | (constart - coll->contractionIndex)); | |
3843 | break; | |
3844 | } | |
3845 | strbuffer = buffer; | |
3846 | UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1); | |
3847 | *(UCharOffset --) = 0; | |
3848 | noChars = 0; | |
3849 | // have to swap thai characters | |
374ca955 A |
3850 | while (ucol_unsafeCP(schar, coll) || UCOL_ISTHAIPREVOWEL(peekCharacter(source, -1))) { |
3851 | // we might have ended here after trying to reorder Thai, but seeing that there are unsafe points | |
3852 | // in the backward processing | |
b75a7d8f A |
3853 | *(UCharOffset) = schar; |
3854 | noChars++; | |
3855 | UCharOffset --; | |
3856 | schar = getPrevNormalizedChar(source); | |
3857 | goBackOne(source); | |
3858 | // TODO: when we exhaust the contraction buffer, | |
3859 | // it needs to get reallocated. The problem is | |
3860 | // that the size depends on the string which is | |
3861 | // not iterated over. However, since we're travelling | |
3862 | // backwards, we already had to set the iterator at | |
3863 | // the end - so we might as well know where we are? | |
3864 | if (UCharOffset + 1 == buffer) { | |
3865 | /* we have exhausted the buffer */ | |
3866 | int32_t newsize = 0; | |
3867 | if(source->pos) { // actually dealing with a position | |
3868 | newsize = source->pos - source->string + 1; | |
3869 | } else { // iterator | |
3870 | newsize = 4 * UCOL_MAX_BUFFER; | |
3871 | } | |
3872 | strbuffer = (UChar *)uprv_malloc(sizeof(UChar) * | |
3873 | (newsize + UCOL_MAX_BUFFER)); | |
374ca955 A |
3874 | /* test for NULL */ |
3875 | if (strbuffer == NULL) { | |
3876 | *status = U_MEMORY_ALLOCATION_ERROR; | |
3877 | return UCOL_NO_MORE_CES; | |
3878 | } | |
b75a7d8f A |
3879 | UCharOffset = strbuffer + newsize; |
3880 | uprv_memcpy(UCharOffset, buffer, | |
3881 | UCOL_MAX_BUFFER * sizeof(UChar)); | |
3882 | UCharOffset --; | |
3883 | } | |
3884 | if ((source->pos && (source->pos == source->string || | |
3885 | ((source->flags & UCOL_ITER_INNORMBUF) && | |
3886 | *(source->pos - 1) == 0 && source->fcdPosition == NULL))) | |
3887 | || (source->iterator && !source->iterator->hasPrevious(source->iterator))) { | |
3888 | break; | |
3889 | } | |
3890 | } | |
3891 | /* adds the initial base character to the string */ | |
3892 | *(UCharOffset) = schar; | |
3893 | noChars++; | |
3894 | ||
374ca955 | 3895 | /* a new collIterate is used to simplify things, since using the current |
b75a7d8f A |
3896 | collIterate will mean that the forward and backwards iteration will |
3897 | share and change the same buffers. we don't want to get into that. */ | |
3898 | collIterate temp; | |
3899 | //IInit_collIterate(coll, UCharOffset, -1, &temp); | |
3900 | IInit_collIterate(coll, UCharOffset, noChars, &temp); | |
3901 | temp.flags &= ~UCOL_ITER_NORM; | |
3902 | ||
3903 | CE = ucol_IGetNextCE(coll, &temp, status); | |
3904 | endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE; | |
3905 | while (CE != UCOL_NO_MORE_CES) { | |
3906 | *(source->CEpos ++) = CE; | |
3907 | if (source->CEpos == endCEBuffer) { | |
3908 | /* ran out of CE space, bail. | |
3909 | there's no guarantee of the right character position after | |
3910 | this bail*/ | |
3911 | *status = U_BUFFER_OVERFLOW_ERROR; | |
3912 | source->CEpos = source->CEs; | |
3913 | freeHeapWritableBuffer(&temp); | |
3914 | if (strbuffer != buffer) { | |
3915 | uprv_free(strbuffer); | |
3916 | } | |
374ca955 | 3917 | return (uint32_t)UCOL_NULLORDER; |
b75a7d8f A |
3918 | } |
3919 | CE = ucol_IGetNextCE(coll, &temp, status); | |
3920 | } | |
3921 | freeHeapWritableBuffer(&temp); | |
3922 | if (strbuffer != buffer) { | |
3923 | uprv_free(strbuffer); | |
3924 | } | |
3925 | source->toReturn = source->CEpos - 1; | |
3926 | if (source->toReturn == source->CEs) { | |
3927 | source->CEpos = source->CEs; | |
3928 | } | |
3929 | return *(source->toReturn); | |
3930 | case LONG_PRIMARY_TAG: | |
3931 | { | |
3932 | *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; | |
3933 | *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; | |
3934 | source->toReturn = source->CEpos - 1; | |
3935 | return *(source->toReturn); | |
3936 | } | |
3937 | case EXPANSION_TAG: /* this tag always returns */ | |
3938 | /* | |
3939 | This should handle expansion. | |
3940 | NOTE: we can encounter both continuations and expansions in an expansion! | |
3941 | I have to decide where continuations are going to be dealt with | |
3942 | */ | |
3943 | /* find the offset to expansion table */ | |
3944 | CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); | |
3945 | size = getExpansionCount(CE); | |
3946 | if (size != 0) { | |
3947 | /* | |
3948 | if there are less than 16 elements in expansion, we don't terminate | |
3949 | */ | |
3950 | uint32_t count; | |
3951 | for (count = 0; count < size; count++) { | |
3952 | *(source->CEpos ++) = *CEOffset++; | |
3953 | } | |
3954 | } | |
3955 | else { | |
3956 | /* else, we do */ | |
3957 | while (*CEOffset != 0) { | |
3958 | *(source->CEpos ++) = *CEOffset ++; | |
3959 | } | |
3960 | } | |
3961 | source->toReturn = source->CEpos - 1; | |
374ca955 | 3962 | // in case of one element expansion, we |
b75a7d8f A |
3963 | // want to immediately return CEpos |
3964 | if(source->toReturn == source->CEs) { | |
3965 | source->CEpos = source->CEs; | |
3966 | } | |
3967 | return *(source->toReturn); | |
374ca955 | 3968 | case DIGIT_TAG: |
b75a7d8f | 3969 | { |
374ca955 A |
3970 | /* |
3971 | We do a check to see if we want to collate digits as numbers; if so we generate | |
b75a7d8f A |
3972 | a custom collation key. Otherwise we pull out the value stored in the expansion table. |
3973 | */ | |
374ca955 | 3974 | //uint32_t size; |
b75a7d8f | 3975 | uint32_t i; /* general counter */ |
374ca955 A |
3976 | collIterateState state; |
3977 | ||
3978 | if (source->coll->numericCollation == UCOL_ON){ | |
3979 | UChar32 char32 = 0; | |
3980 | ||
3981 | uint32_t digIndx = 0; | |
3982 | uint32_t endIndex = 0; | |
3983 | uint32_t leadingZeroIndex = 0; | |
3984 | uint32_t trailingZeroCount = 0; | |
3985 | ||
3986 | uint32_t primWeight = 0; | |
3987 | ||
3988 | int32_t digVal = 0; | |
3989 | uint8_t collateVal = 0; | |
3990 | ||
3991 | UBool nonZeroValReached = FALSE; | |
3992 | ||
3993 | uint8_t *numTempBuf; | |
3994 | uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs. | |
3995 | uint32_t numTempBufSize = UCOL_MAX_BUFFER; | |
3996 | ||
3997 | numTempBuf = stackNumTempBuf; | |
3998 | /* | |
3999 | We parse the source string until we hit a char that's NOT a digit. | |
4000 | Use this u_charDigitValue. This might be slow because we have to | |
4001 | handle surrogates... | |
4002 | */ | |
4003 | ||
4004 | if (U16_IS_TRAIL (ch)){ | |
4005 | if (!collIter_bos(source)){ | |
4006 | UChar lead = getPrevNormalizedChar(source); | |
4007 | if(U16_IS_LEAD(lead)) { | |
4008 | char32 = U16_GET_SUPPLEMENTARY(lead,ch); | |
4009 | goBackOne(source); | |
4010 | } else { | |
4011 | char32 = ch; | |
4012 | } | |
4013 | } else { | |
4014 | char32 = ch; | |
4015 | } | |
4016 | } else { | |
4017 | char32 = ch; | |
4018 | } | |
4019 | digVal = u_charDigitValue(char32); | |
4020 | ||
4021 | for(;;){ | |
4022 | // Make sure we have enough space. | |
4023 | if (digIndx >= ((numTempBufSize - 2) * 2) + 1) | |
4024 | { | |
4025 | numTempBufSize *= 2; | |
4026 | if (numTempBuf == stackNumTempBuf){ | |
4027 | numTempBuf = (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize); | |
4028 | uprv_memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER); | |
4029 | }else | |
4030 | uprv_realloc(numTempBuf, numTempBufSize); | |
4031 | } | |
4032 | ||
4033 | // Skip over trailing zeroes, and keep a count of them. | |
4034 | if (digVal != 0) | |
4035 | nonZeroValReached = TRUE; | |
4036 | if (nonZeroValReached){ | |
4037 | /* | |
4038 | We parse the digit string into base 100 numbers (this fits into a byte). | |
4039 | We only add to the buffer in twos, thus if we are parsing an odd character, | |
4040 | that serves as the 'tens' digit while the if we are parsing an even one, that | |
4041 | is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into | |
4042 | a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid | |
4043 | overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less | |
4044 | than all the other bytes. | |
4045 | ||
4046 | Since we're doing in this reverse we want to put the first digit encountered into the | |
4047 | ones place and the second digit encountered into the tens place. | |
4048 | */ | |
4049 | ||
4050 | if ((digIndx + trailingZeroCount) % 2 == 1){ | |
4051 | // High-order digit case (tens place) | |
4052 | collateVal += (uint8_t)(digVal * 10); | |
4053 | ||
4054 | // We cannot set leadingZeroIndex unless it has been set for the | |
4055 | // low-order digit. Therefore, all we can do for the high-order | |
4056 | // digit is turn it off, never on. | |
4057 | // The only time we will have a high digit without a low is for | |
4058 | // the very first non-zero digit, so no zero check is necessary. | |
4059 | if (collateVal != 0) | |
4060 | leadingZeroIndex = 0; | |
4061 | ||
4062 | numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; | |
4063 | collateVal = 0; | |
4064 | } | |
4065 | else{ | |
4066 | // Low-order digit case (ones place) | |
4067 | collateVal = (uint8_t)digVal; | |
4068 | ||
4069 | // Check for leading zeroes. | |
4070 | if (collateVal == 0) | |
4071 | { | |
4072 | if (!leadingZeroIndex) | |
4073 | leadingZeroIndex = (digIndx/2) + 2; | |
4074 | } | |
4075 | else | |
4076 | leadingZeroIndex = 0; | |
4077 | ||
4078 | // No need to write to buffer; the case of a last odd digit | |
4079 | // is handled below. | |
4080 | } | |
4081 | ++digIndx; | |
4082 | } | |
4083 | else | |
4084 | ++trailingZeroCount; | |
4085 | ||
4086 | if (!collIter_bos(source)){ | |
4087 | ch = getPrevNormalizedChar(source); | |
4088 | //goBackOne(source); | |
4089 | if (U16_IS_TRAIL(ch)){ | |
4090 | backupState(source, &state); | |
4091 | if (!collIter_bos(source)) | |
4092 | { | |
4093 | goBackOne(source); | |
4094 | UChar lead = getPrevNormalizedChar(source); | |
4095 | if(U16_IS_LEAD(lead)) { | |
4096 | char32 = U16_GET_SUPPLEMENTARY(lead,ch); | |
4097 | } else { | |
4098 | loadState(source, &state, FALSE); | |
4099 | char32 = ch; | |
4100 | } | |
4101 | } | |
4102 | } | |
4103 | else | |
4104 | char32 = ch; | |
4105 | ||
4106 | if ((digVal = u_charDigitValue(char32)) == -1){ | |
4107 | if (char32 > 0xFFFF) {// For surrogates. | |
4108 | loadState(source, &state, FALSE); | |
4109 | } | |
4110 | // Don't need to "reverse" the goBackOne call, | |
4111 | // as this points to the next position to process.. | |
4112 | //if (char32 > 0xFFFF) // For surrogates. | |
4113 | //getNextNormalizedChar(source); | |
4114 | break; | |
4115 | } | |
4116 | goBackOne(source); | |
4117 | }else | |
4118 | break; | |
4119 | } | |
4120 | ||
4121 | if (nonZeroValReached == FALSE){ | |
4122 | digIndx = 2; | |
4123 | trailingZeroCount = 0; | |
4124 | numTempBuf[2] = 6; | |
4125 | } | |
4126 | ||
4127 | if ((digIndx + trailingZeroCount) % 2 != 0){ | |
4128 | numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6; | |
4129 | digIndx += 1; // The implicit leading zero | |
4130 | } | |
4131 | if (trailingZeroCount % 2 != 0){ | |
4132 | // We had to consume one trailing zero for the low digit | |
4133 | // of the least significant byte | |
4134 | digIndx += 1; // The trailing zero not in the exponent | |
4135 | trailingZeroCount -= 1; | |
4136 | } | |
4137 | ||
4138 | endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ; | |
4139 | ||
4140 | // Subtract one off of the last byte. Really the first byte here, but it's reversed... | |
4141 | numTempBuf[2] -= 1; | |
4142 | ||
4143 | /* | |
4144 | We want to skip over the first two slots in the buffer. The first slot | |
4145 | is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the | |
4146 | sign/exponent byte: 0x80 + (decimalPos/2) & 7f. | |
4147 | The exponent must be adjusted by the number of leading zeroes, and the number of | |
4148 | trailing zeroes. | |
4149 | */ | |
4150 | numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; | |
4151 | uint32_t exponent = (digIndx+trailingZeroCount)/2; | |
4152 | if (leadingZeroIndex) | |
4153 | exponent -= ((digIndx/2) + 2 - leadingZeroIndex); | |
4154 | numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F)); | |
4155 | ||
4156 | // Now transfer the collation key to our collIterate struct. | |
4157 | // The total size for our collation key is endIndx bumped up to the next largest even value divided by two. | |
4158 | //size = ((endIndex+1) & ~1)/2; | |
4159 | *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight | |
4160 | (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight | |
4161 | UCOL_BYTE_COMMON; // Tertiary weight. | |
4162 | i = endIndex - 1; // Reset the index into the buffer. | |
4163 | while(i >= 2) | |
4164 | { | |
4165 | primWeight = numTempBuf[i--] << 8; | |
4166 | if ( i >= 2) | |
4167 | primWeight |= numTempBuf[i--]; | |
4168 | *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; | |
4169 | } | |
4170 | if (numTempBuf != stackNumTempBuf) | |
4171 | uprv_free(numTempBuf); | |
4172 | ||
4173 | source->toReturn = source->CEpos -1; | |
4174 | return *(source->toReturn); | |
b75a7d8f | 4175 | } |
374ca955 A |
4176 | else { |
4177 | CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); | |
4178 | CE = *(CEOffset++); | |
4179 | break; | |
4180 | #if 0 | |
4181 | /* find the offset to expansion table */ | |
4182 | CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); | |
4183 | size = getExpansionCount(CE); | |
4184 | if (size != 0) { | |
4185 | /* | |
4186 | if there are less than 16 elements in expansion, we don't terminate | |
4187 | */ | |
4188 | uint32_t count; | |
4189 | for (count = 0; count < size; count++) { | |
4190 | *(source->CEpos ++) = *CEOffset++; | |
4191 | } | |
4192 | } | |
4193 | else { | |
4194 | /* else, we do */ | |
4195 | while (*CEOffset != 0) { | |
4196 | *(source->CEpos ++) = *CEOffset ++; | |
4197 | } | |
4198 | } | |
4199 | source->toReturn = source->CEpos - 1; | |
4200 | // in case of one element expansion, we | |
b75a7d8f A |
4201 | // want to immediately return CEpos |
4202 | if(source->toReturn == source->CEs) { | |
4203 | source->CEpos = source->CEs; | |
4204 | } | |
374ca955 A |
4205 | return *(source->toReturn); |
4206 | #endif | |
4207 | } | |
4208 | } | |
b75a7d8f A |
4209 | case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ |
4210 | { | |
4211 | const uint32_t | |
4212 | SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; | |
374ca955 | 4213 | //const uint32_t LCount = 19; |
b75a7d8f A |
4214 | const uint32_t VCount = 21; |
4215 | const uint32_t TCount = 28; | |
4216 | //const uint32_t NCount = VCount * TCount; /* 588 */ | |
4217 | //const uint32_t SCount = LCount * NCount; /* 11172 */ | |
4218 | ||
4219 | uint32_t L = ch - SBase; | |
4220 | /* | |
4221 | divide into pieces. | |
4222 | we do it in this order since some compilers can do % and / in one | |
4223 | operation | |
4224 | */ | |
4225 | uint32_t T = L % TCount; | |
4226 | L /= TCount; | |
4227 | uint32_t V = L % VCount; | |
4228 | L /= VCount; | |
4229 | ||
4230 | /* offset them */ | |
4231 | L += LBase; | |
4232 | V += VBase; | |
4233 | T += TBase; | |
4234 | ||
4235 | /* | |
4236 | return the first CE, but first put the rest into the expansion buffer | |
4237 | */ | |
4238 | if (!source->coll->image->jamoSpecial) | |
4239 | { | |
4240 | /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, L);*/ | |
4241 | /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/ | |
4242 | *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, L); | |
4243 | /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, V);*/ | |
4244 | /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/ | |
4245 | *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, V); | |
4246 | if (T != TBase) | |
4247 | /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, T);*/ | |
4248 | /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/ | |
4249 | *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, T); | |
4250 | ||
4251 | source->toReturn = source->CEpos - 1; | |
4252 | return *(source->toReturn); | |
4253 | } else { | |
374ca955 | 4254 | // Since Hanguls pass the FCD check, it is |
b75a7d8f A |
4255 | // guaranteed that we won't be in |
4256 | // the normalization buffer if something like this happens | |
4257 | // Move Jamos into normalization buffer | |
4258 | /* | |
4259 | Move the Jamos into the | |
374ca955 | 4260 | normalization buffer |
b75a7d8f A |
4261 | */ |
4262 | UChar *tempbuffer = source->writableBuffer + | |
4263 | (source->writableBufSize - 1); | |
4264 | *(tempbuffer) = 0; | |
4265 | if (T != TBase) { | |
4266 | *(tempbuffer - 1) = (UChar)T; | |
4267 | *(tempbuffer - 2) = (UChar)V; | |
4268 | *(tempbuffer - 3) = (UChar)L; | |
4269 | *(tempbuffer - 4) = 0; | |
4270 | } else { | |
4271 | *(tempbuffer - 1) = (UChar)V; | |
4272 | *(tempbuffer - 2) = (UChar)L; | |
4273 | *(tempbuffer - 3) = 0; | |
4274 | } | |
4275 | ||
4276 | /* | |
4277 | Indicate where to continue in main input string after exhausting | |
4278 | the writableBuffer | |
4279 | */ | |
4280 | if (source->pos == source->string) { | |
4281 | source->fcdPosition = NULL; | |
4282 | } else { | |
4283 | source->fcdPosition = source->pos-1; | |
4284 | } | |
4285 | ||
4286 | source->pos = tempbuffer; | |
4287 | source->origFlags = source->flags; | |
4288 | source->flags |= UCOL_ITER_INNORMBUF; | |
4289 | source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); | |
4290 | ||
4291 | return(UCOL_IGNORABLE); | |
4292 | } | |
4293 | } | |
4294 | case LEAD_SURROGATE_TAG: /* D800-DBFF*/ | |
4295 | return 0; /* broken surrogate sequence */ | |
4296 | case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ | |
4297 | { | |
4298 | UChar32 cp = 0; | |
4299 | UChar prevChar; | |
4300 | UChar *prev; | |
4301 | if (isAtStartPrevIterate(source)) { | |
4302 | /* we are at the start of the string, wrong place to be at */ | |
4303 | return 0; | |
4304 | } | |
4305 | if (source->pos != source->writableBuffer) { | |
4306 | prev = source->pos - 1; | |
4307 | } else { | |
4308 | prev = source->fcdPosition; | |
4309 | } | |
4310 | prevChar = *prev; | |
4311 | ||
4312 | /* Handles Han and Supplementary characters here.*/ | |
4313 | if (UTF_IS_FIRST_SURROGATE(prevChar)) { | |
4314 | cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); | |
4315 | source->pos = prev; | |
4316 | } else { | |
4317 | return 0; /* completely ignorable */ | |
4318 | } | |
4319 | return getPrevImplicit(cp, source); | |
4320 | } | |
4321 | // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function | |
4322 | case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ | |
4323 | return getPrevImplicit(ch, source); | |
4324 | case IMPLICIT_TAG: /* everything that is not defined otherwise */ | |
4325 | return getPrevImplicit(ch, source); | |
4326 | /* UCA is filled with these. Tailorings are NOT_FOUND */ | |
4327 | /* not yet implemented */ | |
4328 | case CHARSET_TAG: /* this tag always returns */ | |
4329 | /* probably after 1.8 */ | |
4330 | return UCOL_NOT_FOUND; | |
4331 | default: /* this tag always returns */ | |
4332 | *status = U_INTERNAL_PROGRAM_ERROR; | |
4333 | CE=0; | |
4334 | break; | |
4335 | } | |
4336 | if (CE <= UCOL_NOT_FOUND) { | |
4337 | break; | |
4338 | } | |
4339 | } | |
4340 | return CE; | |
4341 | } | |
4342 | ||
4343 | /* This should really be a macro */ | |
4344 | /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */ | |
4345 | /* anyway */ | |
4346 | static | |
4347 | uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) { | |
4348 | #ifdef UCOL_DEBUG | |
4349 | fprintf(stderr, "."); | |
4350 | #endif | |
4351 | uint8_t *newStart = NULL; | |
4352 | uint32_t offset = *secondaries-secStart; | |
4353 | ||
4354 | if(secStart==second) { | |
4355 | newStart=(uint8_t*)uprv_malloc(newSize); | |
4356 | if(newStart==NULL) { | |
4357 | *status = U_MEMORY_ALLOCATION_ERROR; | |
4358 | return NULL; | |
4359 | } | |
4360 | uprv_memcpy(newStart, secStart, *secondaries-secStart); | |
4361 | } else { | |
4362 | newStart=(uint8_t*)uprv_realloc(secStart, newSize); | |
4363 | if(newStart==NULL) { | |
4364 | *status = U_MEMORY_ALLOCATION_ERROR; | |
4365 | return NULL; | |
4366 | } | |
4367 | } | |
4368 | *secondaries=newStart+offset; | |
4369 | *secSize=newSize; | |
4370 | return newStart; | |
4371 | } | |
4372 | ||
4373 | ||
/*
 * Reverses, in place, the elements between two pointers (both inclusive).
 * This operation is needed when handling continuation secondaries in French.
 *
 * TYPE   element type of the buffer
 * start  modifiable pointer to the first element of the range
 * end    modifiable pointer to the last element of the range
 *
 * Both pointer arguments are evaluated several times and are modified:
 * start is advanced and end is moved back until they meet or cross.
 * The macro expands to a brace-enclosed block, not to an expression.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
    TYPE swapTmp; \
    for (; (start) < (end); ++(start), --(end)) { \
        swapTmp = *(start); \
        *(start) = *(end); \
        *(end) = swapTmp; \
    } \
}
4396 | ||
4397 | /****************************************************************************/ | |
4398 | /* Following are the sortkey generation functions */ | |
4399 | /* */ | |
4400 | /****************************************************************************/ | |
4401 | ||
4402 | /** | |
4403 | * Merge two sort keys. | |
4404 | * This is useful, for example, to combine sort keys from first and last names | |
4405 | * to sort such pairs. | |
4406 | * Merged sort keys consider on each collation level the first part first entirely, | |
4407 | * then the second one. | |
4408 | * It is possible to merge multiple sort keys by consecutively merging | |
4409 | * another one with the intermediate result. | |
4410 | * | |
4411 | * The length of the merge result is the sum of the lengths of the input sort keys | |
4412 | * minus 1. | |
4413 | * | |
4414 | * @param src1 the first sort key | |
4415 | * @param src1Length the length of the first sort key, including the zero byte at the end; | |
4416 | * can be -1 if the function is to find the length | |
4417 | * @param src2 the second sort key | |
4418 | * @param src2Length the length of the second sort key, including the zero byte at the end; | |
4419 | * can be -1 if the function is to find the length | |
4420 | * @param dest the buffer where the merged sort key is written, | |
4421 | * can be NULL if destCapacity==0 | |
4422 | * @param destCapacity the number of bytes in the dest buffer | |
4423 | * @return the length of the merged sort key, src1Length+src2Length-1; | |
4424 | * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments), | |
4425 | * in which cases the contents of dest is undefined | |
4426 | * | |
4427 | * @draft | |
4428 | */ | |
4429 | U_CAPI int32_t U_EXPORT2 | |
4430 | ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, | |
4431 | const uint8_t *src2, int32_t src2Length, | |
4432 | uint8_t *dest, int32_t destCapacity) { | |
4433 | int32_t destLength; | |
4434 | uint8_t b; | |
4435 | ||
4436 | /* check arguments */ | |
4437 | if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) || | |
4438 | src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) || | |
4439 | destCapacity<0 || (destCapacity>0 && dest==NULL) | |
4440 | ) { | |
4441 | /* error, attempt to write a zero byte and return 0 */ | |
4442 | if(dest!=NULL && destCapacity>0) { | |
4443 | *dest=0; | |
4444 | } | |
4445 | return 0; | |
4446 | } | |
4447 | ||
4448 | /* check lengths and capacity */ | |
4449 | if(src1Length<0) { | |
4450 | src1Length=(int32_t)uprv_strlen((const char *)src1)+1; | |
4451 | } | |
4452 | if(src2Length<0) { | |
4453 | src2Length=(int32_t)uprv_strlen((const char *)src2)+1; | |
4454 | } | |
4455 | ||
4456 | destLength=src1Length+src2Length-1; | |
4457 | if(destLength>destCapacity) { | |
4458 | /* the merged sort key does not fit into the destination */ | |
4459 | return destLength; | |
4460 | } | |
4461 | ||
4462 | /* merge the sort keys with the same number of levels */ | |
4463 | while(*src1!=0 && *src2!=0) { /* while both have another level */ | |
4464 | /* copy level from src1 not including 00 or 01 */ | |
4465 | while((b=*src1)>=2) { | |
4466 | ++src1; | |
4467 | *dest++=b; | |
4468 | } | |
4469 | ||
4470 | /* add a 02 merge separator */ | |
4471 | *dest++=2; | |
4472 | ||
4473 | /* copy level from src2 not including 00 or 01 */ | |
4474 | while((b=*src2)>=2) { | |
4475 | ++src2; | |
4476 | *dest++=b; | |
4477 | } | |
4478 | ||
4479 | /* if both sort keys have another level, then add a 01 level separator and continue */ | |
4480 | if(*src1==1 && *src2==1) { | |
4481 | ++src1; | |
4482 | ++src2; | |
4483 | *dest++=1; | |
4484 | } | |
4485 | } | |
4486 | ||
4487 | /* | |
4488 | * here, at least one sort key is finished now, but the other one | |
4489 | * might have some contents left from containing more levels; | |
4490 | * that contents is just appended to the result | |
4491 | */ | |
4492 | if(*src1!=0) { | |
4493 | /* src1 is not finished, therefore *src2==0, and src1 is appended */ | |
4494 | src2=src1; | |
4495 | } | |
4496 | /* append src2, "the other, unfinished sort key" */ | |
4497 | uprv_strcpy((char *)dest, (const char *)src2); | |
4498 | ||
4499 | /* trust that neither sort key contained illegally embedded zero bytes */ | |
4500 | return destLength; | |
4501 | } | |
4502 | ||
4503 | /* sortkey API */ | |
4504 | U_CAPI int32_t U_EXPORT2 | |
4505 | ucol_getSortKey(const UCollator *coll, | |
4506 | const UChar *source, | |
4507 | int32_t sourceLength, | |
4508 | uint8_t *result, | |
4509 | int32_t resultLength) | |
4510 | { | |
374ca955 A |
4511 | UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); |
4512 | if (UTRACE_LEVEL(UTRACE_VERBOSE)) { | |
4513 | int32_t actualSrcLen = sourceLength; | |
4514 | if (actualSrcLen==-1 && source!=NULL) { | |
4515 | actualSrcLen = u_strlen(source); | |
4516 | } | |
4517 | UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source, actualSrcLen); | |
4518 | } | |
4519 | ||
b75a7d8f | 4520 | UErrorCode status = U_ZERO_ERROR; |
374ca955 | 4521 | int32_t keySize = 0; |
b75a7d8f | 4522 | |
374ca955 A |
4523 | if(source != NULL) { |
4524 | // source == NULL is actually an error situation, but we would need to | |
4525 | // have an error code to return it. Until we introduce a new | |
4526 | // API, it stays like this | |
b75a7d8f | 4527 | |
374ca955 A |
4528 | /* this uses the function pointer that is set in updateinternalstate */ |
4529 | /* currently, there are two funcs: */ | |
4530 | /*ucol_calcSortKey(...);*/ | |
4531 | /*ucol_calcSortKeySimpleTertiary(...);*/ | |
4532 | ||
4533 | keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status); | |
4534 | //((UCollator *)coll)->errorCode = status; /*semantically const */ | |
4535 | } | |
4536 | UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); | |
4537 | UTRACE_EXIT_STATUS(status); | |
b75a7d8f A |
4538 | return keySize; |
4539 | } | |
4540 | ||
4541 | /* this function is called by the C++ API for sortkey generation */ | |
4542 | U_CFUNC int32_t | |
4543 | ucol_getSortKeyWithAllocation(const UCollator *coll, | |
4544 | const UChar *source, int32_t sourceLength, | |
4545 | uint8_t **pResult, | |
4546 | UErrorCode *pErrorCode) { | |
4547 | *pResult = 0; | |
4548 | return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode); | |
4549 | } | |
4550 | ||
4551 | #define UCOL_FSEC_BUF_SIZE 256 | |
4552 | ||
4553 | /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0 */ | |
4554 | /* or if we run out of space while making a sortkey and want to return ASAP */ | |
4555 | int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) { | |
4556 | UErrorCode status = U_ZERO_ERROR; | |
374ca955 | 4557 | const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); |
b75a7d8f A |
4558 | uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); |
4559 | uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); | |
4560 | uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); | |
4561 | UBool compareIdent = (strength == UCOL_IDENTICAL); | |
4562 | UBool doCase = (coll->caseLevel == UCOL_ON); | |
4563 | UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); | |
4564 | //UBool qShifted = shifted && (compareQuad == 0); | |
4565 | UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); | |
4566 | UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0); | |
4567 | uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE]; | |
4568 | uint8_t *fSecs = fSecsBuff; | |
4569 | uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE; | |
4570 | uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL; | |
4571 | ||
4572 | uint32_t variableTopValue = coll->variableTopValue; | |
374ca955 | 4573 | uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); |
b75a7d8f A |
4574 | if(doHiragana) { |
4575 | UCOL_COMMON_BOT4++; | |
4576 | /* allocate one more space for hiragana */ | |
4577 | } | |
4578 | uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); | |
4579 | ||
4580 | uint32_t order = UCOL_NO_MORE_CES; | |
4581 | uint8_t primary1 = 0; | |
4582 | uint8_t primary2 = 0; | |
4583 | uint8_t secondary = 0; | |
4584 | uint8_t tertiary = 0; | |
4585 | int32_t caseShift = 0; | |
4586 | uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */ | |
4587 | ||
4588 | uint8_t caseSwitch = coll->caseSwitch; | |
4589 | uint8_t tertiaryMask = coll->tertiaryMask; | |
4590 | uint8_t tertiaryCommon = coll->tertiaryCommon; | |
4591 | ||
4592 | UBool wasShifted = FALSE; | |
4593 | UBool notIsContinuation = FALSE; | |
4594 | uint8_t leadPrimary = 0; | |
4595 | ||
4596 | ||
4597 | for(;;) { | |
4598 | order = ucol_IGetNextCE(coll, s, &status); | |
4599 | if(order == UCOL_NO_MORE_CES) { | |
4600 | break; | |
4601 | } | |
4602 | ||
4603 | if(order == 0) { | |
4604 | continue; | |
4605 | } | |
4606 | ||
4607 | notIsContinuation = !isContinuation(order); | |
4608 | ||
4609 | ||
4610 | if(notIsContinuation) { | |
4611 | tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK)); | |
4612 | } else { | |
4613 | tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); | |
4614 | } | |
4615 | secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); | |
4616 | primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); | |
4617 | primary1 = (uint8_t)(order >> 8); | |
4618 | ||
4619 | ||
4620 | if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0) | |
374ca955 | 4621 | || (!notIsContinuation && wasShifted)) |
b75a7d8f A |
4622 | || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */ |
4623 | /* and other ignorables should be removed if following a shifted code point */ | |
4624 | if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */ | |
4625 | /* we should just completely ignore it */ | |
4626 | continue; | |
4627 | } | |
4628 | if(compareQuad == 0) { | |
4629 | if(c4 > 0) { | |
4630 | currentSize += (c2/UCOL_BOT_COUNT4)+1; | |
4631 | c4 = 0; | |
4632 | } | |
4633 | currentSize++; | |
4634 | if(primary2 != 0) { | |
4635 | currentSize++; | |
4636 | } | |
4637 | } | |
4638 | wasShifted = TRUE; | |
4639 | } else { | |
4640 | wasShifted = FALSE; | |
4641 | /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ | |
4642 | /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */ | |
4643 | /* calculate sortkey size */ | |
4644 | if(primary1 != UCOL_IGNORABLE) { | |
4645 | if(notIsContinuation) { | |
4646 | if(leadPrimary == primary1) { | |
4647 | currentSize++; | |
4648 | } else { | |
4649 | if(leadPrimary != 0) { | |
4650 | currentSize++; | |
4651 | } | |
4652 | if(primary2 == UCOL_IGNORABLE) { | |
4653 | /* one byter, not compressed */ | |
4654 | currentSize++; | |
4655 | leadPrimary = 0; | |
4656 | } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY || | |
4657 | //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) { | |
4658 | (primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) { | |
4659 | /* not compressible */ | |
4660 | leadPrimary = 0; | |
4661 | currentSize+=2; | |
4662 | } else { /* compress */ | |
4663 | leadPrimary = primary1; | |
4664 | currentSize+=2; | |
4665 | } | |
4666 | } | |
4667 | } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ | |
4668 | currentSize++; | |
4669 | if(primary2 != UCOL_IGNORABLE) { | |
4670 | currentSize++; | |
4671 | } | |
4672 | } | |
4673 | } | |
4674 | ||
4675 | if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */ | |
4676 | if(!isFrenchSec){ | |
4677 | if (secondary == UCOL_COMMON2 && notIsContinuation) { | |
4678 | c2++; | |
4679 | } else { | |
4680 | if(c2 > 0) { | |
4681 | if (secondary > UCOL_COMMON2) { // not necessary for 4th level. | |
4682 | currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1; | |
4683 | } else { | |
4684 | currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1; | |
4685 | } | |
4686 | c2 = 0; | |
4687 | } | |
4688 | currentSize++; | |
4689 | } | |
4690 | } else { | |
4691 | fSecs[fSecsLen++] = secondary; | |
4692 | if(fSecsLen == fSecsMaxLen) { | |
4693 | if(fSecs == fSecsBuff) { | |
4694 | fSecs = (uint8_t *)uprv_malloc(2*fSecsLen); | |
4695 | } else { | |
4696 | fSecs = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen); | |
4697 | } | |
4698 | if(fSecs == NULL) { | |
4699 | status = U_MEMORY_ALLOCATION_ERROR; | |
4700 | return -1; | |
4701 | } | |
4702 | fSecsMaxLen *= 2; | |
4703 | } | |
4704 | if(notIsContinuation) { | |
4705 | if (frenchStartPtr != NULL) { | |
4706 | /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ | |
4707 | uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); | |
4708 | frenchStartPtr = NULL; | |
4709 | } | |
4710 | } else { | |
4711 | if (frenchStartPtr == NULL) { | |
4712 | frenchStartPtr = fSecs+fSecsLen-2; | |
4713 | } | |
4714 | frenchEndPtr = fSecs+fSecsLen-1; | |
4715 | } | |
4716 | } | |
4717 | } | |
4718 | ||
4719 | if(doCase) { | |
4720 | if (caseShift == 0) { | |
4721 | currentSize++; | |
4722 | caseShift = UCOL_CASE_SHIFT_START; | |
4723 | } | |
4724 | if((tertiary&0x3F) > 0 && notIsContinuation) { | |
4725 | caseShift--; | |
4726 | if((tertiary &0xC0) != 0) { | |
4727 | if (caseShift == 0) { | |
4728 | currentSize++; | |
4729 | caseShift = UCOL_CASE_SHIFT_START; | |
4730 | } | |
4731 | caseShift--; | |
4732 | } | |
4733 | } | |
4734 | } else { | |
4735 | if(notIsContinuation) { | |
4736 | tertiary ^= caseSwitch; | |
4737 | } | |
4738 | } | |
4739 | ||
4740 | tertiary &= tertiaryMask; | |
4741 | if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */ | |
4742 | if (tertiary == tertiaryCommon && notIsContinuation) { | |
4743 | c3++; | |
4744 | } else { | |
4745 | if(c3 > 0) { | |
4746 | if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) | |
4747 | || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) { | |
4748 | currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1; | |
4749 | } else { | |
4750 | currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1; | |
4751 | } | |
4752 | c3 = 0; | |
4753 | } | |
4754 | currentSize++; | |
4755 | } | |
4756 | } | |
4757 | ||
4758 | if(/*qShifted*/(compareQuad==0) && notIsContinuation) { | |
4759 | if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it | |
4760 | if(c4>0) { // Close this part | |
4761 | currentSize += (c4/UCOL_BOT_COUNT4)+1; | |
4762 | c4 = 0; | |
4763 | } | |
4764 | currentSize++; // Add the Hiragana | |
4765 | } else { // This wasn't Hiragana, so we can continue adding stuff | |
4766 | c4++; | |
4767 | } | |
4768 | } | |
4769 | ||
4770 | } | |
4771 | } | |
4772 | ||
4773 | if(!isFrenchSec){ | |
4774 | if(c2 > 0) { | |
4775 | currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); | |
4776 | } | |
4777 | } else { | |
4778 | uint32_t i = 0; | |
4779 | if(frenchStartPtr != NULL) { | |
4780 | uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); | |
4781 | } | |
4782 | for(i = 0; i<fSecsLen; i++) { | |
4783 | secondary = *(fSecs+fSecsLen-i-1); | |
4784 | /* This is compression code. */ | |
4785 | if (secondary == UCOL_COMMON2) { | |
4786 | ++c2; | |
4787 | } else { | |
4788 | if(c2 > 0) { | |
4789 | if (secondary > UCOL_COMMON2) { // not necessary for 4th level. | |
4790 | currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0); | |
4791 | } else { | |
4792 | currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); | |
4793 | } | |
4794 | c2 = 0; | |
4795 | } | |
4796 | currentSize++; | |
4797 | } | |
4798 | } | |
4799 | if(c2 > 0) { | |
4800 | currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); | |
4801 | } | |
4802 | if(fSecs != fSecsBuff) { | |
4803 | uprv_free(fSecs); | |
4804 | } | |
4805 | } | |
4806 | ||
4807 | if(c3 > 0) { | |
4808 | currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0); | |
4809 | } | |
4810 | ||
4811 | if(c4 > 0 && compareQuad == 0) { | |
4812 | currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0); | |
4813 | } | |
4814 | ||
4815 | if(compareIdent) { | |
4816 | currentSize += u_lengthOfIdenticalLevelRun(s->string, len); | |
4817 | } | |
4818 | return currentSize; | |
4819 | ||
4820 | } | |
4821 | ||
4822 | static | |
4823 | inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) { | |
4824 | if (caseShift == 0) { | |
4825 | *(*cases)++ = UCOL_CASE_BYTE_START; | |
4826 | caseShift = UCOL_CASE_SHIFT_START; | |
4827 | } | |
4828 | } | |
4829 | ||
/*
 * Counts one more requested byte and stores it only if there is room.
 *
 * size is always incremented, so after a sequence of calls it holds the number
 * of bytes the caller wanted to add, even if some of them did not fit.
 * value is written at primaries (which is then advanced) only while
 * primaries is still below limit.
 */
static inline void
addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
    ++size;
    if (primaries >= limit) {
        return; /* buffer is full: count the byte, but do not write it */
    }
    *primaries = value;
    ++primaries;
}
4839 | ||
374ca955 | 4840 | // Packs the secondary buffer when processing French locale. Adds the terminator. |
b75a7d8f A |
4841 | static |
4842 | inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) { | |
4843 | uint8_t secondary; | |
4844 | int32_t count2 = 0; | |
4845 | uint32_t i = 0, size = 0; | |
4846 | // we use i here since the key size already accounts for terminators, so we'll discard the increment | |
374ca955 | 4847 | addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR); |
b75a7d8f A |
4848 | /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */ |
4849 | if(frenchStartPtr != NULL) { | |
4850 | uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); | |
4851 | } | |
4852 | for(i = 0; i<*secsize; i++) { | |
4853 | secondary = *(secondaries-i-1); | |
4854 | /* This is compression code. */ | |
4855 | if (secondary == UCOL_COMMON2) { | |
4856 | ++count2; | |
4857 | } else { | |
4858 | if (count2 > 0) { | |
4859 | if (secondary > UCOL_COMMON2) { // not necessary for 4th level. | |
4860 | while (count2 > UCOL_TOP_COUNT2) { | |
4861 | addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2)); | |
4862 | count2 -= (uint32_t)UCOL_TOP_COUNT2; | |
4863 | } | |
4864 | addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1))); | |
4865 | } else { | |
4866 | while (count2 > UCOL_BOT_COUNT2) { | |
4867 | addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2)); | |
4868 | count2 -= (uint32_t)UCOL_BOT_COUNT2; | |
4869 | } | |
4870 | addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); | |
4871 | } | |
4872 | count2 = 0; | |
4873 | } | |
4874 | addWithIncrement(primaries, primEnd, size, secondary); | |
4875 | } | |
4876 | } | |
4877 | if (count2 > 0) { | |
4878 | while (count2 > UCOL_BOT_COUNT2) { | |
4879 | addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2)); | |
4880 | count2 -= (uint32_t)UCOL_BOT_COUNT2; | |
4881 | } | |
4882 | addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); | |
4883 | } | |
4884 | *secsize = size; | |
4885 | return primaries; | |
4886 | } | |
4887 | ||
4888 | /* This is the sortkey work horse function */ | |
4889 | U_CFUNC int32_t U_CALLCONV | |
4890 | ucol_calcSortKey(const UCollator *coll, | |
4891 | const UChar *source, | |
4892 | int32_t sourceLength, | |
4893 | uint8_t **result, | |
4894 | uint32_t resultLength, | |
4895 | UBool allocateSKBuffer, | |
4896 | UErrorCode *status) | |
4897 | { | |
374ca955 A |
4898 | const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); |
4899 | ||
b75a7d8f A |
4900 | uint32_t i = 0; /* general purpose counter */ |
4901 | ||
4902 | /* Stack allocated buffers for buffers we use */ | |
4903 | uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER]; | |
4904 | ||
4905 | uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad; | |
4906 | ||
4907 | if(U_FAILURE(*status)) { | |
4908 | return 0; | |
4909 | } | |
4910 | ||
4911 | if(primaries == NULL && allocateSKBuffer == TRUE) { | |
4912 | primaries = *result = prim; | |
4913 | resultLength = UCOL_PRIMARY_MAX_BUFFER; | |
4914 | } | |
4915 | ||
4916 | uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER, | |
4917 | caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER; | |
4918 | ||
4919 | uint32_t sortKeySize = 1; /* it is always \0 terminated */ | |
4920 | ||
4921 | UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER]; | |
4922 | UChar *normSource = normBuffer; | |
4923 | int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER; | |
4924 | ||
4925 | int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength); | |
4926 | ||
4927 | UColAttributeValue strength = coll->strength; | |
4928 | ||
4929 | uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); | |
4930 | uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); | |
4931 | uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); | |
4932 | UBool compareIdent = (strength == UCOL_IDENTICAL); | |
4933 | UBool doCase = (coll->caseLevel == UCOL_ON); | |
4934 | UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0); | |
4935 | UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); | |
4936 | //UBool qShifted = shifted && (compareQuad == 0); | |
4937 | UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); | |
4938 | const uint8_t *scriptOrder = coll->scriptOrder; | |
4939 | ||
4940 | uint32_t variableTopValue = coll->variableTopValue; | |
4941 | // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no | |
4942 | // qShifted, we don't need to set UCOL_COMMON_BOT4 so high. | |
4943 | uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); | |
4944 | uint8_t UCOL_HIRAGANA_QUAD = 0; | |
4945 | if(doHiragana) { | |
4946 | UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++; | |
4947 | /* allocate one more space for hiragana, value for hiragana */ | |
4948 | } | |
4949 | uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); | |
4950 | ||
4951 | /* support for special features like caselevel and funky secondaries */ | |
4952 | uint8_t *frenchStartPtr = NULL; | |
4953 | uint8_t *frenchEndPtr = NULL; | |
4954 | uint32_t caseShift = 0; | |
4955 | ||
4956 | sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0)); | |
4957 | ||
4958 | /* If we need to normalize, we'll do it all at once at the beginning! */ | |
4959 | UNormalizationMode normMode; | |
4960 | if(compareIdent) { | |
4961 | normMode = UNORM_NFD; | |
4962 | } else if(coll->normalizationMode != UCOL_OFF) { | |
4963 | normMode = UNORM_FCD; | |
4964 | } else { | |
4965 | normMode = UNORM_NONE; | |
4966 | } | |
4967 | ||
4968 | if(normMode != UNORM_NONE && UNORM_YES != unorm_quickCheck(source, len, normMode, status)) { | |
4969 | len = unorm_internalNormalize(normSource, normSourceLen, | |
4970 | source, len, | |
4971 | normMode, FALSE, | |
4972 | status); | |
4973 | if(*status == U_BUFFER_OVERFLOW_ERROR) { | |
4974 | normSourceLen = len; | |
4975 | normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR); | |
4976 | if(normSource == NULL) { | |
4977 | *status = U_MEMORY_ALLOCATION_ERROR; | |
4978 | return 0; | |
4979 | } | |
4980 | *status = U_ZERO_ERROR; | |
4981 | len = unorm_internalNormalize(normSource, normSourceLen, | |
4982 | source, len, | |
4983 | normMode, FALSE, | |
4984 | status); | |
4985 | } | |
4986 | ||
4987 | if(U_FAILURE(*status)) { | |
4988 | return 0; | |
4989 | } | |
4990 | source = normSource; | |
4991 | } | |
4992 | ||
4993 | collIterate s; | |
4994 | IInit_collIterate(coll, (UChar *)source, len, &s); | |
4995 | if(source == normSource) { | |
4996 | s.flags &= ~UCOL_ITER_NORM; | |
4997 | } | |
4998 | ||
4999 | if(resultLength == 0 || primaries == NULL) { | |
5000 | int32_t keyLen = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len); | |
5001 | if(normSource != normBuffer) { | |
5002 | uprv_free(normSource); | |
5003 | } | |
5004 | return keyLen; | |
5005 | } | |
5006 | uint8_t *primarySafeEnd = primaries + resultLength - 2; | |
5007 | ||
5008 | uint32_t minBufferSize = UCOL_MAX_BUFFER; | |
5009 | ||
5010 | uint8_t *primStart = primaries; | |
5011 | uint8_t *secStart = secondaries; | |
5012 | uint8_t *terStart = tertiaries; | |
5013 | uint8_t *caseStart = cases; | |
5014 | uint8_t *quadStart = quads; | |
5015 | ||
5016 | uint32_t order = 0; | |
5017 | ||
5018 | uint8_t primary1 = 0; | |
5019 | uint8_t primary2 = 0; | |
5020 | uint8_t secondary = 0; | |
5021 | uint8_t tertiary = 0; | |
5022 | uint8_t caseSwitch = coll->caseSwitch; | |
5023 | uint8_t tertiaryMask = coll->tertiaryMask; | |
5024 | int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition; | |
5025 | uint8_t tertiaryTop = coll->tertiaryTop; | |
5026 | uint8_t tertiaryBottom = coll->tertiaryBottom; | |
5027 | uint8_t tertiaryCommon = coll->tertiaryCommon; | |
5028 | uint8_t caseBits = 0; | |
5029 | ||
5030 | UBool finished = FALSE; | |
5031 | UBool wasShifted = FALSE; | |
5032 | UBool notIsContinuation = FALSE; | |
5033 | ||
5034 | uint32_t prevBuffSize = 0; | |
5035 | ||
5036 | uint32_t count2 = 0, count3 = 0, count4 = 0; | |
5037 | uint8_t leadPrimary = 0; | |
5038 | ||
5039 | for(;;) { | |
5040 | for(i=prevBuffSize; i<minBufferSize; ++i) { | |
5041 | ||
5042 | order = ucol_IGetNextCE(coll, &s, status); | |
5043 | if(order == UCOL_NO_MORE_CES) { | |
5044 | finished = TRUE; | |
5045 | break; | |
5046 | } | |
5047 | ||
5048 | if(order == 0) { | |
5049 | continue; | |
5050 | } | |
5051 | ||
5052 | notIsContinuation = !isContinuation(order); | |
5053 | ||
5054 | if(notIsContinuation) { | |
5055 | tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK); | |
5056 | } else { | |
5057 | tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); | |
5058 | } | |
5059 | ||
5060 | secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); | |
5061 | primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); | |
5062 | primary1 = (uint8_t)(order >> 8); | |
5063 | ||
5064 | if(notIsContinuation) { | |
5065 | if(scriptOrder != NULL) { | |
5066 | primary1 = scriptOrder[primary1]; | |
5067 | } | |
5068 | } | |
5069 | ||
5070 | if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0) | |
5071 | || (!notIsContinuation && wasShifted)) | |
5072 | || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */ | |
5073 | /* and other ignorables should be removed if following a shifted code point */ | |
5074 | if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */ | |
5075 | /* we should just completely ignore it */ | |
5076 | continue; | |
5077 | } | |
5078 | if(compareQuad == 0) { | |
5079 | if(count4 > 0) { | |
5080 | while (count4 > UCOL_BOT_COUNT4) { | |
5081 | *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); | |
5082 | count4 -= UCOL_BOT_COUNT4; | |
5083 | } | |
5084 | *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); | |
5085 | count4 = 0; | |
5086 | } | |
5087 | /* We are dealing with a variable and we're treating them as shifted */ | |
5088 | /* This is a shifted ignorable */ | |
5089 | if(primary1 != 0) { /* we need to check this since we could be in continuation */ | |
5090 | *quads++ = primary1; | |
5091 | } | |
5092 | if(primary2 != 0) { | |
5093 | *quads++ = primary2; | |
5094 | } | |
5095 | } | |
5096 | wasShifted = TRUE; | |
5097 | } else { | |
5098 | wasShifted = FALSE; | |
5099 | /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ | |
5100 | /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */ | |
5101 | /* regular and simple sortkey calc */ | |
5102 | if(primary1 != UCOL_IGNORABLE) { | |
5103 | if(notIsContinuation) { | |
5104 | if(leadPrimary == primary1) { | |
5105 | *primaries++ = primary2; | |
5106 | } else { | |
5107 | if(leadPrimary != 0) { | |
5108 | *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); | |
5109 | } | |
5110 | if(primary2 == UCOL_IGNORABLE) { | |
5111 | /* one byter, not compressed */ | |
5112 | *primaries++ = primary1; | |
5113 | leadPrimary = 0; | |
5114 | } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY || | |
5115 | (primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) { | |
5116 | /* not compressible */ | |
5117 | leadPrimary = 0; | |
5118 | *primaries++ = primary1; | |
5119 | *primaries++ = primary2; | |
5120 | } else { /* compress */ | |
5121 | *primaries++ = leadPrimary = primary1; | |
5122 | *primaries++ = primary2; | |
5123 | } | |
5124 | } | |
5125 | } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ | |
5126 | *primaries++ = primary1; | |
5127 | if(primary2 != UCOL_IGNORABLE) { | |
5128 | *primaries++ = primary2; /* second part */ | |
5129 | } | |
5130 | } | |
5131 | } | |
5132 | ||
5133 | if(secondary > compareSec) { | |
5134 | if(!isFrenchSec) { | |
5135 | /* This is compression code. */ | |
5136 | if (secondary == UCOL_COMMON2 && notIsContinuation) { | |
5137 | ++count2; | |
5138 | } else { | |
5139 | if (count2 > 0) { | |
5140 | if (secondary > UCOL_COMMON2) { // not necessary for 4th level. | |
5141 | while (count2 > UCOL_TOP_COUNT2) { | |
5142 | *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); | |
5143 | count2 -= (uint32_t)UCOL_TOP_COUNT2; | |
5144 | } | |
5145 | *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)); | |
5146 | } else { | |
5147 | while (count2 > UCOL_BOT_COUNT2) { | |
5148 | *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); | |
5149 | count2 -= (uint32_t)UCOL_BOT_COUNT2; | |
5150 | } | |
5151 | *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); | |
5152 | } | |
5153 | count2 = 0; | |
5154 | } | |
5155 | *secondaries++ = secondary; | |
5156 | } | |
5157 | } else { | |
5158 | *secondaries++ = secondary; | |
5159 | /* Do the special handling for French secondaries */ | |
5160 | /* We need to get continuation elements and do intermediate restore */ | |
5161 | /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */ | |
5162 | if(notIsContinuation) { | |
5163 | if (frenchStartPtr != NULL) { | |
5164 | /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ | |
5165 | uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); | |
5166 | frenchStartPtr = NULL; | |
5167 | } | |
5168 | } else { | |
5169 | if (frenchStartPtr == NULL) { | |
5170 | frenchStartPtr = secondaries - 2; | |
5171 | } | |
5172 | frenchEndPtr = secondaries-1; | |
5173 | } | |
5174 | } | |
5175 | } | |
5176 | ||
5177 | if(doCase) { | |
5178 | doCaseShift(&cases, caseShift); | |
5179 | if(notIsContinuation) { | |
5180 | caseBits = (uint8_t)(tertiary & 0xC0); | |
5181 | ||
5182 | if(tertiary != 0) { | |
5183 | if(coll->caseFirst == UCOL_UPPER_FIRST) { | |
5184 | if((caseBits & 0xC0) == 0) { | |
5185 | *(cases-1) |= 1 << (--caseShift); | |
5186 | } else { | |
5187 | *(cases-1) |= 0 << (--caseShift); | |
5188 | /* second bit */ | |
5189 | doCaseShift(&cases, caseShift); | |
5190 | *(cases-1) |= ((caseBits>>6)&1) << (--caseShift); | |
5191 | } | |
5192 | } else { | |
5193 | if((caseBits & 0xC0) == 0) { | |
5194 | *(cases-1) |= 0 << (--caseShift); | |
5195 | } else { | |
5196 | *(cases-1) |= 1 << (--caseShift); | |
5197 | /* second bit */ | |
5198 | doCaseShift(&cases, caseShift); | |
5199 | *(cases-1) |= ((caseBits>>7)&1) << (--caseShift); | |
5200 | } | |
5201 | } | |
5202 | } | |
5203 | ||
5204 | } | |
5205 | } else { | |
5206 | if(notIsContinuation) { | |
5207 | tertiary ^= caseSwitch; | |
5208 | } | |
5209 | } | |
5210 | ||
5211 | tertiary &= tertiaryMask; | |
5212 | if(tertiary > compareTer) { | |
5213 | /* This is compression code. */ | |
5214 | /* sequence size check is included in the if clause */ | |
5215 | if (tertiary == tertiaryCommon && notIsContinuation) { | |
5216 | ++count3; | |
5217 | } else { | |
5218 | if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) | |
5219 | || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) { | |
5220 | tertiary += tertiaryAddition; | |
5221 | } | |
5222 | if (count3 > 0) { | |
5223 | if ((tertiary > tertiaryCommon)) { | |
5224 | while (count3 > coll->tertiaryTopCount) { | |
5225 | *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); | |
5226 | count3 -= (uint32_t)coll->tertiaryTopCount; | |
5227 | } | |
5228 | *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1)); | |
5229 | } else { | |
5230 | while (count3 > coll->tertiaryBottomCount) { | |
5231 | *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); | |
5232 | count3 -= (uint32_t)coll->tertiaryBottomCount; | |
5233 | } | |
5234 | *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); | |
5235 | } | |
5236 | count3 = 0; | |
5237 | } | |
5238 | *tertiaries++ = tertiary; | |
5239 | } | |
5240 | } | |
5241 | ||
5242 | if(/*qShifted*/(compareQuad==0) && notIsContinuation) { | |
5243 | if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it | |
5244 | if(count4>0) { // Close this part | |
5245 | while (count4 > UCOL_BOT_COUNT4) { | |
5246 | *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); | |
5247 | count4 -= UCOL_BOT_COUNT4; | |
5248 | } | |
5249 | *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); | |
5250 | count4 = 0; | |
5251 | } | |
5252 | *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana | |
5253 | } else { // This wasn't Hiragana, so we can continue adding stuff | |
5254 | count4++; | |
5255 | } | |
5256 | } | |
5257 | } | |
5258 | ||
5259 | if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */ | |
5260 | if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */ | |
5261 | IInit_collIterate(coll, (UChar *)source, len, &s); | |
5262 | if(source == normSource) { | |
5263 | s.flags &= ~UCOL_ITER_NORM; | |
5264 | } | |
5265 | sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len); | |
5266 | *status = U_BUFFER_OVERFLOW_ERROR; | |
5267 | finished = TRUE; | |
5268 | break; | |
5269 | } else { /* It's much nicer if we can actually reallocate */ | |
5270 | int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart); | |
5271 | primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status); | |
5272 | if(U_SUCCESS(*status)) { | |
5273 | *result = primStart; | |
5274 | primarySafeEnd = primStart + resultLength - 2; | |
5275 | } else { | |
5276 | IInit_collIterate(coll, (UChar *)source, len, &s); | |
5277 | if(source == normSource) { | |
5278 | s.flags &= ~UCOL_ITER_NORM; | |
5279 | } | |
5280 | sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len); | |
5281 | finished = TRUE; | |
5282 | break; | |
5283 | } | |
5284 | } | |
5285 | } | |
5286 | } | |
5287 | if(finished) { | |
5288 | break; | |
5289 | } else { | |
5290 | prevBuffSize = minBufferSize; | |
5291 | secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status); | |
5292 | terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status); | |
5293 | caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status); | |
5294 | quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status); | |
5295 | minBufferSize *= 2; | |
5296 | if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size | |
5297 | IInit_collIterate(coll, (UChar *)source, len, &s); | |
5298 | if(source == normSource) { | |
5299 | s.flags &= ~UCOL_ITER_NORM; | |
5300 | } | |
5301 | sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len); | |
5302 | break; | |
5303 | } | |
5304 | } | |
5305 | } | |
5306 | ||
5307 | /* Here, we are generally done with processing */ | |
5308 | /* bailing out would not be too productive */ | |
5309 | ||
5310 | if(U_SUCCESS(*status)) { | |
5311 | sortKeySize += (primaries - primStart); | |
5312 | /* we have done all the CE's, now let's put them together to form a key */ | |
5313 | if(compareSec == 0) { | |
5314 | if (count2 > 0) { | |
5315 | while (count2 > UCOL_BOT_COUNT2) { | |
5316 | *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); | |
5317 | count2 -= (uint32_t)UCOL_BOT_COUNT2; | |
5318 | } | |
5319 | *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); | |
5320 | } | |
5321 | uint32_t secsize = secondaries-secStart; | |
5322 | if(!isFrenchSec) { // Regular situation, we know the length of secondaries | |
5323 | sortKeySize += secsize; | |
5324 | if(sortKeySize <= resultLength) { | |
5325 | *(primaries++) = UCOL_LEVELTERMINATOR; | |
5326 | uprv_memcpy(primaries, secStart, secsize); | |
5327 | primaries += secsize; | |
5328 | } else { | |
5329 | if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */ | |
5330 | primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); | |
5331 | if(U_SUCCESS(*status)) { | |
5332 | *result = primStart; | |
5333 | *(primaries++) = UCOL_LEVELTERMINATOR; | |
5334 | uprv_memcpy(primaries, secStart, secsize); | |
5335 | primaries += secsize; | |
5336 | } | |
5337 | } else { | |
5338 | *status = U_BUFFER_OVERFLOW_ERROR; | |
5339 | } | |
5340 | } | |
5341 | } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator | |
5342 | uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr); | |
5343 | sortKeySize += secsize; | |
5344 | if(sortKeySize <= resultLength) { // if we managed to pack fine | |
5345 | primaries = newPrim; // update the primary pointer | |
5346 | } else { // overflow, need to reallocate and redo | |
5347 | if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */ | |
5348 | primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); | |
5349 | if(U_SUCCESS(*status)) { | |
5350 | primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr); | |
5351 | } | |
5352 | } else { | |
5353 | *status = U_BUFFER_OVERFLOW_ERROR; | |
5354 | } | |
5355 | } | |
5356 | } | |
5357 | } | |
5358 | ||
5359 | if(doCase) { | |
5360 | uint32_t casesize = cases - caseStart; | |
5361 | sortKeySize += casesize; | |
5362 | if(sortKeySize <= resultLength) { | |
5363 | *(primaries++) = UCOL_LEVELTERMINATOR; | |
5364 | uprv_memcpy(primaries, caseStart, casesize); | |
5365 | primaries += casesize; | |
5366 | } else { | |
5367 | if(allocateSKBuffer == TRUE) { | |
5368 | primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); | |
5369 | if(U_SUCCESS(*status)) { | |
5370 | *result = primStart; | |
5371 | *(primaries++) = UCOL_LEVELTERMINATOR; | |
5372 | uprv_memcpy(primaries, caseStart, casesize); | |
5373 | } | |
5374 | } else { | |
5375 | *status = U_BUFFER_OVERFLOW_ERROR; | |
5376 | } | |
5377 | } | |
5378 | } | |
5379 | ||
5380 | if(compareTer == 0) { | |
5381 | if (count3 > 0) { | |
5382 | if (coll->tertiaryCommon != UCOL_COMMON_BOT3) { | |
5383 | while (count3 >= coll->tertiaryTopCount) { | |
5384 | *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); | |
5385 | count3 -= (uint32_t)coll->tertiaryTopCount; | |
5386 | } | |
5387 | *tertiaries++ = (uint8_t)(tertiaryTop - count3); | |
5388 | } else { | |
5389 | while (count3 > coll->tertiaryBottomCount) { | |
5390 | *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); | |
5391 | count3 -= (uint32_t)coll->tertiaryBottomCount; | |
5392 | } | |
5393 | *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); | |
5394 | } | |
5395 | } | |
5396 | uint32_t tersize = tertiaries - terStart; | |
5397 | sortKeySize += tersize; | |
5398 | if(sortKeySize <= resultLength) { | |
5399 | *(primaries++) = UCOL_LEVELTERMINATOR; | |
5400 | uprv_memcpy(primaries, terStart, tersize); | |
5401 | primaries += tersize; | |
5402 | } else { | |
5403 | if(allocateSKBuffer == TRUE) { | |
5404 | primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); | |
5405 | if(U_SUCCESS(*status)) { | |
5406 | *result = primStart; | |
5407 | *(primaries++) = UCOL_LEVELTERMINATOR; | |
5408 | uprv_memcpy(primaries, terStart, tersize); | |
5409 | } | |
5410 | } else { | |
5411 | *status = U_BUFFER_OVERFLOW_ERROR; | |
5412 | } | |
5413 | } | |
5414 | ||
5415 | if(compareQuad == 0/*qShifted == TRUE*/) { | |
5416 | if(count4 > 0) { | |
5417 | while (count4 > UCOL_BOT_COUNT4) { | |
5418 | *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); | |
5419 | count4 -= UCOL_BOT_COUNT4; | |
5420 | } | |
5421 | *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); | |
5422 | } | |
5423 | uint32_t quadsize = quads - quadStart; | |
5424 | sortKeySize += quadsize; | |
5425 | if(sortKeySize <= resultLength) { | |
5426 | *(primaries++) = UCOL_LEVELTERMINATOR; | |
5427 | uprv_memcpy(primaries, quadStart, quadsize); | |
5428 | primaries += quadsize; | |
5429 | } else { | |
5430 | if(allocateSKBuffer == TRUE) { | |
5431 | primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); | |
5432 | if(U_SUCCESS(*status)) { | |
5433 | *result = primStart; | |
5434 | *(primaries++) = UCOL_LEVELTERMINATOR; | |
5435 | uprv_memcpy(primaries, quadStart, quadsize); | |
5436 | } | |
5437 | } else { | |
5438 | *status = U_BUFFER_OVERFLOW_ERROR; | |
5439 | } | |
5440 | } | |
5441 | } | |
5442 | ||
5443 | if(compareIdent) { | |
5444 | sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len); | |
5445 | if(sortKeySize <= resultLength) { | |
5446 | *(primaries++) = UCOL_LEVELTERMINATOR; | |
5447 | primaries += u_writeIdenticalLevelRun(s.string, len, primaries); | |
5448 | } else { | |
5449 | if(allocateSKBuffer == TRUE) { | |
5450 | primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status); | |
5451 | if(U_SUCCESS(*status)) { | |
5452 | *result = primStart; | |
5453 | *(primaries++) = UCOL_LEVELTERMINATOR; | |
374ca955 | 5454 | u_writeIdenticalLevelRun(s.string, len, primaries); |
b75a7d8f A |
5455 | } |
5456 | } else { | |
5457 | *status = U_BUFFER_OVERFLOW_ERROR; | |
5458 | } | |
5459 | } | |
5460 | } | |
5461 | } | |
5462 | *(primaries++) = '\0'; | |
5463 | } | |
5464 | ||
5465 | if(terStart != tert) { | |
5466 | uprv_free(terStart); | |
5467 | uprv_free(secStart); | |
5468 | uprv_free(caseStart); | |
5469 | uprv_free(quadStart); | |
5470 | } | |
5471 | ||
5472 | if(normSource != normBuffer) { | |
5473 | uprv_free(normSource); | |
5474 | } | |
5475 | ||
5476 | if(allocateSKBuffer == TRUE) { | |
5477 | *result = (uint8_t*)uprv_malloc(sortKeySize); | |
374ca955 A |
5478 | /* test for NULL */ |
5479 | if (*result == NULL) { | |
5480 | *status = U_MEMORY_ALLOCATION_ERROR; | |
5481 | return sortKeySize; | |
5482 | } | |
b75a7d8f A |
5483 | uprv_memcpy(*result, primStart, sortKeySize); |
5484 | if(primStart != prim) { | |
5485 | uprv_free(primStart); | |
5486 | } | |
5487 | } | |
5488 | ||
5489 | return sortKeySize; | |
5490 | } | |
5491 | ||
5492 | ||
5493 | U_CFUNC int32_t U_CALLCONV | |
5494 | ucol_calcSortKeySimpleTertiary(const UCollator *coll, | |
5495 | const UChar *source, | |
5496 | int32_t sourceLength, | |
5497 | uint8_t **result, | |
5498 | uint32_t resultLength, | |
5499 | UBool allocateSKBuffer, | |
5500 | UErrorCode *status) | |
5501 | { | |
5502 | U_ALIGN_CODE(16); | |
374ca955 A |
5503 | |
5504 | const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); | |
b75a7d8f A |
5505 | uint32_t i = 0; /* general purpose counter */ |
5506 | ||
5507 | /* Stack allocated buffers for buffers we use */ | |
5508 | uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER]; | |
5509 | ||
5510 | uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert; | |
5511 | ||
5512 | if(U_FAILURE(*status)) { | |
5513 | return 0; | |
5514 | } | |
5515 | ||
5516 | if(primaries == NULL && allocateSKBuffer == TRUE) { | |
5517 | primaries = *result = prim; | |
5518 | resultLength = UCOL_PRIMARY_MAX_BUFFER; | |
5519 | } | |
5520 | ||
5521 | uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER; | |
5522 | ||
5523 | uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */ | |
5524 | ||
5525 | UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER]; | |
5526 | UChar *normSource = normBuffer; | |
5527 | int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER; | |
5528 | ||
5529 | int32_t len = sourceLength; | |
5530 | ||
5531 | /* If we need to normalize, we'll do it all at once at the beginning! */ | |
5532 | if(coll->normalizationMode != UCOL_OFF && UNORM_YES != unorm_quickCheck(source, len, UNORM_FCD, status)) { | |
5533 | len = unorm_internalNormalize(normSource, normSourceLen, | |
5534 | source, len, | |
5535 | UNORM_FCD, FALSE, | |
5536 | status); | |
5537 | if(*status == U_BUFFER_OVERFLOW_ERROR) { | |
5538 | normSourceLen = len; | |
5539 | normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR); | |
5540 | if(normSource == NULL) { | |
5541 | *status = U_MEMORY_ALLOCATION_ERROR; | |
5542 | return 0; | |
5543 | } | |
5544 | *status = U_ZERO_ERROR; | |
5545 | len = unorm_internalNormalize(normSource, normSourceLen, | |
5546 | source, len, | |
5547 | UNORM_FCD, FALSE, | |
5548 | status); | |
5549 | } | |
5550 | ||
5551 | if(U_FAILURE(*status)) { | |
5552 | return 0; | |
5553 | } | |
5554 | source = normSource; | |
5555 | } | |
5556 | ||
5557 | collIterate s; | |
5558 | IInit_collIterate(coll, (UChar *)source, len, &s); | |
5559 | if(source == normSource) { | |
5560 | s.flags &= ~UCOL_ITER_NORM; | |
5561 | } | |
5562 | ||
5563 | if(resultLength == 0 || primaries == NULL) { | |
5564 | int32_t t = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len); | |
5565 | if(normSource != normBuffer) { | |
5566 | uprv_free(normSource); | |
5567 | } | |
5568 | return t; | |
5569 | } | |
5570 | ||
5571 | uint8_t *primarySafeEnd = primaries + resultLength - 2; | |
5572 | ||
5573 | uint32_t minBufferSize = UCOL_MAX_BUFFER; | |
5574 | ||
5575 | uint8_t *primStart = primaries; | |
5576 | uint8_t *secStart = secondaries; | |
5577 | uint8_t *terStart = tertiaries; | |
5578 | ||
5579 | uint32_t order = 0; | |
5580 | ||
5581 | uint8_t primary1 = 0; | |
5582 | uint8_t primary2 = 0; | |
5583 | uint8_t secondary = 0; | |
5584 | uint8_t tertiary = 0; | |
5585 | uint8_t caseSwitch = coll->caseSwitch; | |
5586 | uint8_t tertiaryMask = coll->tertiaryMask; | |
5587 | int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition; | |
5588 | uint8_t tertiaryTop = coll->tertiaryTop; | |
5589 | uint8_t tertiaryBottom = coll->tertiaryBottom; | |
5590 | uint8_t tertiaryCommon = coll->tertiaryCommon; | |
5591 | ||
5592 | uint32_t prevBuffSize = 0; | |
5593 | ||
5594 | UBool finished = FALSE; | |
5595 | UBool notIsContinuation = FALSE; | |
5596 | ||
5597 | uint32_t count2 = 0, count3 = 0; | |
5598 | uint8_t leadPrimary = 0; | |
5599 | ||
5600 | for(;;) { | |
5601 | for(i=prevBuffSize; i<minBufferSize; ++i) { | |
5602 | ||
5603 | order = ucol_IGetNextCE(coll, &s, status); | |
5604 | ||
5605 | if(order == 0) { | |
5606 | continue; | |
5607 | } | |
5608 | ||
5609 | if(order == UCOL_NO_MORE_CES) { | |
5610 | finished = TRUE; | |
5611 | break; | |
5612 | } | |
5613 | ||
5614 | notIsContinuation = !isContinuation(order); | |
5615 | ||
5616 | if(notIsContinuation) { | |
5617 | tertiary = (uint8_t)((order & tertiaryMask)); | |
5618 | } else { | |
5619 | tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); | |
5620 | } | |
5621 | secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); | |
5622 | primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); | |
5623 | primary1 = (uint8_t)(order >> 8); | |
5624 | ||
5625 | /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ | |
5626 | /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */ | |
5627 | /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */ | |
5628 | /* regular and simple sortkey calc */ | |
5629 | if(primary1 != UCOL_IGNORABLE) { | |
5630 | if(notIsContinuation) { | |
5631 | if(leadPrimary == primary1) { | |
5632 | *primaries++ = primary2; | |
5633 | } else { | |
5634 | if(leadPrimary != 0) { | |
5635 | *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); | |
5636 | } | |
5637 | if(primary2 == UCOL_IGNORABLE) { | |
5638 | /* one byter, not compressed */ | |
5639 | *primaries++ = primary1; | |
5640 | leadPrimary = 0; | |
5641 | } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY || | |
374ca955 | 5642 | //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) |
b75a7d8f A |
5643 | (primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) { |
5644 | /* not compressible */ | |
5645 | leadPrimary = 0; | |
5646 | *primaries++ = primary1; | |
5647 | *primaries++ = primary2; | |
5648 | } else { /* compress */ | |
5649 | *primaries++ = leadPrimary = primary1; | |
5650 | *primaries++ = primary2; | |
5651 | } | |
5652 | } | |
5653 | } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ | |
5654 | *primaries++ = primary1; | |
5655 | if(primary2 != UCOL_IGNORABLE) { | |
5656 | *primaries++ = primary2; /* second part */ | |
5657 | } | |
5658 | } | |
5659 | } | |
5660 | ||
5661 | if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */ | |
5662 | /* This is compression code. */ | |
5663 | if (secondary == UCOL_COMMON2 && notIsContinuation) { | |
5664 | ++count2; | |
5665 | } else { | |
5666 | if (count2 > 0) { | |
5667 | if (secondary > UCOL_COMMON2) { // not necessary for 4th level. | |
5668 | while (count2 > UCOL_TOP_COUNT2) { | |
5669 | *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); | |
5670 | count2 -= (uint32_t)UCOL_TOP_COUNT2; | |
5671 | } | |
5672 | *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)); | |
5673 | } else { | |
5674 | while (count2 > UCOL_BOT_COUNT2) { | |
5675 | *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); | |
5676 | count2 -= (uint32_t)UCOL_BOT_COUNT2; | |
5677 | } | |
5678 | *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); | |
5679 | } | |
5680 | count2 = 0; | |
5681 | } | |
5682 | *secondaries++ = secondary; | |
5683 | } | |
5684 | } | |
5685 | ||
5686 | if(notIsContinuation) { | |
5687 | tertiary ^= caseSwitch; | |
5688 | } | |
5689 | ||
5690 | if(tertiary > 0) { | |
5691 | /* This is compression code. */ | |
5692 | /* sequence size check is included in the if clause */ | |
5693 | if (tertiary == tertiaryCommon && notIsContinuation) { | |
5694 | ++count3; | |
5695 | } else { | |
5696 | if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { | |
5697 | tertiary += tertiaryAddition; | |
5698 | } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { | |
5699 | tertiary -= tertiaryAddition; | |
5700 | } | |
5701 | if (count3 > 0) { | |
5702 | if ((tertiary > tertiaryCommon)) { | |
5703 | while (count3 > coll->tertiaryTopCount) { | |
5704 | *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); | |
5705 | count3 -= (uint32_t)coll->tertiaryTopCount; | |
5706 | } | |
5707 | *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1)); | |
5708 | } else { | |
5709 | while (count3 > coll->tertiaryBottomCount) { | |
5710 | *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); | |
5711 | count3 -= (uint32_t)coll->tertiaryBottomCount; | |
5712 | } | |
5713 | *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); | |
5714 | } | |
5715 | count3 = 0; | |
5716 | } | |
5717 | *tertiaries++ = tertiary; | |
5718 | } | |
5719 | } | |
5720 | ||
5721 | if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */ | |
5722 | if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */ | |
5723 | IInit_collIterate(coll, (UChar *)source, len, &s); | |
5724 | if(source == normSource) { | |
5725 | s.flags &= ~UCOL_ITER_NORM; | |
5726 | } | |
5727 | sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len); | |
5728 | *status = U_BUFFER_OVERFLOW_ERROR; | |
5729 | finished = TRUE; | |
5730 | break; | |
5731 | } else { /* It's much nicer if we can actually reallocate */ | |
5732 | int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart); | |
5733 | primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status); | |
5734 | if(U_SUCCESS(*status)) { | |
5735 | *result = primStart; | |
5736 | primarySafeEnd = primStart + resultLength - 2; | |
5737 | } else { | |
5738 | IInit_collIterate(coll, (UChar *)source, len, &s); | |
5739 | if(source == normSource) { | |
5740 | s.flags &= ~UCOL_ITER_NORM; | |
5741 | } | |
5742 | sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len); | |
5743 | finished = TRUE; | |
5744 | break; | |
5745 | } | |
5746 | } | |
5747 | } | |
5748 | } | |
5749 | if(finished) { | |
5750 | break; | |
5751 | } else { | |
5752 | prevBuffSize = minBufferSize; | |
5753 | secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status); | |
5754 | terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status); | |
5755 | minBufferSize *= 2; | |
5756 | if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size | |
5757 | IInit_collIterate(coll, (UChar *)source, len, &s); | |
5758 | if(source == normSource) { | |
5759 | s.flags &= ~UCOL_ITER_NORM; | |
5760 | } | |
5761 | sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len); | |
5762 | break; | |
5763 | } | |
5764 | } | |
5765 | } | |
5766 | ||
5767 | if(U_SUCCESS(*status)) { | |
5768 | sortKeySize += (primaries - primStart); | |
5769 | /* we have done all the CE's, now let's put them together to form a key */ | |
5770 | if (count2 > 0) { | |
5771 | while (count2 > UCOL_BOT_COUNT2) { | |
5772 | *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); | |
5773 | count2 -= (uint32_t)UCOL_BOT_COUNT2; | |
5774 | } | |
5775 | *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); | |
5776 | } | |
5777 | uint32_t secsize = secondaries-secStart; | |
5778 | sortKeySize += secsize; | |
5779 | if(sortKeySize <= resultLength) { | |
5780 | *(primaries++) = UCOL_LEVELTERMINATOR; | |
5781 | uprv_memcpy(primaries, secStart, secsize); | |
5782 | primaries += secsize; | |
5783 | } else { | |
5784 | if(allocateSKBuffer == TRUE) { | |
5785 | primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); | |
5786 | if(U_SUCCESS(*status)) { | |
5787 | *(primaries++) = UCOL_LEVELTERMINATOR; | |
5788 | *result = primStart; | |
5789 | uprv_memcpy(primaries, secStart, secsize); | |
5790 | } | |
5791 | } else { | |
5792 | *status = U_BUFFER_OVERFLOW_ERROR; | |
5793 | } | |
5794 | } | |
5795 | ||
5796 | if (count3 > 0) { | |
5797 | if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) { | |
5798 | while (count3 >= coll->tertiaryTopCount) { | |
5799 | *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); | |
5800 | count3 -= (uint32_t)coll->tertiaryTopCount; | |
5801 | } | |
5802 | *tertiaries++ = (uint8_t)(tertiaryTop - count3); | |
5803 | } else { | |
5804 | while (count3 > coll->tertiaryBottomCount) { | |
5805 | *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); | |
5806 | count3 -= (uint32_t)coll->tertiaryBottomCount; | |
5807 | } | |
5808 | *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); | |
5809 | } | |
5810 | } | |
5811 | uint32_t tersize = tertiaries - terStart; | |
5812 | sortKeySize += tersize; | |
5813 | if(sortKeySize <= resultLength) { | |
5814 | *(primaries++) = UCOL_LEVELTERMINATOR; | |
5815 | uprv_memcpy(primaries, terStart, tersize); | |
5816 | primaries += tersize; | |
5817 | } else { | |
5818 | if(allocateSKBuffer == TRUE) { | |
5819 | primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); | |
5820 | if(U_SUCCESS(*status)) { | |
5821 | *result = primStart; | |
5822 | *(primaries++) = UCOL_LEVELTERMINATOR; | |
5823 | uprv_memcpy(primaries, terStart, tersize); | |
5824 | } | |
5825 | } else { | |
5826 | *status = U_MEMORY_ALLOCATION_ERROR; | |
5827 | } | |
5828 | } | |
5829 | ||
5830 | *(primaries++) = '\0'; | |
5831 | } | |
5832 | ||
5833 | if(terStart != tert) { | |
5834 | uprv_free(terStart); | |
5835 | uprv_free(secStart); | |
5836 | } | |
5837 | ||
5838 | if(normSource != normBuffer) { | |
5839 | uprv_free(normSource); | |
5840 | } | |
5841 | ||
5842 | if(allocateSKBuffer == TRUE) { | |
5843 | *result = (uint8_t*)uprv_malloc(sortKeySize); | |
374ca955 A |
5844 | /* test for NULL */ |
5845 | if (*result == NULL) { | |
5846 | *status = U_MEMORY_ALLOCATION_ERROR; | |
5847 | return sortKeySize; | |
5848 | } | |
b75a7d8f A |
5849 | uprv_memcpy(*result, primStart, sortKeySize); |
5850 | if(primStart != prim) { | |
5851 | uprv_free(primStart); | |
5852 | } | |
5853 | } | |
5854 | ||
5855 | return sortKeySize; | |
5856 | } | |
5857 | ||
5858 | static inline | |
5859 | UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) { | |
5860 | UBool notIsContinuation = !isContinuation(CE); | |
374ca955 | 5861 | uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF); |
b75a7d8f A |
5862 | if(LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0) |
5863 | || (!notIsContinuation && *wasShifted)) | |
5864 | || (*wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */ | |
5865 | // The stuff below should probably be in the sortkey code... maybe not... | |
5866 | if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */ | |
5867 | /* we should just completely ignore it */ | |
5868 | *wasShifted = TRUE; | |
5869 | //continue; | |
5870 | } | |
5871 | //*wasShifted = TRUE; | |
5872 | return TRUE; | |
5873 | } else { | |
5874 | *wasShifted = FALSE; | |
5875 | return FALSE; | |
5876 | } | |
5877 | } | |
5878 | static inline | |
5879 | void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) { | |
5880 | if(level < maxLevel) { | |
5881 | dest[i++] = UCOL_LEVELTERMINATOR; | |
5882 | } else { | |
5883 | dest[i++] = 0; | |
5884 | } | |
5885 | } | |
5886 | ||
/**
 * Level identifiers for partial sort key generation.
 * The current level is stored in the low three bits of state[1]
 * (see UCOL_PSK_LEVEL_SHIFT / UCOL_PSK_LEVEL_MASK), so values must stay
 * in the range 0..7.
 */
enum {
    UCOL_PSK_PRIMARY = 0,
    UCOL_PSK_SECONDARY = 1,
    UCOL_PSK_CASE = 2,
    UCOL_PSK_TERTIARY = 3,
    UCOL_PSK_QUATERNARY = 4,
    UCOL_PSK_QUIN = 5,      /** This is an extra level, not used - but we have three bits to blow */
    UCOL_PSK_IDENTICAL = 6,
    UCOL_PSK_NULL = 7,      /** level for the end of sort key. Will just produce zeros */
    UCOL_PSK_LIMIT          /** one past the last level identifier */
};
5899 | ||
374ca955 A |
/**
 * Collation state layout. Each *_SHIFT value is how far to shift the state
 * word right to bring a field to the low bits; the matching *_MASK value is
 * then ANDed with the shifted word to isolate the field. This data is stored
 * in the state[1] word.
 */
enum {
    UCOL_PSK_LEVEL_SHIFT = 0,      /** level identifier. stores an enum value from above */
    UCOL_PSK_LEVEL_MASK = 7,       /** three bits */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3, /** number of bytes of primary or quaternary already written */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1,
    /** can be only 0 or 1, since we get up to two bytes from primary or quaternary
     *  This field is also used to denote that the French secondary level is finished
     */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,/** was the last value shifted */
    UCOL_PSK_WAS_SHIFTED_MASK = 1, /** can be 0 or 1 (Boolean) */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,/** how many French bytes have we already written */
    UCOL_PSK_USED_FRENCH_MASK = 3, /** up to 4 bytes. See comment just below */
    /** When we do French we need to reverse secondary values. However, continuations
     *  need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba
     */
    UCOL_PSK_USED_ELEMENTS_SHIFT = 7,     /** bits 7..16: elements already consumed (see state[1] description) */
    UCOL_PSK_USED_ELEMENTS_MASK = 0x3FF,  /** ten bits */
    UCOL_PSK_ITER_SKIP_SHIFT = 17,        /** bits 17..31: number of iterator moves to skip */
    UCOL_PSK_ITER_SKIP_MASK = 0x7FFF      /** fifteen bits */
};
5925 | ||
5926 | ||
374ca955 A |
5927 | /** main sortkey part procedure. On the first call, |
5928 | * you should pass in a collator, an iterator, empty state | |
b75a7d8f A |
5929 | * state[0] == state[1] == 0, a buffer to hold results |
5930 | * number of bytes you need and an error code pointer. | |
5931 | * Make sure your buffer is big enough to hold the wanted | |
374ca955 A |
5932 | * number of sortkey bytes. I don't check. |
5933 | * The only meaningful status you can get back is | |
5934 | * U_BUFFER_OVERFLOW_ERROR, which basically means that you | |
b75a7d8f A |
5935 | * have been dealt a raw deal and that you probably won't |
5936 | * be able to use partial sortkey generation for this | |
5937 | * particular combination of string and collator. This | |
5938 | * is highly unlikely, but you should still check the error code. | |
374ca955 A |
5939 | * Any other status means that you're not in a sane situation |
5940 | * anymore. After the first call, preserve state values and | |
b75a7d8f A |
5941 | * use them on subsequent calls to obtain more bytes of a sortkey. |
5942 | * Use until the number of bytes written is smaller than the requested | |
5943 | * number of bytes. Generated sortkey is not compatible with the | |
5944 | * one generated by ucol_getSortKey, as we don't do any compression. | |
5945 | * However, levels are still terminated by a 1 (one) and the sortkey | |
5946 | * is terminated by a 0 (zero). Identical level is the same as in the | |
374ca955 A |
5947 | * regular sortkey - internal bocu-1 implementation is used. |
5948 | * For curious, although you cannot do much about this, here is | |
b75a7d8f A |
5949 | * the structure of state words. |
5950 | * state[0] - iterator state. Depends on the iterator implementation, | |
5951 | * but allows the iterator to continue where it stopped in | |
5952 | * the last iteration. | |
5953 | * state[1] - collation processing state. Here is the distribution | |
5954 | * of the bits: | |
5955 | * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary | |
5956 | * quaternary, quin (we don't use this one), identical and | |
5957 | * null (producing only zeroes - first one to terminate the | |
5958 | * sortkey and subsequent to fill the buffer). | |
5959 | * 3 - byte count. Number of bytes written on the primary level. | |
5960 | * 4 - was shifted. Whether the previous iteration finished in the | |
5961 | * shifted state. | |
5962 | * 5, 6 - French continuation bytes written. See the comment in the enum | |
374ca955 A |
5963 | * 7..16 - Used elements. Number of CEs that were already used from the |
5964 | * expansion buffer or number of bytes from a bocu sequence on | |
b75a7d8f | 5965 | * the identical level. |
374ca955 | 5966 | * 17..31 - iterator skip. Number of move operations iterator needs to |
b75a7d8f A |
5967 | * skip from the current state in order to continue. This is used |
5968 | * only if normalization is turned on, since the normalizing iterator | |
374ca955 | 5969 | * can return undefined state, which means that it's in the middle |
b75a7d8f A |
5970 | * of normalizing sequence. |
5971 | */ | |
374ca955 | 5972 | U_CAPI int32_t U_EXPORT2 |
b75a7d8f A |
5973 | ucol_nextSortKeyPart(const UCollator *coll, |
5974 | UCharIterator *iter, | |
5975 | uint32_t state[2], | |
5976 | uint8_t *dest, int32_t count, | |
5977 | UErrorCode *status) { | |
5978 | /* error checking */ | |
5979 | if(status==NULL || U_FAILURE(*status)) { | |
5980 | return 0; | |
5981 | } | |
374ca955 | 5982 | UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART); |
b75a7d8f A |
5983 | if( coll==NULL || iter==NULL || |
5984 | state==NULL || | |
5985 | count<0 || (count>0 && dest==NULL) | |
5986 | ) { | |
5987 | *status=U_ILLEGAL_ARGUMENT_ERROR; | |
5988 | } | |
5989 | ||
374ca955 A |
5990 | UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d", |
5991 | coll, iter, state[0], state[1], dest, count); | |
b75a7d8f A |
5992 | |
5993 | if(count==0) { | |
5994 | /* nothing to do */ | |
374ca955 | 5995 | UTRACE_EXIT_VALUE(0); |
b75a7d8f A |
5996 | return 0; |
5997 | } | |
5998 | ||
5999 | /** Setting up situation according to the state we got from the previous iteration */ | |
6000 | // The state of the iterator from the previous invocation | |
6001 | uint32_t iterState = state[0]; | |
6002 | // Has the last iteration ended in the shifted state | |
6003 | UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE; | |
6004 | // What is the current level of the sortkey? | |
6005 | int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK; | |
6006 | // Have we written only one byte from a two byte primary in the previous iteration? | |
6007 | // Also on secondary level - have we finished with the French secondary? | |
374ca955 | 6008 | int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK; |
b75a7d8f A |
6009 | // number of bytes in the continuation buffer for French |
6010 | int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK; | |
6011 | // Skip the CEs that we got from an extraction | |
6012 | // and delivered in the previous call | |
6013 | int32_t usedElements = (state[1] >> UCOL_PSK_USED_ELEMENTS_SHIFT) & UCOL_PSK_USED_ELEMENTS_MASK; | |
6014 | // Number of times to skip because the iterator returned | |
374ca955 | 6015 | // UITER_NO_STATE when it was stopped in the last iteration, so we had to save the |
b75a7d8f A |
6016 | // last valid state. |
6017 | int32_t iterSkips = (state[1] >> UCOL_PSK_ITER_SKIP_SHIFT) & UCOL_PSK_ITER_SKIP_MASK; | |
6018 | ||
6019 | /** values that depend on the collator attributes */ | |
374ca955 | 6020 | // strength of the collator. |
b75a7d8f A |
6021 | int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status); |
6022 | // maximal level of the partial sortkey. Need to take whether case level is done | |
6023 | int32_t maxLevel = 0; | |
6024 | if(strength < UCOL_TERTIARY) { | |
6025 | if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { | |
6026 | maxLevel = UCOL_PSK_CASE; | |
6027 | } else { | |
6028 | maxLevel = strength; | |
6029 | } | |
6030 | } else { | |
6031 | if(strength == UCOL_TERTIARY) { | |
6032 | maxLevel = UCOL_PSK_TERTIARY; | |
6033 | } else if(strength == UCOL_QUATERNARY) { | |
6034 | maxLevel = UCOL_PSK_QUATERNARY; | |
6035 | } else { // identical | |
6036 | maxLevel = UCOL_IDENTICAL; | |
6037 | } | |
6038 | } | |
6039 | // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation | |
374ca955 | 6040 | uint8_t UCOL_HIRAGANA_QUAD = |
b75a7d8f A |
6041 | (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF; |
6042 | // Boundary value that decides whether a CE is shifted or not | |
6043 | uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0; | |
6044 | // Are we doing French collation? | |
6045 | UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON); | |
6046 | ||
6047 | /** initializing the collation state */ | |
6048 | UBool notIsContinuation = FALSE; | |
6049 | uint32_t CE = UCOL_NO_MORE_CES; | |
6050 | ||
6051 | collIterate s; | |
6052 | IInit_collIterate(coll, NULL, -1, &s); | |
6053 | s.iterator = iter; | |
6054 | s.flags |= UCOL_USE_ITERATOR; | |
6055 | // This variable tells us whether we have produced some other levels in this iteration | |
374ca955 | 6056 | // before we moved to the identical level. In that case, we need to switch the |
b75a7d8f A |
6057 | // type of the iterator. |
6058 | UBool doingIdenticalFromStart = FALSE; | |
6059 | // Normalizing iterator | |
6060 | // The division for the array length may truncate the array size to | |
6061 | // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high | |
6062 | // for all platforms anyway. | |
6063 | UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; | |
6064 | UNormIterator *normIter = NULL; | |
6065 | // If the normalization is turned on for the collator and we are below identical level | |
6066 | // we will use a FCD normalizing iterator | |
6067 | if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) { | |
6068 | normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); | |
6069 | s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status); | |
6070 | s.flags &= ~UCOL_ITER_NORM; | |
6071 | if(U_FAILURE(*status)) { | |
374ca955 | 6072 | UTRACE_EXIT_STATUS(*status); |
b75a7d8f A |
6073 | return 0; |
6074 | } | |
6075 | } else if(level == UCOL_PSK_IDENTICAL) { | |
374ca955 | 6076 | // for identical level, we need a NFD iterator. We need to instantiate it here, since we |
b75a7d8f A |
6077 | // will be updating the state - and this cannot be done on an ordinary iterator. |
6078 | normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); | |
6079 | s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); | |
6080 | s.flags &= ~UCOL_ITER_NORM; | |
6081 | if(U_FAILURE(*status)) { | |
374ca955 | 6082 | UTRACE_EXIT_STATUS(*status); |
b75a7d8f A |
6083 | return 0; |
6084 | } | |
6085 | doingIdenticalFromStart = TRUE; | |
6086 | } | |
6087 | ||
6088 | // This is the tentative new state of the iterator. The problem | |
6089 | // is that the iterator might return an undefined state, in | |
6090 | // which case we should save the last valid state and increase | |
6091 | // the iterator skip value. | |
6092 | uint32_t newState = 0; | |
6093 | ||
6094 | // First, we set the iterator to the last valid position | |
6095 | // from the last iteration. This was saved in state[0]. | |
6096 | if(iterState == 0) { | |
6097 | /* initial state */ | |
6098 | if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) { | |
6099 | s.iterator->move(s.iterator, 0, UITER_LIMIT); | |
6100 | } else { | |
6101 | s.iterator->move(s.iterator, 0, UITER_START); | |
6102 | } | |
6103 | } else { | |
6104 | /* reset to previous state */ | |
6105 | s.iterator->setState(s.iterator, iterState, status); | |
6106 | if(U_FAILURE(*status)) { | |
374ca955 | 6107 | UTRACE_EXIT_STATUS(*status); |
b75a7d8f A |
6108 | return 0; |
6109 | } | |
6110 | } | |
6111 | ||
6112 | // Then, we may have to move more, if the normalizing iterator | |
6113 | // was going through a normalizing sequence. | |
6114 | if(iterSkips) { | |
6115 | // if we are on secondary level AND we do French, we need to go backward instead of forward | |
6116 | if(level == UCOL_PSK_SECONDARY && doingFrench) { | |
6117 | s.iterator->move(s.iterator, -iterSkips, UITER_CURRENT); | |
6118 | } else { | |
6119 | s.iterator->move(s.iterator, iterSkips, UITER_CURRENT); | |
6120 | } | |
6121 | } | |
6122 | ||
6123 | ||
6124 | // Number of expansion CEs that were already consumed in the | |
6125 | // previous iteration for the last code point processed. We | |
374ca955 | 6126 | // want to clean out the expansion buffer, so that we can |
b75a7d8f A |
6127 | // get correct CEs. This value is persistent over iterations, |
6128 | // since we can have several iterations on the one expansion | |
6129 | // buffer. | |
6130 | int32_t consumedExpansionCEs = usedElements; | |
6131 | // Number of bytes already writted from a bocsu sequence. Since | |
6132 | // the longes bocsu sequence is 4 long, this can be up to 3. It | |
6133 | // shares the state field with consumedExpansionCEs value, since | |
6134 | // they cannot simultanously appear on the same level | |
6135 | int32_t bocsuBytesUsed = 0; | |
374ca955 | 6136 | // Clean out the expansion buffer unless we are on |
b75a7d8f A |
6137 | // identical level. In that case we use this field |
6138 | // to store the number of bytes already written | |
6139 | // from the previous bocsu sequence. | |
6140 | if(level < UCOL_PSK_IDENTICAL && usedElements != 0) { | |
6141 | while(usedElements-->0) { | |
374ca955 | 6142 | // If we're doing French and we are on the secondary level, |
b75a7d8f A |
6143 | // we go backwards. |
6144 | if(level == UCOL_PSK_SECONDARY && doingFrench) { | |
6145 | CE = ucol_IGetPrevCE(coll, &s, status); | |
6146 | } else { | |
6147 | CE = ucol_IGetNextCE(coll, &s, status); | |
6148 | } | |
6149 | if(CE==UCOL_NO_MORE_CES) { | |
6150 | /* should not happen */ | |
6151 | *status=U_INTERNAL_PROGRAM_ERROR; | |
374ca955 | 6152 | UTRACE_EXIT_STATUS(*status); |
b75a7d8f A |
6153 | return 0; |
6154 | } | |
6155 | } | |
6156 | } else { | |
6157 | bocsuBytesUsed = usedElements; | |
6158 | } | |
6159 | ||
6160 | // This variable prevents the adjusting of iterator | |
374ca955 | 6161 | // skip variable when we are the first time on a |
b75a7d8f A |
6162 | // level. I hope there is a better way to do it, but |
6163 | // I could not think of it. | |
6164 | UBool firstTimeOnLevel = TRUE; | |
6165 | // French secondary needs to know whether the iterator state of zero came from previous level OR | |
6166 | // from a new invocation... | |
6167 | UBool wasDoingPrimary = FALSE; | |
374ca955 | 6168 | // Case level is kind of goofy. This variable tells us that |
b75a7d8f | 6169 | // we are still not done with the case level. |
374ca955 | 6170 | UBool dontAdvanceIteratorBecauseWeNeedALevelTerminator = FALSE; |
b75a7d8f A |
6171 | // destination buffer byte counter. When this guy |
6172 | // gets to count, we're done with the iteration | |
374ca955 A |
6173 | int32_t i = 0; |
6174 | // used to count the zero bytes written after we | |
b75a7d8f A |
6175 | // have finished with the sort key |
6176 | int32_t j = 0; | |
6177 | ||
6178 | ||
6179 | // Hm.... I think we're ready to plunge in. Basic story is as following: | |
6180 | // we have a fall through case based on level. This is used for initial | |
6181 | // positioning on iteration start. Every level processor contains a | |
6182 | // for(;;) which will be broken when we exhaust all the CEs. Other | |
6183 | // way to exit is a goto saveState, which happens when we have filled | |
6184 | // out our buffer. | |
6185 | switch(level) { | |
374ca955 | 6186 | case UCOL_PSK_PRIMARY: |
b75a7d8f A |
6187 | wasDoingPrimary = TRUE; |
6188 | for(;;) { | |
6189 | if(i==count) { | |
6190 | goto saveState; | |
6191 | } | |
6192 | // We should save the state only if we | |
6193 | // are sure that we are done with the | |
6194 | // previous iterator state | |
6195 | if(consumedExpansionCEs == 0 && byteCountOrFrenchDone == 0) { | |
6196 | newState = s.iterator->getState(s.iterator); | |
6197 | if(newState != UITER_NO_STATE) { | |
6198 | iterState = newState; | |
6199 | iterSkips = 0; | |
6200 | } else { | |
6201 | if(!firstTimeOnLevel && !byteCountOrFrenchDone) { | |
6202 | iterSkips++; | |
6203 | } | |
6204 | } | |
6205 | } | |
6206 | firstTimeOnLevel = FALSE; | |
6207 | CE = ucol_IGetNextCE(coll, &s, status); | |
6208 | if(CE==UCOL_NO_MORE_CES) { | |
6209 | // Add the level separator | |
6210 | terminatePSKLevel(level, maxLevel, i, dest); | |
6211 | byteCountOrFrenchDone=0; | |
6212 | // Restart the iteration an move to the | |
6213 | // second level | |
6214 | s.iterator->move(s.iterator, 0, UITER_START); | |
6215 | level = UCOL_PSK_SECONDARY; | |
6216 | break; | |
6217 | } | |
6218 | if(!isShiftedCE(CE, LVT, &wasShifted)) { | |
6219 | CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */ | |
6220 | if(CE != 0) { | |
6221 | if(byteCountOrFrenchDone == 0) { | |
6222 | // get the second byte of primary | |
6223 | dest[i++]=(uint8_t)(CE >> 8); | |
6224 | } else { | |
6225 | byteCountOrFrenchDone = 0; | |
6226 | } | |
6227 | if((CE &=0xff)!=0) { | |
6228 | if(i==count) { | |
6229 | /* overflow */ | |
6230 | byteCountOrFrenchDone=1; | |
6231 | goto saveState; | |
6232 | } | |
6233 | dest[i++]=(uint8_t)CE; | |
6234 | } | |
6235 | } | |
6236 | } | |
374ca955 | 6237 | if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) { |
b75a7d8f A |
6238 | // s.pos != NULL means there is a normalization buffer in effect |
6239 | // in iterative case, this means that we are doing Thai (maybe discontiguos) | |
6240 | consumedExpansionCEs++; | |
6241 | } else { | |
6242 | consumedExpansionCEs = 0; | |
6243 | } | |
374ca955 | 6244 | if(s.pos && *s.pos == 0) { |
b75a7d8f A |
6245 | // maybe it is the end of Thai - we have to have |
6246 | // an extra skip | |
6247 | iterSkips++; | |
6248 | } | |
6249 | } | |
6250 | /* fall through to next level */ | |
6251 | case UCOL_PSK_SECONDARY: | |
6252 | if(strength >= UCOL_SECONDARY) { | |
6253 | if(!doingFrench) { | |
6254 | for(;;) { | |
6255 | if(i == count) { | |
6256 | goto saveState; | |
6257 | } | |
6258 | // We should save the state only if we | |
6259 | // are sure that we are done with the | |
6260 | // previous iterator state | |
6261 | if(consumedExpansionCEs == 0) { | |
6262 | newState = s.iterator->getState(s.iterator); | |
6263 | if(newState != UITER_NO_STATE) { | |
6264 | iterState = newState; | |
6265 | iterSkips = 0; | |
6266 | } else { | |
6267 | if(!firstTimeOnLevel) { | |
6268 | iterSkips++; | |
6269 | } | |
6270 | } | |
6271 | } | |
6272 | firstTimeOnLevel = FALSE; | |
6273 | CE = ucol_IGetNextCE(coll, &s, status); | |
6274 | if(CE==UCOL_NO_MORE_CES) { | |
6275 | // Add the level separator | |
6276 | terminatePSKLevel(level, maxLevel, i, dest); | |
6277 | byteCountOrFrenchDone=0; | |
6278 | // Restart the iteration an move to the | |
6279 | // second level | |
374ca955 | 6280 | s.iterator->move(s.iterator, 0, UITER_START); |
b75a7d8f A |
6281 | level = UCOL_PSK_CASE; |
6282 | break; | |
6283 | } | |
6284 | if(!isShiftedCE(CE, LVT, &wasShifted)) { | |
6285 | CE >>= 8; /* get secondary */ | |
6286 | if(CE != 0) { | |
6287 | dest[i++]=(uint8_t)CE; | |
6288 | } | |
6289 | } | |
6290 | if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) { | |
6291 | consumedExpansionCEs++; | |
6292 | } else { | |
6293 | consumedExpansionCEs = 0; | |
6294 | } | |
374ca955 | 6295 | if(s.pos && *s.pos == 0) { |
b75a7d8f A |
6296 | iterSkips++; |
6297 | } | |
6298 | } | |
6299 | } else { // French secondary processing | |
6300 | uint8_t frenchBuff[UCOL_MAX_BUFFER]; | |
6301 | int32_t frenchIndex = 0; | |
6302 | // Here we are going backwards. | |
374ca955 A |
6303 | // If the iterator is at the beggining, it should be |
6304 | // moved to end. | |
b75a7d8f A |
6305 | if(wasDoingPrimary) { |
6306 | s.iterator->move(s.iterator, 0, UITER_LIMIT); | |
6307 | } | |
6308 | for(;;) { | |
6309 | if(i == count) { | |
6310 | goto saveState; | |
6311 | } | |
6312 | if(consumedExpansionCEs == 0) { | |
6313 | newState = s.iterator->getState(s.iterator); | |
6314 | if(newState != UITER_NO_STATE) { | |
6315 | iterState = newState; | |
6316 | iterSkips = 0; | |
374ca955 | 6317 | } else { |
b75a7d8f A |
6318 | if(!firstTimeOnLevel) { |
6319 | iterSkips++; | |
6320 | } | |
6321 | } | |
6322 | } | |
6323 | firstTimeOnLevel = FALSE; | |
6324 | CE = ucol_IGetPrevCE(coll, &s, status); | |
6325 | if(CE==UCOL_NO_MORE_CES) { | |
6326 | // Add the level separator | |
6327 | terminatePSKLevel(level, maxLevel, i, dest); | |
6328 | byteCountOrFrenchDone=0; | |
6329 | // Restart the iteration an move to the next level | |
6330 | s.iterator->move(s.iterator, 0, UITER_START); | |
6331 | level = UCOL_PSK_CASE; | |
6332 | break; | |
6333 | } | |
374ca955 | 6334 | if(isContinuation(CE)) { // if it's a continuation, we want to save it and |
b75a7d8f A |
6335 | // reverse when we get a first non-continuation CE. |
6336 | CE >>= 8; | |
6337 | frenchBuff[frenchIndex++] = (uint8_t)CE; | |
374ca955 | 6338 | } else if(!isShiftedCE(CE, LVT, &wasShifted)) { |
b75a7d8f A |
6339 | CE >>= 8; /* get secondary */ |
6340 | if(!frenchIndex) { | |
6341 | if(CE != 0) { | |
6342 | dest[i++]=(uint8_t)CE; | |
6343 | } | |
6344 | } else { | |
6345 | frenchBuff[frenchIndex++] = (uint8_t)CE; | |
6346 | frenchIndex -= usedFrench; | |
6347 | usedFrench = 0; | |
6348 | while(i < count && frenchIndex) { | |
6349 | dest[i++] = frenchBuff[--frenchIndex]; | |
6350 | usedFrench++; | |
6351 | } | |
6352 | } | |
6353 | } | |
6354 | if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) { | |
6355 | consumedExpansionCEs++; | |
6356 | } else { | |
6357 | consumedExpansionCEs = 0; | |
6358 | } | |
6359 | if(s.pos && *s.pos == 0) { | |
6360 | iterSkips++; | |
6361 | } | |
6362 | } | |
6363 | } | |
6364 | } else { | |
6365 | level = UCOL_PSK_CASE; | |
6366 | } | |
6367 | /* fall through to next level */ | |
6368 | case UCOL_PSK_CASE: | |
6369 | if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { | |
6370 | uint32_t caseShift = UCOL_CASE_SHIFT_START; | |
6371 | uint8_t caseByte = UCOL_CASE_BYTE_START; | |
6372 | uint8_t caseBits = 0; | |
6373 | ||
6374 | for(;;) { | |
6375 | if(i == count) { | |
6376 | goto saveState; | |
6377 | } | |
6378 | // We should save the state only if we | |
6379 | // are sure that we are done with the | |
6380 | // previous iterator state | |
6381 | if(consumedExpansionCEs == 0) { | |
6382 | newState = s.iterator->getState(s.iterator); | |
6383 | if(newState != UITER_NO_STATE) { | |
6384 | iterState = newState; | |
6385 | iterSkips = 0; | |
6386 | } else { | |
6387 | if(!firstTimeOnLevel) { | |
6388 | iterSkips++; | |
6389 | } | |
6390 | } | |
6391 | } | |
6392 | firstTimeOnLevel = FALSE; | |
6393 | CE = ucol_IGetNextCE(coll, &s, status); | |
6394 | if(CE==UCOL_NO_MORE_CES) { | |
6395 | // On the case level we might have an unfinished | |
6396 | // case byte. Add one if it's started. | |
6397 | if(caseShift != UCOL_CASE_SHIFT_START) { | |
6398 | dest[i++] = caseByte; | |
6399 | } | |
6400 | // This is kind of tricky - situation where | |
374ca955 | 6401 | // we need to keep the iterator in the old |
b75a7d8f A |
6402 | // state, but don't need to bring anything |
6403 | // to the next invocation | |
6404 | if(i < count) { | |
6405 | // Add the level separator | |
6406 | terminatePSKLevel(level, maxLevel, i, dest); | |
6407 | // Restart the iteration and move to the | |
6408 | // next level | |
6409 | s.iterator->move(s.iterator, 0, UITER_START); | |
6410 | level = UCOL_PSK_TERTIARY; | |
6411 | } else { | |
6412 | dontAdvanceIteratorBecauseWeNeedALevelTerminator = TRUE; | |
6413 | } | |
6414 | break; | |
6415 | } | |
6416 | ||
6417 | if(!isShiftedCE(CE, LVT, &wasShifted)) { | |
6418 | if(!isContinuation(CE)) { | |
6419 | CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); | |
6420 | caseBits = (uint8_t)(CE & 0xC0); | |
374ca955 | 6421 | // this copies the case level logic from the |
b75a7d8f A |
6422 | // sort key generation code |
6423 | if(CE != 0) { | |
6424 | if(coll->caseFirst == UCOL_UPPER_FIRST) { | |
6425 | if((caseBits & 0xC0) == 0) { | |
6426 | caseByte |= 1 << (--caseShift); | |
6427 | } else { | |
6428 | caseByte |= 0 << (--caseShift); | |
6429 | /* second bit */ | |
6430 | if(caseShift == 0) { | |
6431 | dest[i++] = caseByte; | |
6432 | caseShift = UCOL_CASE_SHIFT_START; | |
6433 | caseByte = UCOL_CASE_BYTE_START; | |
6434 | } | |
6435 | caseByte |= ((caseBits>>6)&1) << (--caseShift); | |
6436 | } | |
6437 | } else { | |
6438 | if((caseBits & 0xC0) == 0) { | |
6439 | caseByte |= 0 << (--caseShift); | |
6440 | } else { | |
6441 | caseByte |= 1 << (--caseShift); | |
6442 | /* second bit */ | |
6443 | if(caseShift == 0) { | |
6444 | dest[i++] = caseByte; | |
6445 | caseShift = UCOL_CASE_SHIFT_START; | |
6446 | caseByte = UCOL_CASE_BYTE_START; | |
6447 | } | |
6448 | caseByte |= ((caseBits>>7)&1) << (--caseShift); | |
6449 | } | |
6450 | } | |
6451 | } | |
6452 | ||
6453 | } | |
6454 | } | |
6455 | // Not sure this is correct for the case level - revisit | |
6456 | if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) { | |
6457 | consumedExpansionCEs++; | |
6458 | } else { | |
6459 | consumedExpansionCEs = 0; | |
6460 | } | |
6461 | if(s.pos && *s.pos == 0) { | |
6462 | iterSkips++; | |
6463 | } | |
6464 | } | |
6465 | } else { | |
6466 | level = UCOL_PSK_TERTIARY; | |
6467 | } | |
6468 | /* fall through to next level */ | |
6469 | case UCOL_PSK_TERTIARY: | |
6470 | if(strength >= UCOL_TERTIARY) { | |
6471 | for(;;) { | |
6472 | if(i == count) { | |
6473 | goto saveState; | |
6474 | } | |
6475 | // We should save the state only if we | |
6476 | // are sure that we are done with the | |
6477 | // previous iterator state | |
6478 | if(consumedExpansionCEs == 0) { | |
6479 | newState = s.iterator->getState(s.iterator); | |
6480 | if(newState != UITER_NO_STATE) { | |
6481 | iterState = newState; | |
6482 | iterSkips = 0; | |
6483 | } else { | |
6484 | if(!firstTimeOnLevel) { | |
6485 | iterSkips++; | |
6486 | } | |
6487 | } | |
6488 | } | |
6489 | firstTimeOnLevel = FALSE; | |
6490 | CE = ucol_IGetNextCE(coll, &s, status); | |
6491 | if(CE==UCOL_NO_MORE_CES) { | |
6492 | // Add the level separator | |
6493 | terminatePSKLevel(level, maxLevel, i, dest); | |
6494 | byteCountOrFrenchDone=0; | |
6495 | // Restart the iteration an move to the | |
6496 | // second level | |
6497 | s.iterator->move(s.iterator, 0, UITER_START); | |
6498 | level = UCOL_PSK_QUATERNARY; | |
6499 | break; | |
6500 | } | |
6501 | if(!isShiftedCE(CE, LVT, &wasShifted)) { | |
6502 | notIsContinuation = !isContinuation(CE); | |
6503 | ||
6504 | if(notIsContinuation) { | |
6505 | CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); | |
6506 | CE ^= coll->caseSwitch; | |
6507 | CE &= coll->tertiaryMask; | |
6508 | } else { | |
6509 | CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); | |
6510 | } | |
6511 | ||
6512 | if(CE != 0) { | |
6513 | dest[i++]=(uint8_t)CE; | |
6514 | } | |
6515 | } | |
6516 | if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) { | |
6517 | consumedExpansionCEs++; | |
6518 | } else { | |
6519 | consumedExpansionCEs = 0; | |
6520 | } | |
6521 | if(s.pos && *s.pos == 0) { | |
6522 | iterSkips++; | |
6523 | } | |
6524 | } | |
6525 | } else { | |
6526 | // if we're not doing tertiary | |
6527 | // skip to the end | |
6528 | level = UCOL_PSK_NULL; | |
6529 | } | |
6530 | /* fall through to next level */ | |
6531 | case UCOL_PSK_QUATERNARY: | |
6532 | if(strength >= UCOL_QUATERNARY) { | |
6533 | for(;;) { | |
6534 | if(i == count) { | |
6535 | goto saveState; | |
6536 | } | |
6537 | // We should save the state only if we | |
6538 | // are sure that we are done with the | |
6539 | // previous iterator state | |
6540 | if(consumedExpansionCEs == 0) { | |
6541 | newState = s.iterator->getState(s.iterator); | |
6542 | if(newState != UITER_NO_STATE) { | |
6543 | iterState = newState; | |
6544 | iterSkips = 0; | |
6545 | } else { | |
6546 | if(!firstTimeOnLevel) { | |
6547 | iterSkips++; | |
6548 | } | |
6549 | } | |
6550 | } | |
6551 | firstTimeOnLevel = FALSE; | |
6552 | CE = ucol_IGetNextCE(coll, &s, status); | |
6553 | if(CE==UCOL_NO_MORE_CES) { | |
6554 | // Add the level separator | |
6555 | terminatePSKLevel(level, maxLevel, i, dest); | |
374ca955 | 6556 | //dest[i++] = UCOL_LEVELTERMINATOR; |
b75a7d8f A |
6557 | byteCountOrFrenchDone=0; |
6558 | // Restart the iteration an move to the | |
6559 | // second level | |
6560 | s.iterator->move(s.iterator, 0, UITER_START); | |
6561 | level = UCOL_PSK_QUIN; | |
6562 | break; | |
6563 | } | |
6564 | if(isShiftedCE(CE, LVT, &wasShifted)) { | |
6565 | CE >>= 16; /* get primary */ | |
6566 | if(CE != 0) { | |
6567 | if(byteCountOrFrenchDone == 0) { | |
6568 | dest[i++]=(uint8_t)(CE >> 8); | |
6569 | } else { | |
6570 | byteCountOrFrenchDone = 0; | |
6571 | } | |
6572 | if((CE &=0xff)!=0) { | |
6573 | if(i==count) { | |
6574 | /* overflow */ | |
6575 | byteCountOrFrenchDone=1; | |
6576 | goto saveState; | |
6577 | } | |
6578 | dest[i++]=(uint8_t)CE; | |
6579 | } | |
6580 | } | |
6581 | } else { | |
6582 | notIsContinuation = !isContinuation(CE); | |
6583 | if(notIsContinuation) { | |
6584 | if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it | |
6585 | dest[i++] = UCOL_HIRAGANA_QUAD; | |
6586 | } else { | |
6587 | dest[i++] = 0xFF; | |
6588 | } | |
6589 | } | |
6590 | } | |
6591 | if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) { | |
6592 | consumedExpansionCEs++; | |
6593 | } else { | |
6594 | consumedExpansionCEs = 0; | |
6595 | } | |
6596 | if(s.pos && *s.pos == 0) { | |
6597 | iterSkips++; | |
6598 | } | |
6599 | } | |
6600 | } else { | |
6601 | // if we're not doing quaternary | |
6602 | // skip to the end | |
6603 | level = UCOL_PSK_NULL; | |
6604 | } | |
6605 | /* fall through to next level */ | |
6606 | case UCOL_PSK_QUIN: | |
6607 | level = UCOL_PSK_IDENTICAL; | |
6608 | /* fall through to next level */ | |
6609 | case UCOL_PSK_IDENTICAL: | |
6610 | if(strength >= UCOL_IDENTICAL) { | |
6611 | UChar32 first, second; | |
6612 | int32_t bocsuBytesWritten = 0; | |
374ca955 | 6613 | // We always need to do identical on |
b75a7d8f A |
6614 | // the NFD form of the string. |
6615 | if(normIter == NULL) { | |
6616 | // we arrived from the level below and | |
6617 | // normalization was not turned on. | |
6618 | // therefore, we need to make a fresh NFD iterator | |
6619 | normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); | |
6620 | s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); | |
374ca955 | 6621 | } else if(!doingIdenticalFromStart) { |
b75a7d8f | 6622 | // there is an iterator, but we did some other levels. |
374ca955 A |
6623 | // therefore, we have a FCD iterator - need to make |
6624 | // a NFD one. | |
b75a7d8f A |
6625 | // normIter being at the beginning does not guarantee |
6626 | // that the underlying iterator is at the beginning | |
6627 | iter->move(iter, 0, UITER_START); | |
6628 | s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); | |
6629 | } | |
6630 | // At this point we have a NFD iterator that is positioned | |
6631 | // in the right place | |
6632 | if(U_FAILURE(*status)) { | |
374ca955 | 6633 | UTRACE_EXIT_STATUS(*status); |
b75a7d8f A |
6634 | return 0; |
6635 | } | |
6636 | first = uiter_previous32(s.iterator); | |
6637 | // maybe we're at the start of the string | |
6638 | if(first == U_SENTINEL) { | |
6639 | first = 0; | |
6640 | } else { | |
6641 | uiter_next32(s.iterator); | |
6642 | } | |
6643 | ||
6644 | j = 0; | |
6645 | for(;;) { | |
6646 | if(i == count) { | |
6647 | if(j+1 < bocsuBytesWritten) { | |
6648 | bocsuBytesUsed = j+1; | |
6649 | } | |
6650 | goto saveState; | |
6651 | } | |
6652 | ||
374ca955 | 6653 | // On identical level, we will always save |
b75a7d8f A |
6654 | // the state if we reach this point, since |
6655 | // we don't depend on getNextCE for content | |
6656 | // all the content is in our buffer and we | |
6657 | // already either stored the full buffer OR | |
6658 | // otherwise we won't arrive here. | |
6659 | newState = s.iterator->getState(s.iterator); | |
6660 | if(newState != UITER_NO_STATE) { | |
6661 | iterState = newState; | |
6662 | iterSkips = 0; | |
6663 | } else { | |
6664 | iterSkips++; | |
6665 | } | |
6666 | ||
6667 | uint8_t buff[4]; | |
6668 | second = uiter_next32(s.iterator); | |
6669 | ||
6670 | // end condition for identical level | |
6671 | if(second == U_SENTINEL) { | |
6672 | terminatePSKLevel(level, maxLevel, i, dest); | |
6673 | level = UCOL_PSK_NULL; | |
6674 | break; | |
6675 | } | |
6676 | bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff); | |
6677 | first = second; | |
6678 | ||
6679 | j = 0; | |
6680 | if(bocsuBytesUsed != 0) { | |
6681 | while(bocsuBytesUsed-->0) { | |
6682 | j++; | |
6683 | } | |
6684 | } | |
6685 | ||
6686 | while(i < count && j < bocsuBytesWritten) { | |
6687 | dest[i++] = buff[j++]; | |
6688 | } | |
6689 | } | |
6690 | ||
6691 | } else { | |
6692 | level = UCOL_PSK_NULL; | |
6693 | } | |
6694 | /* fall through to next level */ | |
6695 | case UCOL_PSK_NULL: | |
6696 | j = i; | |
6697 | while(j<count) { | |
6698 | dest[j++]=0; | |
6699 | } | |
6700 | break; | |
6701 | default: | |
6702 | *status = U_INTERNAL_PROGRAM_ERROR; | |
374ca955 | 6703 | UTRACE_EXIT_STATUS(*status); |
b75a7d8f A |
6704 | return 0; |
6705 | } | |
6706 | ||
6707 | saveState: | |
6708 | // Now we need to return stuff. First we want to see whether we have | |
6709 | // done everything for the current state of iterator. | |
374ca955 | 6710 | if(consumedExpansionCEs || byteCountOrFrenchDone |
b75a7d8f | 6711 | || dontAdvanceIteratorBecauseWeNeedALevelTerminator) { |
374ca955 A |
6712 | // Any of above mean that the previous transaction |
6713 | // wasn't finished and that we should store the | |
b75a7d8f A |
6714 | // previous iterator state. |
6715 | state[0] = iterState; | |
6716 | } else { | |
374ca955 | 6717 | // The transaction is complete. We will continue in |
b75a7d8f A |
6718 | // next iteration. |
6719 | if((newState = s.iterator->getState(s.iterator))!= UITER_NO_STATE) { | |
6720 | state[0] = s.iterator->getState(s.iterator); | |
6721 | iterSkips = 0; | |
6722 | } else { | |
6723 | state[0] = iterState; | |
6724 | iterSkips++; | |
6725 | } | |
6726 | } | |
6727 | // Store the number of elements processed. On CE levels, this is | |
6728 | // the number of expansion CEs processed. On identical level, this | |
6729 | // is the number of bocsu bytes written. | |
6730 | if(level < UCOL_PSK_IDENTICAL) { | |
6731 | if((consumedExpansionCEs & UCOL_PSK_USED_ELEMENTS_MASK) != consumedExpansionCEs) { | |
6732 | *status = U_INDEX_OUTOFBOUNDS_ERROR; | |
6733 | } | |
6734 | state[1] = (consumedExpansionCEs & UCOL_PSK_USED_ELEMENTS_MASK) << UCOL_PSK_USED_ELEMENTS_SHIFT; | |
6735 | } else { | |
6736 | if((bocsuBytesUsed & UCOL_PSK_USED_ELEMENTS_MASK) != bocsuBytesUsed) { | |
6737 | *status = U_INDEX_OUTOFBOUNDS_ERROR; | |
6738 | } | |
6739 | state[1] = (bocsuBytesUsed & UCOL_PSK_USED_ELEMENTS_MASK) << UCOL_PSK_USED_ELEMENTS_SHIFT; | |
6740 | } | |
6741 | ||
6742 | // Next we put in the level of comparison | |
374ca955 | 6743 | state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT); |
b75a7d8f A |
6744 | |
6745 | // If we are doing French, we need to store whether we have just finished the French level | |
6746 | if(level == UCOL_PSK_SECONDARY && doingFrench) { | |
6747 | state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); | |
6748 | } else { | |
6749 | state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); | |
6750 | } | |
6751 | ||
6752 | // Was the latest CE shifted | |
6753 | if(wasShifted) { | |
6754 | state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT; | |
6755 | } | |
6756 | // Check for iterSkips overflow | |
6757 | if((iterSkips & UCOL_PSK_ITER_SKIP_MASK) != iterSkips) { | |
6758 | *status = U_INDEX_OUTOFBOUNDS_ERROR; | |
6759 | } | |
6760 | // Store iterSkips | |
6761 | state[1] |= ((iterSkips & UCOL_PSK_ITER_SKIP_MASK) << UCOL_PSK_ITER_SKIP_SHIFT); | |
6762 | ||
6763 | // Check for French overflow | |
6764 | if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) { | |
6765 | *status = U_INDEX_OUTOFBOUNDS_ERROR; | |
6766 | } | |
6767 | // Store number of bytes written in the French secondary continuation sequence | |
6768 | state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT); | |
6769 | ||
6770 | ||
6771 | // If we have used normalizing iterator, get rid of it | |
6772 | if(normIter != NULL) { | |
6773 | unorm_closeIter(normIter); | |
6774 | } | |
6775 | ||
6776 | // Return number of meaningful sortkey bytes. | |
374ca955 A |
6777 | UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", |
6778 | dest,i, state[0], state[1]); | |
6779 | UTRACE_EXIT_VALUE(i); | |
b75a7d8f A |
6780 | return i; |
6781 | } | |
6782 | ||
6783 | /** | |
6784 | * Produce a bound for a given sortkey and a number of levels. | |
6785 | */ | |
374ca955 | 6786 | U_CAPI int32_t U_EXPORT2 |
b75a7d8f A |
6787 | ucol_getBound(const uint8_t *source, |
6788 | int32_t sourceLength, | |
6789 | UColBoundMode boundType, | |
6790 | uint32_t noOfLevels, | |
6791 | uint8_t *result, | |
6792 | int32_t resultLength, | |
6793 | UErrorCode *status) { | |
374ca955 | 6794 | // consistency checks |
b75a7d8f A |
6795 | if(status == NULL || U_FAILURE(*status)) { |
6796 | return 0; | |
6797 | } | |
6798 | if(source == NULL) { | |
6799 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
6800 | return 0; | |
6801 | } | |
6802 | ||
6803 | int32_t sourceIndex = 0; | |
6804 | // Scan the string until we skip enough of the key OR reach the end of the key | |
6805 | do { | |
6806 | sourceIndex++; | |
6807 | if(source[sourceIndex] == UCOL_LEVELTERMINATOR) { | |
6808 | noOfLevels--; | |
6809 | } | |
374ca955 | 6810 | } while (noOfLevels > 0 |
b75a7d8f A |
6811 | && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); |
6812 | ||
6813 | if((source[sourceIndex] == 0 || sourceIndex == sourceLength) | |
6814 | && noOfLevels > 0) { | |
6815 | *status = U_SORT_KEY_TOO_SHORT_WARNING; | |
6816 | } | |
6817 | ||
6818 | ||
6819 | // READ ME: this code assumes that the values for boundType | |
6820 | // enum will not changes. They are set so that the enum value | |
374ca955 | 6821 | // corresponds to the number of extra bytes each bound type |
b75a7d8f A |
6822 | // needs. |
6823 | if(result != NULL && resultLength >= sourceIndex+boundType) { | |
6824 | uprv_memcpy(result, source, sourceIndex); | |
6825 | switch(boundType) { | |
6826 | // Lower bound just gets terminated. No extra bytes | |
6827 | case UCOL_BOUND_LOWER: // = 0 | |
6828 | break; | |
6829 | // Upper bound needs one extra byte | |
6830 | case UCOL_BOUND_UPPER: // = 1 | |
6831 | result[sourceIndex++] = 2; | |
6832 | break; | |
6833 | // Upper long bound needs two extra bytes | |
6834 | case UCOL_BOUND_UPPER_LONG: // = 2 | |
6835 | result[sourceIndex++] = 0xFF; | |
6836 | result[sourceIndex++] = 0xFF; | |
6837 | break; | |
6838 | default: | |
6839 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
6840 | return 0; | |
6841 | } | |
6842 | result[sourceIndex++] = 0; | |
6843 | ||
6844 | return sourceIndex; | |
6845 | } else { | |
374ca955 | 6846 | return sourceIndex+boundType+1; |
b75a7d8f A |
6847 | } |
6848 | } | |
6849 | ||
6850 | static | |
6851 | inline void uprv_appendByteToHexString(char *dst, uint8_t val) { | |
6852 | uint32_t len = (uint32_t)uprv_strlen(dst); | |
6853 | *(dst+len) = T_CString_itosOffset((val >> 4)); | |
6854 | *(dst+len+1) = T_CString_itosOffset((val & 0xF)); | |
6855 | *(dst+len+2) = 0; | |
6856 | } | |
6857 | ||
6858 | /* this function makes a string with representation of a sortkey */ | |
6859 | U_CAPI char* U_EXPORT2 ucol_sortKeyToString(const UCollator *coll, const uint8_t *sortkey, char *buffer, uint32_t *len) { | |
6860 | int32_t strength = UCOL_PRIMARY; | |
6861 | uint32_t res_size = 0; | |
6862 | UBool doneCase = FALSE; | |
6863 | ||
6864 | char *current = buffer; | |
6865 | const uint8_t *currentSk = sortkey; | |
6866 | ||
6867 | uprv_strcpy(current, "["); | |
6868 | ||
6869 | while(strength <= UCOL_QUATERNARY && strength <= coll->strength) { | |
6870 | if(strength > UCOL_PRIMARY) { | |
374ca955 | 6871 | uprv_strcat(current, " . "); |
b75a7d8f A |
6872 | } |
6873 | while(*currentSk != 0x01 && *currentSk != 0x00) { /* print a level */ | |
6874 | uprv_appendByteToHexString(current, *currentSk++); | |
6875 | uprv_strcat(current, " "); | |
6876 | } | |
6877 | if(coll->caseLevel == UCOL_ON && strength == UCOL_SECONDARY && doneCase == FALSE) { | |
6878 | doneCase = TRUE; | |
6879 | } else if(coll->caseLevel == UCOL_OFF || doneCase == TRUE || strength != UCOL_SECONDARY) { | |
6880 | strength ++; | |
6881 | } | |
6882 | uprv_appendByteToHexString(current, *currentSk++); /* This should print '01' */ | |
6883 | if(strength == UCOL_QUATERNARY && coll->alternateHandling == UCOL_NON_IGNORABLE) { | |
6884 | break; | |
6885 | } | |
6886 | } | |
6887 | ||
6888 | if(coll->strength == UCOL_IDENTICAL) { | |
6889 | uprv_strcat(current, " . "); | |
6890 | while(*currentSk != 0) { | |
6891 | uprv_appendByteToHexString(current, *currentSk++); | |
6892 | uprv_strcat(current, " "); | |
6893 | } | |
6894 | ||
6895 | uprv_appendByteToHexString(current, *currentSk++); | |
6896 | } | |
6897 | uprv_strcat(current, "]"); | |
6898 | ||
6899 | if(res_size > *len) { | |
6900 | return NULL; | |
6901 | } | |
6902 | ||
6903 | return buffer; | |
6904 | } | |
6905 | ||
6906 | ||
6907 | /****************************************************************************/ | |
6908 | /* Following are the functions that deal with the properties of a collator */ | |
6909 | /* there are new APIs and some compatibility APIs */ | |
6910 | /****************************************************************************/ | |
6911 | ||
6912 | static inline void | |
6913 | ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE, | |
6914 | int32_t *primShift, int32_t *secShift, int32_t *terShift) { | |
6915 | uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0; | |
6916 | UBool reverseSecondary = FALSE; | |
6917 | if(!isContinuation(CE)) { | |
6918 | tertiary = (uint8_t)((CE & coll->tertiaryMask)); | |
6919 | tertiary ^= coll->caseSwitch; | |
6920 | reverseSecondary = TRUE; | |
6921 | } else { | |
6922 | tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); | |
6923 | tertiary &= UCOL_REMOVE_CASE; | |
6924 | reverseSecondary = FALSE; | |
6925 | } | |
6926 | ||
6927 | secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); | |
6928 | primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); | |
6929 | primary1 = (uint8_t)(CE >> 8); | |
6930 | ||
6931 | if(primary1 != 0) { | |
6932 | coll->latinOneCEs[ch] |= (primary1 << *primShift); | |
6933 | *primShift -= 8; | |
6934 | } | |
6935 | if(primary2 != 0) { | |
6936 | if(*primShift < 0) { | |
6937 | coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; | |
6938 | coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; | |
6939 | coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; | |
6940 | return; | |
6941 | } | |
6942 | coll->latinOneCEs[ch] |= (primary2 << *primShift); | |
6943 | *primShift -= 8; | |
6944 | } | |
6945 | if(secondary != 0) { | |
6946 | if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary | |
6947 | coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary | |
6948 | coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24); | |
374ca955 | 6949 | } else { // normal case |
b75a7d8f A |
6950 | coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift); |
6951 | } | |
6952 | *secShift -= 8; | |
6953 | } | |
6954 | if(tertiary != 0) { | |
6955 | coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift); | |
6956 | *terShift -= 8; | |
6957 | } | |
6958 | } | |
6959 | ||
6960 | static inline UBool | |
6961 | ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) { | |
6962 | uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3); | |
6963 | if(newTable == NULL) { | |
6964 | *status = U_MEMORY_ALLOCATION_ERROR; | |
6965 | coll->latinOneFailed = TRUE; | |
6966 | return FALSE; | |
6967 | } | |
6968 | int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t); | |
6969 | uprv_memset(newTable, 0, size*sizeof(uint32_t)*3); | |
6970 | uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy); | |
6971 | uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy); | |
6972 | uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy); | |
6973 | coll->latinOneTableLen = size; | |
6974 | uprv_free(coll->latinOneCEs); | |
6975 | coll->latinOneCEs = newTable; | |
6976 | return TRUE; | |
6977 | } | |
6978 | ||
6979 | static UBool | |
6980 | ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) { | |
6981 | UBool result = TRUE; | |
6982 | if(coll->latinOneCEs == NULL) { | |
6983 | coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3); | |
6984 | if(coll->latinOneCEs == NULL) { | |
6985 | *status = U_MEMORY_ALLOCATION_ERROR; | |
6986 | return FALSE; | |
6987 | } | |
6988 | coll->latinOneTableLen = UCOL_LATINONETABLELEN; | |
6989 | } | |
6990 | UChar ch = 0; | |
6991 | UCollationElements *it = ucol_openElements(coll, &ch, 1, status); | |
6992 | uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3); | |
6993 | ||
6994 | int32_t primShift = 24, secShift = 24, terShift = 24; | |
6995 | uint32_t CE = 0; | |
6996 | int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1; | |
6997 | ||
6998 | // TODO: make safe if you get more than you wanted... | |
6999 | for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) { | |
7000 | primShift = 24; secShift = 24; terShift = 24; | |
7001 | if(ch < 0x100) { | |
7002 | CE = coll->latinOneMapping[ch]; | |
7003 | } else { | |
7004 | CE = UTRIE_GET32_FROM_LEAD(coll->mapping, ch); | |
374ca955 A |
7005 | if(CE == UCOL_NOT_FOUND && coll->UCA) { |
7006 | CE = UTRIE_GET32_FROM_LEAD(coll->UCA->mapping, ch); | |
b75a7d8f A |
7007 | } |
7008 | } | |
7009 | if(CE < UCOL_NOT_FOUND) { | |
7010 | ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); | |
7011 | } else { | |
7012 | switch (getCETag(CE)) { | |
7013 | case EXPANSION_TAG: | |
374ca955 | 7014 | case DIGIT_TAG: |
b75a7d8f | 7015 | ucol_setText(it, &ch, 1, status); |
374ca955 | 7016 | while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) { |
b75a7d8f A |
7017 | if(primShift < 0 || secShift < 0 || terShift < 0) { |
7018 | coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; | |
7019 | coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; | |
7020 | coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; | |
7021 | break; | |
7022 | } | |
7023 | ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); | |
7024 | } | |
7025 | break; | |
7026 | case CONTRACTION_TAG: | |
7027 | // here is the trick | |
7028 | // F2 is contraction. We do something very similar to contractions | |
7029 | // but have two indices, one in the real contraction table and the | |
7030 | // other to where we stuffed things. This hopes that we don't have | |
7031 | // many contractions (this should work for latin-1 tables). | |
7032 | { | |
7033 | if((CE & 0x00FFF000) != 0) { | |
7034 | *status = U_UNSUPPORTED_ERROR; | |
374ca955 | 7035 | coll->latinOneFailed = TRUE; |
b75a7d8f A |
7036 | return FALSE; |
7037 | } | |
7038 | ||
7039 | const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE); | |
7040 | ||
7041 | CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table | |
374ca955 | 7042 | |
b75a7d8f A |
7043 | coll->latinOneCEs[ch] = CE; |
7044 | coll->latinOneCEs[coll->latinOneTableLen+ch] = CE; | |
7045 | coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE; | |
7046 | ||
7047 | // We're going to jump into contraction table, pick the elements | |
7048 | // and use them | |
7049 | do { | |
7050 | CE = *(coll->contractionCEs + | |
7051 | (UCharOffset - coll->contractionIndex)); | |
374ca955 | 7052 | if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) { |
b75a7d8f A |
7053 | uint32_t size; |
7054 | uint32_t i; /* general counter */ | |
7055 | uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ | |
7056 | size = getExpansionCount(CE); | |
7057 | //CE = *CEOffset++; | |
7058 | if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ | |
7059 | for(i = 0; i<size; i++) { | |
7060 | if(primShift < 0 || secShift < 0 || terShift < 0) { | |
7061 | coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; | |
7062 | coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; | |
7063 | coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; | |
7064 | break; | |
7065 | } | |
7066 | ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); | |
7067 | } | |
7068 | } else { /* else, we do */ | |
7069 | while(*CEOffset != 0) { | |
7070 | if(primShift < 0 || secShift < 0 || terShift < 0) { | |
7071 | coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; | |
7072 | coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; | |
7073 | coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; | |
7074 | break; | |
7075 | } | |
7076 | ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); | |
7077 | } | |
7078 | } | |
7079 | contractionOffset++; | |
7080 | } else if(CE < UCOL_NOT_FOUND) { | |
7081 | ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift); | |
7082 | } else { | |
7083 | coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; | |
7084 | coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; | |
7085 | coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; | |
7086 | contractionOffset++; | |
374ca955 | 7087 | } |
b75a7d8f A |
7088 | UCharOffset++; |
7089 | primShift = 24; secShift = 24; terShift = 24; | |
7090 | if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate | |
7091 | if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) { | |
374ca955 | 7092 | coll->latinOneFailed = TRUE; |
b75a7d8f A |
7093 | return FALSE; |
7094 | } | |
7095 | } | |
7096 | } while(*UCharOffset != 0xFFFF); | |
7097 | } | |
7098 | break; | |
7099 | default: | |
7100 | coll->latinOneFailed = TRUE; | |
7101 | result = FALSE; | |
7102 | break; | |
7103 | } | |
7104 | } | |
7105 | } | |
7106 | ucol_closeElements(it); | |
7107 | // compact table | |
7108 | if(contractionOffset < coll->latinOneTableLen) { | |
7109 | if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) { | |
374ca955 A |
7110 | coll->latinOneFailed = TRUE; |
7111 | return FALSE; | |
b75a7d8f A |
7112 | } |
7113 | } | |
7114 | return result; | |
7115 | } | |
7116 | ||
7117 | void ucol_updateInternalState(UCollator *coll, UErrorCode *status) { | |
7118 | if(U_SUCCESS(*status)) { | |
7119 | if(coll->caseFirst == UCOL_UPPER_FIRST) { | |
7120 | coll->caseSwitch = UCOL_CASE_SWITCH; | |
7121 | } else { | |
7122 | coll->caseSwitch = UCOL_NO_CASE_SWITCH; | |
7123 | } | |
7124 | ||
7125 | if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) { | |
7126 | coll->tertiaryMask = UCOL_REMOVE_CASE; | |
7127 | coll->tertiaryCommon = UCOL_COMMON3_NORMAL; | |
7128 | coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_OFF; | |
7129 | coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF; | |
7130 | coll->tertiaryBottom = UCOL_COMMON_BOT3; | |
7131 | } else { | |
7132 | coll->tertiaryMask = UCOL_KEEP_CASE; | |
7133 | coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON; | |
7134 | if(coll->caseFirst == UCOL_UPPER_FIRST) { | |
7135 | coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST; | |
7136 | coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER; | |
7137 | coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER; | |
7138 | } else { | |
7139 | coll->tertiaryCommon = UCOL_COMMON3_NORMAL; | |
7140 | coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER; | |
7141 | coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER; | |
7142 | } | |
7143 | } | |
7144 | ||
7145 | /* Set the compression values */ | |
7146 | uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1); | |
7147 | coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */ | |
7148 | coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount); | |
7149 | ||
7150 | if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY | |
7151 | && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) { | |
7152 | coll->sortKeyGen = ucol_calcSortKeySimpleTertiary; | |
7153 | } else { | |
7154 | coll->sortKeyGen = ucol_calcSortKey; | |
7155 | } | |
374ca955 | 7156 | if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF |
b75a7d8f A |
7157 | && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) { |
7158 | if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) { | |
7159 | if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it | |
7160 | //fprintf(stderr, "F"); | |
7161 | coll->latinOneUse = TRUE; | |
7162 | } else { | |
7163 | coll->latinOneUse = FALSE; | |
7164 | } | |
374ca955 A |
7165 | if(*status == U_UNSUPPORTED_ERROR) { |
7166 | *status = U_ZERO_ERROR; | |
7167 | } | |
b75a7d8f A |
7168 | } else { // latin1Table exists and it doesn't need to be regenerated, just use it |
7169 | coll->latinOneUse = TRUE; | |
7170 | } | |
7171 | } else { | |
7172 | coll->latinOneUse = FALSE; | |
374ca955 | 7173 | } |
b75a7d8f A |
7174 | } |
7175 | ||
7176 | } | |
7177 | ||
7178 | U_CAPI uint32_t U_EXPORT2 | |
7179 | ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) { | |
7180 | if(U_FAILURE(*status) || coll == NULL) { | |
7181 | return 0; | |
7182 | } | |
7183 | if(len == -1) { | |
7184 | len = u_strlen(varTop); | |
7185 | } | |
7186 | if(len == 0) { | |
7187 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
7188 | return 0; | |
7189 | } | |
7190 | ||
7191 | collIterate s; | |
7192 | IInit_collIterate(coll, varTop, len, &s); | |
7193 | ||
7194 | uint32_t CE = ucol_IGetNextCE(coll, &s, status); | |
7195 | ||
7196 | /* here we check if we have consumed all characters */ | |
7197 | /* you can put in either one character or a contraction */ | |
7198 | /* you shouldn't put more... */ | |
7199 | if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) { | |
7200 | *status = U_CE_NOT_FOUND_ERROR; | |
7201 | return 0; | |
7202 | } | |
7203 | ||
7204 | uint32_t nextCE = ucol_IGetNextCE(coll, &s, status); | |
7205 | ||
7206 | if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) { | |
7207 | *status = U_PRIMARY_TOO_LONG_ERROR; | |
7208 | return 0; | |
7209 | } | |
374ca955 A |
7210 | if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) { |
7211 | coll->variableTopValueisDefault = FALSE; | |
7212 | coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16; | |
7213 | } | |
b75a7d8f A |
7214 | |
7215 | return CE & UCOL_PRIMARYMASK; | |
7216 | } | |
7217 | ||
7218 | U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) { | |
7219 | if(U_FAILURE(*status) || coll == NULL) { | |
7220 | return 0; | |
7221 | } | |
7222 | return coll->variableTopValue<<16; | |
7223 | } | |
7224 | ||
7225 | U_CAPI void U_EXPORT2 | |
7226 | ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) { | |
7227 | if(U_FAILURE(*status) || coll == NULL) { | |
7228 | return; | |
7229 | } | |
374ca955 A |
7230 | |
7231 | if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) { | |
7232 | coll->variableTopValueisDefault = FALSE; | |
7233 | coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16; | |
7234 | } | |
b75a7d8f A |
7235 | } |
7236 | /* Attribute setter API */ | |
7237 | U_CAPI void U_EXPORT2 | |
7238 | ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) { | |
7239 | if(U_FAILURE(*status) || coll == NULL) { | |
7240 | return; | |
7241 | } | |
7242 | UColAttributeValue oldFrench = coll->frenchCollation; | |
7243 | UColAttributeValue oldCaseFirst = coll->caseFirst; | |
7244 | switch(attr) { | |
7245 | case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */ | |
7246 | if(value == UCOL_ON) { | |
7247 | coll->numericCollation = UCOL_ON; | |
7248 | coll->numericCollationisDefault = FALSE; | |
7249 | } else if (value == UCOL_OFF) { | |
7250 | coll->numericCollation = UCOL_OFF; | |
7251 | coll->numericCollationisDefault = FALSE; | |
7252 | } else if (value == UCOL_DEFAULT) { | |
7253 | coll->numericCollationisDefault = TRUE; | |
7254 | coll->numericCollation = (UColAttributeValue)coll->options->numericCollation; | |
7255 | } else { | |
7256 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
7257 | } | |
7258 | break; | |
7259 | case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */ | |
7260 | if(value == UCOL_ON) { | |
7261 | coll->hiraganaQ = UCOL_ON; | |
7262 | coll->hiraganaQisDefault = FALSE; | |
7263 | } else if (value == UCOL_OFF) { | |
7264 | coll->hiraganaQ = UCOL_OFF; | |
7265 | coll->hiraganaQisDefault = FALSE; | |
7266 | } else if (value == UCOL_DEFAULT) { | |
7267 | coll->hiraganaQisDefault = TRUE; | |
7268 | coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ; | |
7269 | } else { | |
7270 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
7271 | } | |
7272 | break; | |
7273 | case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ | |
7274 | if(value == UCOL_ON) { | |
7275 | coll->frenchCollation = UCOL_ON; | |
7276 | coll->frenchCollationisDefault = FALSE; | |
7277 | } else if (value == UCOL_OFF) { | |
7278 | coll->frenchCollation = UCOL_OFF; | |
7279 | coll->frenchCollationisDefault = FALSE; | |
7280 | } else if (value == UCOL_DEFAULT) { | |
7281 | coll->frenchCollationisDefault = TRUE; | |
7282 | coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation; | |
7283 | } else { | |
7284 | *status = U_ILLEGAL_ARGUMENT_ERROR ; | |
7285 | } | |
7286 | break; | |
7287 | case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ | |
7288 | if(value == UCOL_SHIFTED) { | |
7289 | coll->alternateHandling = UCOL_SHIFTED; | |
7290 | coll->alternateHandlingisDefault = FALSE; | |
7291 | } else if (value == UCOL_NON_IGNORABLE) { | |
7292 | coll->alternateHandling = UCOL_NON_IGNORABLE; | |
7293 | coll->alternateHandlingisDefault = FALSE; | |
7294 | } else if (value == UCOL_DEFAULT) { | |
7295 | coll->alternateHandlingisDefault = TRUE; | |
7296 | coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ; | |
7297 | } else { | |
7298 | *status = U_ILLEGAL_ARGUMENT_ERROR ; | |
7299 | } | |
7300 | break; | |
7301 | case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ | |
7302 | if(value == UCOL_LOWER_FIRST) { | |
7303 | coll->caseFirst = UCOL_LOWER_FIRST; | |
7304 | coll->caseFirstisDefault = FALSE; | |
7305 | } else if (value == UCOL_UPPER_FIRST) { | |
7306 | coll->caseFirst = UCOL_UPPER_FIRST; | |
7307 | coll->caseFirstisDefault = FALSE; | |
7308 | } else if (value == UCOL_OFF) { | |
7309 | coll->caseFirst = UCOL_OFF; | |
7310 | coll->caseFirstisDefault = FALSE; | |
7311 | } else if (value == UCOL_DEFAULT) { | |
7312 | coll->caseFirst = (UColAttributeValue)coll->options->caseFirst; | |
7313 | coll->caseFirstisDefault = TRUE; | |
7314 | } else { | |
7315 | *status = U_ILLEGAL_ARGUMENT_ERROR ; | |
7316 | } | |
7317 | break; | |
7318 | case UCOL_CASE_LEVEL: /* do we have an extra case level */ | |
7319 | if(value == UCOL_ON) { | |
7320 | coll->caseLevel = UCOL_ON; | |
7321 | coll->caseLevelisDefault = FALSE; | |
7322 | } else if (value == UCOL_OFF) { | |
7323 | coll->caseLevel = UCOL_OFF; | |
7324 | coll->caseLevelisDefault = FALSE; | |
7325 | } else if (value == UCOL_DEFAULT) { | |
7326 | coll->caseLevel = (UColAttributeValue)coll->options->caseLevel; | |
7327 | coll->caseLevelisDefault = TRUE; | |
7328 | } else { | |
7329 | *status = U_ILLEGAL_ARGUMENT_ERROR ; | |
7330 | } | |
7331 | break; | |
7332 | case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ | |
7333 | if(value == UCOL_ON) { | |
7334 | coll->normalizationMode = UCOL_ON; | |
7335 | coll->normalizationModeisDefault = FALSE; | |
7336 | } else if (value == UCOL_OFF) { | |
7337 | coll->normalizationMode = UCOL_OFF; | |
7338 | coll->normalizationModeisDefault = FALSE; | |
7339 | } else if (value == UCOL_DEFAULT) { | |
7340 | coll->normalizationModeisDefault = TRUE; | |
7341 | coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode; | |
7342 | } else { | |
7343 | *status = U_ILLEGAL_ARGUMENT_ERROR ; | |
7344 | } | |
7345 | break; | |
7346 | case UCOL_STRENGTH: /* attribute for strength */ | |
7347 | if (value == UCOL_DEFAULT) { | |
7348 | coll->strengthisDefault = TRUE; | |
7349 | coll->strength = (UColAttributeValue)coll->options->strength; | |
7350 | } else if (value <= UCOL_IDENTICAL) { | |
7351 | coll->strengthisDefault = FALSE; | |
7352 | coll->strength = value; | |
7353 | } else { | |
7354 | *status = U_ILLEGAL_ARGUMENT_ERROR ; | |
7355 | } | |
7356 | break; | |
7357 | case UCOL_ATTRIBUTE_COUNT: | |
7358 | default: | |
7359 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
7360 | break; | |
7361 | } | |
7362 | if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) { | |
7363 | coll->latinOneRegenTable = TRUE; | |
374ca955 | 7364 | } else { |
b75a7d8f A |
7365 | coll->latinOneRegenTable = FALSE; |
7366 | } | |
7367 | ucol_updateInternalState(coll, status); | |
7368 | } | |
7369 | ||
7370 | U_CAPI UColAttributeValue U_EXPORT2 | |
7371 | ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) { | |
7372 | if(U_FAILURE(*status) || coll == NULL) { | |
7373 | return UCOL_DEFAULT; | |
7374 | } | |
7375 | switch(attr) { | |
7376 | case UCOL_NUMERIC_COLLATION: | |
374ca955 | 7377 | return coll->numericCollation; |
b75a7d8f A |
7378 | case UCOL_HIRAGANA_QUATERNARY_MODE: |
7379 | return coll->hiraganaQ; | |
7380 | case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ | |
7381 | return coll->frenchCollation; | |
7382 | case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ | |
7383 | return coll->alternateHandling; | |
7384 | case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ | |
7385 | return coll->caseFirst; | |
7386 | case UCOL_CASE_LEVEL: /* do we have an extra case level */ | |
7387 | return coll->caseLevel; | |
7388 | case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ | |
7389 | return coll->normalizationMode; | |
7390 | case UCOL_STRENGTH: /* attribute for strength */ | |
7391 | return coll->strength; | |
7392 | case UCOL_ATTRIBUTE_COUNT: | |
7393 | default: | |
7394 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
7395 | break; | |
7396 | } | |
7397 | return UCOL_DEFAULT; | |
7398 | } | |
7399 | ||
7400 | U_CAPI void U_EXPORT2 | |
7401 | ucol_setStrength( UCollator *coll, | |
7402 | UCollationStrength strength) | |
7403 | { | |
7404 | UErrorCode status = U_ZERO_ERROR; | |
7405 | ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status); | |
7406 | } | |
7407 | ||
7408 | U_CAPI UCollationStrength U_EXPORT2 | |
7409 | ucol_getStrength(const UCollator *coll) | |
7410 | { | |
7411 | UErrorCode status = U_ZERO_ERROR; | |
7412 | return ucol_getAttribute(coll, UCOL_STRENGTH, &status); | |
7413 | } | |
7414 | ||
7415 | /****************************************************************************/ | |
7416 | /* Following are misc functions */ | |
7417 | /* there are new APIs and some compatibility APIs */ | |
7418 | /****************************************************************************/ | |
7419 | ||
7420 | U_CAPI UCollator* U_EXPORT2 | |
7421 | ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status) | |
7422 | { | |
7423 | UCollator * localCollator; | |
7424 | int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator); | |
7425 | char *stackBufferChars = (char *)stackBuffer; | |
7426 | ||
7427 | if (status == NULL || U_FAILURE(*status)){ | |
7428 | return 0; | |
7429 | } | |
7430 | if ((stackBuffer && !pBufferSize) || !coll){ | |
7431 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
7432 | return 0; | |
7433 | } | |
7434 | /* Pointers on 64-bit platforms need to be aligned | |
7435 | * on a 64-bit boundry in memory. | |
7436 | */ | |
7437 | if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { | |
7438 | int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars); | |
7439 | *pBufferSize -= offsetUp; | |
7440 | stackBufferChars += offsetUp; | |
7441 | } | |
7442 | stackBuffer = (void *)stackBufferChars; | |
7443 | ||
7444 | if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */ | |
7445 | *pBufferSize = bufferSizeNeeded; | |
7446 | return 0; | |
7447 | } | |
7448 | if (!stackBuffer || *pBufferSize < bufferSizeNeeded) { | |
7449 | /* allocate one here...*/ | |
7450 | int32_t length; | |
7451 | const UChar * rules = ucol_getRules(coll, &length); | |
7452 | ||
7453 | localCollator = ucol_openRules(rules, | |
7454 | length, | |
7455 | ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status), | |
7456 | ucol_getStrength(coll), | |
7457 | NULL, | |
7458 | status); | |
7459 | if (U_SUCCESS(*status)) | |
7460 | { | |
7461 | *status = U_SAFECLONE_ALLOCATED_WARNING; | |
7462 | } | |
7463 | } else { | |
7464 | localCollator = (UCollator *)stackBuffer; | |
374ca955 | 7465 | uprv_memcpy(localCollator, coll, sizeof(UCollator)); |
b75a7d8f | 7466 | localCollator->freeOnClose = FALSE; |
374ca955 A |
7467 | localCollator->requestedLocale = NULL; // zero copies of pointers |
7468 | localCollator->validLocale = NULL; | |
b75a7d8f A |
7469 | } |
7470 | return localCollator; | |
7471 | } | |
7472 | ||
7473 | U_CAPI int32_t U_EXPORT2 | |
7474 | ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen) { | |
7475 | UErrorCode status = U_ZERO_ERROR; | |
7476 | int32_t len = 0; | |
7477 | int32_t UCAlen = 0; | |
7478 | const UChar* ucaRules = 0; | |
7479 | const UChar *rules = ucol_getRules(coll, &len); | |
7480 | if(delta == UCOL_FULL_RULES) { | |
7481 | /* take the UCA rules and append real rules at the end */ | |
7482 | /* UCA rules will be probably coming from the root RB */ | |
7483 | ucaRules = ures_getStringByKey(coll->rb,"%%UCARULES",&UCAlen,&status); | |
374ca955 A |
7484 | /* |
7485 | UResourceBundle* cresb = ures_getByKeyWithFallback(coll->rb, "collations", NULL, &status); | |
7486 | UResourceBundle* uca = ures_getByKeyWithFallback(cresb, "UCA", NULL, &status); | |
7487 | ucaRules = ures_getStringByKey(uca,"Sequence",&UCAlen,&status); | |
7488 | ures_close(uca); | |
7489 | ures_close(cresb); | |
7490 | */ | |
b75a7d8f A |
7491 | } |
7492 | if(U_FAILURE(status)) { | |
7493 | return 0; | |
7494 | } | |
7495 | if(buffer!=0 && bufferLen>0){ | |
7496 | *buffer=0; | |
7497 | if(UCAlen > 0) { | |
7498 | u_memcpy(buffer, ucaRules, uprv_min(UCAlen, bufferLen)); | |
7499 | } | |
7500 | if(len > 0 && bufferLen > UCAlen) { | |
7501 | u_memcpy(buffer+UCAlen, rules, uprv_min(len, bufferLen-UCAlen)); | |
7502 | } | |
7503 | } | |
7504 | return u_terminateUChars(buffer, bufferLen, len+UCAlen, &status); | |
7505 | } | |
7506 | ||
7507 | static const UChar _NUL = 0; | |
7508 | ||
7509 | U_CAPI const UChar* U_EXPORT2 | |
7510 | ucol_getRules( const UCollator *coll, | |
7511 | int32_t *length) | |
7512 | { | |
7513 | if(coll->rules != NULL) { | |
7514 | *length = coll->rulesLength; | |
7515 | return coll->rules; | |
7516 | } else { | |
7517 | UErrorCode status = U_ZERO_ERROR; | |
374ca955 | 7518 | if(coll->elements != NULL) { |
b75a7d8f A |
7519 | if(U_SUCCESS(status)) { |
7520 | /*Semantic const */ | |
374ca955 | 7521 | ((UCollator *)coll)->rules = ures_getStringByKey(coll->elements, "Sequence", length, &status); |
b75a7d8f A |
7522 | ((UCollator *)coll)->rulesLength = *length; |
7523 | ((UCollator *)coll)->freeRulesOnClose = FALSE; | |
b75a7d8f A |
7524 | return coll->rules; |
7525 | } | |
7526 | } | |
7527 | *length = 0; | |
7528 | return &_NUL; | |
7529 | } | |
7530 | } | |
7531 | ||
7532 | U_CAPI int32_t U_EXPORT2 | |
7533 | ucol_getDisplayName( const char *objLoc, | |
7534 | const char *dispLoc, | |
7535 | UChar *result, | |
7536 | int32_t resultLength, | |
7537 | UErrorCode *status) | |
7538 | { | |
7539 | ||
7540 | if(U_FAILURE(*status)) return -1; | |
7541 | UnicodeString dst; | |
7542 | if(!(result==NULL && resultLength==0)) { | |
7543 | // NULL destination for pure preflighting: empty dummy string | |
7544 | // otherwise, alias the destination buffer | |
7545 | dst.setTo(result, 0, resultLength); | |
7546 | } | |
7547 | Collator::getDisplayName(Locale(objLoc), Locale(dispLoc), dst); | |
7548 | return dst.extract(result, resultLength, *status); | |
7549 | } | |
7550 | ||
7551 | U_CAPI const char* U_EXPORT2 | |
7552 | ucol_getAvailable(int32_t index) | |
7553 | { | |
7554 | return uloc_getAvailable(index); | |
7555 | } | |
7556 | ||
7557 | U_CAPI int32_t U_EXPORT2 | |
7558 | ucol_countAvailable() | |
7559 | { | |
7560 | return uloc_countAvailable(); | |
7561 | } | |
7562 | ||
374ca955 A |
7563 | #if !UCONFIG_NO_SERVICE |
7564 | U_CAPI UEnumeration* U_EXPORT2 | |
7565 | ucol_openAvailableLocales(UErrorCode *status) { | |
7566 | // This is a wrapper over Collator::getAvailableLocales() | |
7567 | if (U_FAILURE(*status)) { | |
7568 | return NULL; | |
7569 | } | |
7570 | StringEnumeration *s = Collator::getAvailableLocales(); | |
7571 | if (s == NULL) { | |
7572 | *status = U_MEMORY_ALLOCATION_ERROR; | |
7573 | return NULL; | |
7574 | } | |
7575 | return uenum_openStringEnumeration(s, status); | |
7576 | } | |
7577 | #endif | |
7578 | ||
// Note: KEYWORDS[0] != RESOURCE_NAME - alan
// (the resource is named "collations", plural, while the locale keyword is
// "collation", singular - the two strings must not be mixed up)

// Name of the resource that is passed to the ures_ lookups below.
static const char* RESOURCE_NAME = "collations";

// Keywords returned by ucol_getKeywords(); ucol_getKeywordValues() accepts
// only KEYWORDS[0].
static const char* KEYWORDS[] = { "collation" };

// Number of entries in KEYWORDS.
#define KEYWORD_COUNT (sizeof(KEYWORDS)/sizeof(KEYWORDS[0]))
7586 | ||
7587 | U_CAPI UEnumeration* U_EXPORT2 | |
7588 | ucol_getKeywords(UErrorCode *status) { | |
7589 | UEnumeration *result = NULL; | |
7590 | if (U_SUCCESS(*status)) { | |
7591 | return uenum_openCharStringsEnumeration(KEYWORDS, KEYWORD_COUNT, status); | |
7592 | } | |
7593 | return result; | |
7594 | } | |
7595 | ||
7596 | U_CAPI UEnumeration* U_EXPORT2 | |
7597 | ucol_getKeywordValues(const char *keyword, UErrorCode *status) { | |
7598 | // hard-coded to accept exactly one collation keyword | |
7599 | // modify if additional collation keyword is added later | |
7600 | if (U_SUCCESS(*status) && | |
7601 | keyword==NULL || uprv_strcmp(keyword, KEYWORDS[0])!=0) { | |
7602 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
7603 | return NULL; | |
7604 | } | |
7605 | return ures_getKeywordValues(U_ICUDATA_COLL, RESOURCE_NAME, status); | |
7606 | } | |
7607 | ||
7608 | U_CAPI int32_t U_EXPORT2 | |
7609 | ucol_getFunctionalEquivalent(char* result, int32_t resultCapacity, | |
7610 | const char* keyword, const char* locale, | |
7611 | UBool* isAvailable, UErrorCode* status) { | |
7612 | // N.B.: Resource name is "collations" but keyword is "collation" | |
7613 | return ures_getFunctionalEquivalent(result, resultCapacity, U_ICUDATA_COLL, | |
7614 | "collations", keyword, locale, | |
7615 | isAvailable, TRUE, status); | |
7616 | } | |
7617 | ||
b75a7d8f A |
7618 | U_CAPI void U_EXPORT2 |
7619 | ucol_getVersion(const UCollator* coll, | |
7620 | UVersionInfo versionInfo) | |
7621 | { | |
7622 | /* RunTime version */ | |
7623 | uint8_t rtVersion = UCOL_RUNTIME_VERSION; | |
7624 | /* Builder version*/ | |
7625 | uint8_t bdVersion = coll->image->version[0]; | |
7626 | ||
7627 | /* Charset Version. Need to get the version from cnv files | |
7628 | * makeconv should populate cnv files with version and | |
7629 | * an api has to be provided in ucnv.h to obtain this version | |
7630 | */ | |
7631 | uint8_t csVersion = 0; | |
7632 | ||
7633 | /* combine the version info */ | |
7634 | uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion)); | |
7635 | ||
7636 | /* Tailoring rules */ | |
7637 | versionInfo[0] = (uint8_t)(cmbVersion>>8); | |
7638 | versionInfo[1] = (uint8_t)cmbVersion; | |
7639 | versionInfo[2] = coll->image->version[1]; | |
374ca955 A |
7640 | if(coll->UCA) { |
7641 | versionInfo[3] = coll->UCA->image->UCAVersion[0]; | |
7642 | } else { | |
7643 | versionInfo[3] = 0; | |
7644 | } | |
b75a7d8f A |
7645 | } |
7646 | ||
7647 | ||
7648 | /* This internal API checks whether a character is tailored or not */ | |
7649 | U_CAPI UBool U_EXPORT2 | |
7650 | ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) { | |
7651 | uint32_t CE = UCOL_NOT_FOUND; | |
7652 | const UChar *ContractionStart = NULL; | |
7653 | if(U_SUCCESS(*status) && coll != NULL) { | |
374ca955 | 7654 | if(coll == coll->UCA) { |
b75a7d8f A |
7655 | return FALSE; |
7656 | } else if(u < 0x100) { /* latin-1 */ | |
7657 | CE = coll->latinOneMapping[u]; | |
374ca955 | 7658 | if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) { |
b75a7d8f A |
7659 | return FALSE; |
7660 | } | |
7661 | } else { /* regular */ | |
7662 | /*CE = ucmpe32_get(coll->mapping, u);*/ | |
7663 | CE = UTRIE_GET32_FROM_LEAD(coll->mapping, u); | |
7664 | ||
7665 | } | |
7666 | ||
7667 | if(isContraction(CE)) { | |
7668 | ContractionStart = (UChar *)coll->image+getContractOffset(CE); | |
7669 | CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex)); | |
7670 | } | |
7671 | ||
7672 | if(CE == UCOL_NOT_FOUND) { | |
7673 | return FALSE; | |
7674 | } else { | |
7675 | return TRUE; | |
7676 | } | |
7677 | } else { | |
7678 | return FALSE; | |
7679 | } | |
7680 | } | |
7681 | ||
7682 | ||
7683 | /****************************************************************************/ | |
7684 | /* Following are the string compare functions */ | |
7685 | /* */ | |
7686 | /****************************************************************************/ | |
7687 | ||
7688 | ||
7689 | /* ucol_checkIdent internal function. Does byte level string compare. */ | |
7690 | /* Used by strcoll if strength == identical and strings */ | |
7691 | /* are otherwise equal. Moved out-of-line because this */ | |
7692 | /* is a rare case. */ | |
7693 | /* */ | |
7694 | /* Comparison must be done on NFD normalized strings. */ | |
7695 | /* FCD is not good enough. */ | |
7696 | /* */ | |
7697 | /* TODO: make an incremental NFD Comparison function, which could */ | |
7698 | /* be of general use */ | |
7699 | ||
7700 | static | |
7701 | UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status) | |
7702 | { | |
7703 | ||
374ca955 | 7704 | // TODO: When we have an UChar iterator, we need to access the whole string. One |
b75a7d8f A |
7705 | // useful modification would be a UChar iterator extract API, since reset next next... |
7706 | // is not optimal. | |
7707 | // TODO: Handle long strings. Do the same in compareUsingSortKeys. | |
7708 | ||
7709 | // When we arrive here, we can have normal strings or UCharIterators. Currently they are both | |
374ca955 | 7710 | // of same type, but that doesn't really mean that it will stay that way. |
b75a7d8f A |
7711 | |
7712 | // The division for the array length may truncate the array size to | |
7713 | // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high | |
7714 | // for all platforms anyway. | |
7715 | UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; | |
7716 | UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; | |
7717 | //UChar sStackBuf[256], tStackBuf[256]; | |
7718 | //int32_t sBufSize = 256, tBufSize = 256; | |
7719 | int32_t comparison; | |
7720 | int32_t sLen = 0; | |
7721 | UChar *sBuf = NULL; | |
7722 | int32_t tLen = 0; | |
7723 | UChar *tBuf = NULL; | |
7724 | UBool freeSBuf = FALSE, freeTBuf = FALSE; | |
7725 | ||
7726 | if (sColl->flags & UCOL_USE_ITERATOR) { | |
7727 | UNormIterator *sNIt = NULL, *tNIt = NULL; | |
7728 | sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); | |
7729 | tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); | |
7730 | sColl->iterator->move(sColl->iterator, 0, UITER_START); | |
7731 | tColl->iterator->move(tColl->iterator, 0, UITER_START); | |
7732 | UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status); | |
7733 | UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status); | |
7734 | comparison = u_strCompareIter(sIt, tIt, TRUE); | |
7735 | unorm_closeIter(sNIt); | |
7736 | unorm_closeIter(tNIt); | |
7737 | } else { | |
7738 | sLen = (sColl->flags & UCOL_ITER_HASLEN) ? sColl->endp - sColl->string : -1; | |
7739 | sBuf = sColl->string; | |
7740 | tLen = (tColl->flags & UCOL_ITER_HASLEN) ? tColl->endp - tColl->string : -1; | |
7741 | tBuf = tColl->string; | |
7742 | ||
7743 | if (normalize) { | |
7744 | *status = U_ZERO_ERROR; | |
7745 | if (unorm_quickCheck(sBuf, sLen, UNORM_NFD, status) != UNORM_YES) { | |
7746 | sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize, | |
7747 | sBuf, sLen, | |
7748 | FALSE, 0, | |
7749 | status); | |
7750 | if(*status == U_BUFFER_OVERFLOW_ERROR) { | |
7751 | if(!u_growBufferFromStatic(sColl->stackWritableBuffer, | |
7752 | &sColl->writableBuffer, | |
7753 | (int32_t *)&sColl->writableBufSize, sLen, | |
7754 | 0) | |
7755 | ) { | |
7756 | *status = U_MEMORY_ALLOCATION_ERROR; | |
7757 | return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */ | |
7758 | } | |
7759 | *status = U_ZERO_ERROR; | |
7760 | sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize, | |
7761 | sBuf, sLen, | |
7762 | FALSE, 0, | |
7763 | status); | |
7764 | } | |
7765 | if(freeSBuf) { | |
7766 | uprv_free(sBuf); | |
7767 | freeSBuf = FALSE; | |
7768 | } | |
7769 | sBuf = sColl->writableBuffer; | |
7770 | if (sBuf != sColl->stackWritableBuffer) { | |
7771 | sColl->flags |= UCOL_ITER_ALLOCATED; | |
7772 | } | |
7773 | } | |
7774 | ||
7775 | *status = U_ZERO_ERROR; | |
7776 | if (unorm_quickCheck(tBuf, tLen, UNORM_NFD, status) != UNORM_YES) { | |
7777 | tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize, | |
7778 | tBuf, tLen, | |
7779 | FALSE, 0, | |
7780 | status); | |
7781 | if(*status == U_BUFFER_OVERFLOW_ERROR) { | |
7782 | if(!u_growBufferFromStatic(tColl->stackWritableBuffer, | |
7783 | &tColl->writableBuffer, | |
7784 | (int32_t *)&tColl->writableBufSize, tLen, | |
7785 | 0) | |
7786 | ) { | |
7787 | *status = U_MEMORY_ALLOCATION_ERROR; | |
7788 | return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */ | |
7789 | } | |
7790 | *status = U_ZERO_ERROR; | |
7791 | tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize, | |
7792 | tBuf, tLen, | |
7793 | FALSE, 0, | |
7794 | status); | |
7795 | } | |
7796 | if(freeTBuf) { | |
7797 | uprv_free(tBuf); | |
7798 | freeTBuf = FALSE; | |
7799 | } | |
7800 | tBuf = tColl->writableBuffer; | |
7801 | if (tBuf != tColl->stackWritableBuffer) { | |
7802 | tColl->flags |= UCOL_ITER_ALLOCATED; | |
7803 | } | |
7804 | } | |
7805 | } | |
7806 | ||
7807 | if (sLen == -1 && tLen == -1) { | |
7808 | comparison = u_strcmpCodePointOrder(sBuf, tBuf); | |
7809 | } else { | |
7810 | if (sLen == -1) { | |
7811 | sLen = u_strlen(sBuf); | |
7812 | } | |
7813 | if (tLen == -1) { | |
7814 | tLen = u_strlen(tBuf); | |
7815 | } | |
7816 | comparison = u_memcmpCodePointOrder(sBuf, tBuf, uprv_min(sLen, tLen)); | |
7817 | if (comparison == 0) { | |
7818 | comparison = sLen - tLen; | |
7819 | } | |
7820 | } | |
7821 | } | |
7822 | ||
7823 | if (comparison < 0) { | |
7824 | return UCOL_LESS; | |
7825 | } else if (comparison == 0) { | |
7826 | return UCOL_EQUAL; | |
7827 | } else /* comparison > 0 */ { | |
7828 | return UCOL_GREATER; | |
7829 | } | |
7830 | } | |
7831 | ||
7832 | /* CEBuf - A struct and some inline functions to handle the saving */ | |
7833 | /* of CEs in a buffer within ucol_strcoll */ | |
7834 | ||
7835 | #define UCOL_CEBUF_SIZE 512 | |
7836 | typedef struct ucol_CEBuf { | |
7837 | uint32_t *buf; | |
7838 | uint32_t *endp; | |
7839 | uint32_t *pos; | |
7840 | uint32_t localArray[UCOL_CEBUF_SIZE]; | |
7841 | } ucol_CEBuf; | |
7842 | ||
7843 | ||
7844 | static | |
7845 | inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) { | |
7846 | (b)->buf = (b)->pos = (b)->localArray; | |
7847 | (b)->endp = (b)->buf + UCOL_CEBUF_SIZE; | |
7848 | }; | |
7849 | ||
7850 | static | |
7851 | void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci) { | |
7852 | uint32_t oldSize; | |
7853 | uint32_t newSize; | |
7854 | uint32_t *newBuf; | |
7855 | ||
7856 | ci->flags |= UCOL_ITER_ALLOCATED; | |
7857 | oldSize = b->pos - b->buf; | |
7858 | newSize = oldSize * 2; | |
7859 | newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t)); | |
7860 | if(newBuf != NULL) { | |
7861 | uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t)); | |
7862 | if (b->buf != b->localArray) { | |
7863 | uprv_free(b->buf); | |
7864 | } | |
7865 | b->buf = newBuf; | |
7866 | b->endp = b->buf + newSize; | |
7867 | b->pos = b->buf + oldSize; | |
7868 | } | |
7869 | } | |
7870 | ||
7871 | static | |
7872 | inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci) { | |
7873 | if (b->pos == b->endp) { | |
7874 | ucol_CEBuf_Expand(b, ci); | |
7875 | } | |
7876 | *(b)->pos++ = ce; | |
7877 | }; | |
7878 | ||
7879 | /* This is a trick string compare function that goes in and uses sortkeys to compare */ | |
7880 | /* It is used when compare gets in trouble and needs to bail out */ | |
7881 | static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl, | |
7882 | collIterate *tColl) | |
7883 | { | |
7884 | uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER]; | |
7885 | uint8_t *sourceKeyP = sourceKey; | |
7886 | uint8_t *targetKeyP = targetKey; | |
7887 | int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER; | |
7888 | const UCollator *coll = sColl->coll; | |
7889 | UChar *source = NULL; | |
7890 | UChar *target = NULL; | |
7891 | UChar sStackBuf[256], tStackBuf[256]; | |
7892 | int32_t sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1; | |
7893 | int32_t targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1; | |
7894 | ||
7895 | // TODO: Handle long strings. Do the same in ucol_checkIdent. | |
7896 | if(sColl->flags & UCOL_USE_ITERATOR) { | |
7897 | sColl->iterator->move(sColl->iterator, 0, UITER_START); | |
7898 | tColl->iterator->move(tColl->iterator, 0, UITER_START); | |
7899 | source = sStackBuf; | |
7900 | UChar *sBufp = source; | |
7901 | target = tStackBuf; | |
7902 | UChar *tBufp = target; | |
7903 | while(sColl->iterator->hasNext(sColl->iterator)) { | |
7904 | *sBufp++ = (UChar)sColl->iterator->next(sColl->iterator); | |
7905 | } | |
7906 | while(tColl->iterator->hasNext(tColl->iterator)) { | |
7907 | *tBufp++ = (UChar)tColl->iterator->next(tColl->iterator); | |
7908 | } | |
7909 | sourceLength = sBufp - source; | |
7910 | targetLength = tBufp - target; | |
7911 | } else { // no iterators | |
7912 | sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1; | |
7913 | targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1; | |
7914 | source = sColl->string; | |
7915 | target = tColl->string; | |
7916 | } | |
7917 | ||
7918 | ||
7919 | ||
7920 | sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); | |
7921 | if(sourceKeyLen > UCOL_MAX_BUFFER) { | |
7922 | sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t)); | |
7923 | if(sourceKeyP != NULL) { | |
7924 | sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); | |
7925 | } | |
7926 | } | |
7927 | ||
7928 | targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); | |
7929 | if(targetKeyLen > UCOL_MAX_BUFFER) { | |
7930 | targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t)); | |
7931 | if(targetKeyP != NULL) { | |
7932 | targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); | |
7933 | } | |
7934 | } | |
7935 | ||
7936 | int32_t result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP); | |
7937 | ||
7938 | if(sourceKeyP != sourceKey) { | |
7939 | uprv_free(sourceKeyP); | |
7940 | } | |
7941 | ||
7942 | if(targetKeyP != targetKey) { | |
7943 | uprv_free(targetKeyP); | |
7944 | } | |
7945 | ||
7946 | if(result<0) { | |
7947 | return UCOL_LESS; | |
7948 | } else if(result>0) { | |
7949 | return UCOL_GREATER; | |
7950 | } else { | |
7951 | return UCOL_EQUAL; | |
7952 | } | |
7953 | } | |
7954 | ||
7955 | ||
/**
 * General string comparison working on two collation iterators.
 *
 * The comparison proceeds level by level and stops at the first level that
 * shows a difference:
 *   1. primary   - CEs are fetched with ucol_IGetNextCE and, at the same
 *                  time, saved into two CE buffers (sCEs / tCEs);
 *   2. secondary - re-reads the saved CEs, forwards or (French) backwards;
 *   3. case level, tertiary, quaternary (shifted only) - also re-read the
 *                  saved CEs;
 *   4. identical - delegated to ucol_checkIdent.
 * Which levels run is decided by the attributes of sColl->coll.
 *
 * If both Hiragana quaternary handling and shifted alternate handling are
 * in effect, the whole comparison is delegated to ucol_compareUsingSortKeys.
 *
 * @param sColl  iterator over the source text; its collator is used
 * @param tColl  iterator over the target text
 * @param status passed on to ucol_IGetNextCE and ucol_checkIdent; this
 *               function does not itself examine it
 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
 */
374ca955 | 7956 | static inline UCollationResult |
b75a7d8f A |
7957 | ucol_strcollRegular( collIterate *sColl, collIterate *tColl, |
7958 | // const UCollator *coll, | |
7959 | // const UChar *source, | |
7960 | // int32_t sourceLength, | |
7961 | // const UChar *target, | |
7962 | // int32_t targetLength, | |
7963 | UErrorCode *status) | |
7964 | { | |
7965 | U_ALIGN_CODE(16); | |
7966 | ||
7967 | const UCollator *coll = sColl->coll; | |
7968 | ||
7969 | ||
7970 | // setting up the collator parameters | |
7971 | UColAttributeValue strength = coll->strength; | |
7972 | UBool initialCheckSecTer = (strength >= UCOL_SECONDARY); | |
7973 | ||
7974 | UBool checkSecTer = initialCheckSecTer; | |
7975 | UBool checkTertiary = (strength >= UCOL_TERTIARY); | |
7976 | UBool checkQuad = (strength >= UCOL_QUATERNARY); | |
7977 | UBool checkIdent = (strength == UCOL_IDENTICAL); | |
7978 | UBool checkCase = (coll->caseLevel == UCOL_ON); | |
7979 | UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer; | |
7980 | UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); | |
7981 | UBool qShifted = shifted && checkQuad; | |
7982 | UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad; | |
7983 | ||
// Hiragana quaternary together with shifted alternates is delegated to the
// sort key based comparison. Nothing has been allocated yet at this point.
7984 | if(doHiragana && shifted) { | |
7985 | return (ucol_compareUsingSortKeys(sColl, tColl)); | |
7986 | } | |
7987 | uint8_t caseSwitch = coll->caseSwitch; | |
7988 | uint8_t tertiaryMask = coll->tertiaryMask; | |
7989 | ||
7990 | // This is the lowest primary value that will not be ignored if shifted | |
7991 | uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0; | |
7992 | ||
7993 | UCollationResult result = UCOL_EQUAL; | |
7994 | UCollationResult hirResult = UCOL_EQUAL; | |
7995 | ||
7996 | // Preparing the CE buffers. They will be filled during the primary phase | |
7997 | ucol_CEBuf sCEs; | |
7998 | ucol_CEBuf tCEs; | |
7999 | UCOL_INIT_CEBUF(&sCEs); | |
8000 | UCOL_INIT_CEBUF(&tCEs); | |
8001 | ||
8002 | uint32_t secS = 0, secT = 0; | |
8003 | uint32_t sOrder=0, tOrder=0; | |
8004 | ||
8005 | // Non shifted primary processing is quite simple | |
8006 | if(!shifted) { | |
8007 | for(;;) { | |
8008 | ||
8009 | // We fetch CEs until we hit a non ignorable primary or end. | |
8010 | do { | |
8011 | // We get the next CE | |
8012 | sOrder = ucol_IGetNextCE(coll, sColl, status); | |
8013 | // Stuff it in the buffer | |
8014 | UCOL_CEBUF_PUT(&sCEs, sOrder, sColl); | |
8015 | // And keep just the primary part. | |
8016 | sOrder &= UCOL_PRIMARYMASK; | |
8017 | } while(sOrder == 0); | |
8018 | ||
8019 | // see the comments on the above block | |
8020 | do { | |
8021 | tOrder = ucol_IGetNextCE(coll, tColl, status); | |
8022 | UCOL_CEBUF_PUT(&tCEs, tOrder, tColl); | |
8023 | tOrder &= UCOL_PRIMARYMASK; | |
8024 | } while(tOrder == 0); | |
8025 | ||
8026 | // if both primaries are the same | |
8027 | if(sOrder == tOrder) { | |
8028 | // and there are no more CEs, we advance to the next level | |
8029 | if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { | |
8030 | break; | |
8031 | } | |
// Remember the first position where only one of the two iterators carries
// the UCOL_WAS_HIRAGANA flag; it is used after the tertiary level.
8032 | if(doHiragana && hirResult == UCOL_EQUAL) { | |
8033 | if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) { | |
374ca955 | 8034 | hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA)) |
b75a7d8f A |
8035 | ? UCOL_LESS:UCOL_GREATER; |
8036 | } | |
8037 | } | |
8038 | } else { | |
8039 | // if two primaries are different, we are done | |
8040 | result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER; | |
8041 | goto commonReturn; | |
8042 | } | |
8043 | } // no primary difference... do the rest from the buffers | |
8044 | } else { // shifted - do a slightly more complicated processing :) | |
8045 | for(;;) { | |
8046 | UBool sInShifted = FALSE; | |
8047 | UBool tInShifted = FALSE; | |
8048 | // This version of code can be refactored. However, it seems easier to understand this way. | |
8049 | // Source loop. Same as the target loop. | |
8050 | for(;;) { | |
8051 | sOrder = ucol_IGetNextCE(coll, sColl, status); | |
8052 | if(sOrder == UCOL_NO_MORE_CES) { | |
8053 | UCOL_CEBUF_PUT(&sCEs, sOrder, sColl); | |
8054 | break; | |
374ca955 A |
8055 | } else if(sOrder == 0 |
8056 | || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) { | |
b75a7d8f A |
8057 | /* UCA amendment - ignore ignorables that follow shifted code points */ |
8058 | continue; | |
8059 | } else if(isContinuation(sOrder)) { | |
8060 | if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */ | |
8061 | if(sInShifted) { | |
8062 | sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ | |
8063 | UCOL_CEBUF_PUT(&sCEs, sOrder, sColl); | |
8064 | continue; | |
8065 | } else { | |
8066 | UCOL_CEBUF_PUT(&sCEs, sOrder, sColl); | |
8067 | break; | |
8068 | } | |
8069 | } else { /* Just lower level values */ | |
8070 | if(sInShifted) { | |
8071 | continue; | |
8072 | } else { | |
8073 | UCOL_CEBUF_PUT(&sCEs, sOrder, sColl); | |
8074 | continue; | |
8075 | } | |
8076 | } | |
8077 | } else { /* regular */ | |
8078 | if((sOrder & UCOL_PRIMARYMASK) > LVT) { | |
8079 | UCOL_CEBUF_PUT(&sCEs, sOrder, sColl); | |
8080 | break; | |
8081 | } else { | |
8082 | if((sOrder & UCOL_PRIMARYMASK) > 0) { | |
8083 | sInShifted = TRUE; | |
8084 | sOrder &= UCOL_PRIMARYMASK; | |
8085 | UCOL_CEBUF_PUT(&sCEs, sOrder, sColl); | |
8086 | continue; | |
8087 | } else { | |
8088 | UCOL_CEBUF_PUT(&sCEs, sOrder, sColl); | |
8089 | sInShifted = FALSE; | |
8090 | continue; | |
8091 | } | |
8092 | } | |
8093 | } | |
8094 | } | |
8095 | sOrder &= UCOL_PRIMARYMASK; | |
8096 | sInShifted = FALSE; | |
8097 | ||
8098 | for(;;) { | |
8099 | tOrder = ucol_IGetNextCE(coll, tColl, status); | |
8100 | if(tOrder == UCOL_NO_MORE_CES) { | |
8101 | UCOL_CEBUF_PUT(&tCEs, tOrder, tColl); | |
8102 | break; | |
8103 | } else if(tOrder == 0 | |
374ca955 | 8104 | || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) { |
b75a7d8f A |
8105 | /* UCA amendment - ignore ignorables that follow shifted code points */ |
8106 | continue; | |
8107 | } else if(isContinuation(tOrder)) { | |
8108 | if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */ | |
8109 | if(tInShifted) { | |
8110 | tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ | |
8111 | UCOL_CEBUF_PUT(&tCEs, tOrder, tColl); | |
8112 | continue; | |
8113 | } else { | |
8114 | UCOL_CEBUF_PUT(&tCEs, tOrder, tColl); | |
8115 | break; | |
8116 | } | |
8117 | } else { /* Just lower level values */ | |
8118 | if(tInShifted) { | |
8119 | continue; | |
8120 | } else { | |
8121 | UCOL_CEBUF_PUT(&tCEs, tOrder, tColl); | |
8122 | continue; | |
8123 | } | |
8124 | } | |
8125 | } else { /* regular */ | |
8126 | if((tOrder & UCOL_PRIMARYMASK) > LVT) { | |
8127 | UCOL_CEBUF_PUT(&tCEs, tOrder, tColl); | |
8128 | break; | |
8129 | } else { | |
8130 | if((tOrder & UCOL_PRIMARYMASK) > 0) { | |
8131 | tInShifted = TRUE; | |
8132 | tOrder &= UCOL_PRIMARYMASK; | |
8133 | UCOL_CEBUF_PUT(&tCEs, tOrder, tColl); | |
8134 | continue; | |
8135 | } else { | |
8136 | UCOL_CEBUF_PUT(&tCEs, tOrder, tColl); | |
8137 | tInShifted = FALSE; | |
8138 | continue; | |
8139 | } | |
8140 | } | |
8141 | } | |
8142 | } | |
8143 | tOrder &= UCOL_PRIMARYMASK; | |
8144 | tInShifted = FALSE; | |
8145 | ||
8146 | if(sOrder == tOrder) { | |
8147 | /* | |
8148 | if(doHiragana && hirResult == UCOL_EQUAL) { | |
8149 | if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) { | |
374ca955 | 8150 | hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA)) |
b75a7d8f A |
8151 | ? UCOL_LESS:UCOL_GREATER; |
8152 | } | |
8153 | } | |
8154 | */ | |
8155 | if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { | |
8156 | break; | |
8157 | } else { | |
8158 | sOrder = 0; tOrder = 0; | |
8159 | continue; | |
8160 | } | |
8161 | } else { | |
8162 | result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER; | |
8163 | goto commonReturn; | |
8164 | } | |
8165 | } /* no primary difference... do the rest from the buffers */ | |
8166 | } | |
8167 | ||
8168 | /* now, we're gonna reexamine collected CEs */ | |
8169 | uint32_t *sCE; | |
8170 | uint32_t *tCE; | |
8171 | ||
8172 | /* This is the secondary level of comparison */ | |
8173 | if(checkSecTer) { | |
8174 | if(!isFrenchSec) { /* normal */ | |
8175 | sCE = sCEs.buf; | |
8176 | tCE = tCEs.buf; | |
8177 | for(;;) { | |
// These scans have no bounds check; they stop at the first CE with non-zero
// secondary bits (the loop below compares against UCOL_NO_MORE_CES_SECONDARY).
8178 | while (secS == 0) { | |
8179 | secS = *(sCE++) & UCOL_SECONDARYMASK; | |
8180 | } | |
8181 | ||
8182 | while(secT == 0) { | |
8183 | secT = *(tCE++) & UCOL_SECONDARYMASK; | |
8184 | } | |
8185 | ||
8186 | if(secS == secT) { | |
8187 | if(secS == UCOL_NO_MORE_CES_SECONDARY) { | |
8188 | break; | |
8189 | } else { | |
8190 | secS = 0; secT = 0; | |
8191 | continue; | |
8192 | } | |
8193 | } else { | |
8194 | result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; | |
8195 | goto commonReturn; | |
8196 | } | |
8197 | } | |
8198 | } else { /* do the French */ | |
8199 | uint32_t *sCESave = NULL; | |
8200 | uint32_t *tCESave = NULL; | |
8201 | sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */ | |
8202 | tCE = tCEs.pos-2; | |
8203 | for(;;) { | |
8204 | while (secS == 0 && sCE >= sCEs.buf) { | |
8205 | if(sCESave == 0) { | |
8206 | secS = *(sCE--); | |
8207 | if(isContinuation(secS)) { | |
8208 | while(isContinuation(secS = *(sCE--))); | |
8209 | /* after this, secS has the start of continuation, and sCEs points before that */ | |
8210 | sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */ | |
8211 | sCE+=2; /* need to point to the first continuation CP */ | |
8212 | /* However, now you can just continue doing stuff */ | |
8213 | } | |
8214 | } else { | |
8215 | secS = *(sCE++); | |
8216 | if(!isContinuation(secS)) { /* This means we have finished with this cont */ | |
8217 | sCE = sCESave; /* reset the pointer to before continuation */ | |
8218 | sCESave = 0; | |
8219 | continue; | |
8220 | } | |
8221 | } | |
8222 | secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */ | |
8223 | } | |
8224 | ||
8225 | while(secT == 0 && tCE >= tCEs.buf) { | |
8226 | if(tCESave == 0) { | |
8227 | secT = *(tCE--); | |
8228 | if(isContinuation(secT)) { | |
8229 | while(isContinuation(secT = *(tCE--))); | |
8230 | /* after this, secS has the start of continuation, and sCEs points before that */ | |
8231 | tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */ | |
8232 | tCE+=2; /* need to point to the first continuation CP */ | |
8233 | /* However, now you can just continue doing stuff */ | |
8234 | } | |
8235 | } else { | |
8236 | secT = *(tCE++); | |
8237 | if(!isContinuation(secT)) { /* This means we have finished with this cont */ | |
8238 | tCE = tCESave; /* reset the pointer to before continuation */ | |
8239 | tCESave = 0; | |
8240 | continue; | |
8241 | } | |
8242 | } | |
8243 | secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */ | |
8244 | } | |
8245 | ||
8246 | if(secS == secT) { | |
8247 | if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) { | |
8248 | break; | |
8249 | } else { | |
8250 | secS = 0; secT = 0; | |
8251 | continue; | |
8252 | } | |
8253 | } else { | |
8254 | result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; | |
8255 | goto commonReturn; | |
8256 | } | |
8257 | } | |
8258 | } | |
8259 | } | |
8260 | ||
8261 | /* doing the case bit */ | |
8262 | if(checkCase) { | |
8263 | sCE = sCEs.buf; | |
8264 | tCE = tCEs.buf; | |
8265 | for(;;) { | |
8266 | while((secS & UCOL_REMOVE_CASE) == 0) { | |
8267 | if(!isContinuation(*sCE++)) { | |
8268 | secS =*(sCE-1) & UCOL_TERT_CASE_MASK; | |
8269 | secS ^= caseSwitch; | |
8270 | } else { | |
8271 | secS = 0; | |
8272 | } | |
8273 | } | |
8274 | ||
8275 | while((secT & UCOL_REMOVE_CASE) == 0) { | |
8276 | if(!isContinuation(*tCE++)) { | |
8277 | secT = *(tCE-1) & UCOL_TERT_CASE_MASK; | |
8278 | secT ^= caseSwitch; | |
8279 | } else { | |
8280 | secT = 0; | |
8281 | } | |
8282 | } | |
8283 | ||
8284 | if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) { | |
8285 | result = UCOL_LESS; | |
8286 | goto commonReturn; | |
8287 | } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) { | |
8288 | result = UCOL_GREATER; | |
8289 | goto commonReturn; | |
8290 | } | |
8291 | ||
8292 | if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) { | |
8293 | break; | |
8294 | } else { | |
8295 | secS = 0; | |
8296 | secT = 0; | |
8297 | } | |
8298 | } | |
8299 | } | |
8300 | ||
8301 | /* Tertiary level */ | |
8302 | if(checkTertiary) { | |
8303 | secS = 0; | |
8304 | secT = 0; | |
8305 | sCE = sCEs.buf; | |
8306 | tCE = tCEs.buf; | |
8307 | for(;;) { | |
8308 | while((secS & UCOL_REMOVE_CASE) == 0) { | |
8309 | secS = *(sCE++) & tertiaryMask; | |
8310 | if(!isContinuation(secS)) { | |
8311 | secS ^= caseSwitch; | |
8312 | } else { | |
8313 | secS &= UCOL_REMOVE_CASE; | |
8314 | } | |
8315 | } | |
8316 | ||
8317 | while((secT & UCOL_REMOVE_CASE) == 0) { | |
8318 | secT = *(tCE++) & tertiaryMask; | |
8319 | if(!isContinuation(secT)) { | |
8320 | secT ^= caseSwitch; | |
8321 | } else { | |
8322 | secT &= UCOL_REMOVE_CASE; | |
8323 | } | |
8324 | } | |
8325 | ||
8326 | if(secS == secT) { | |
8327 | if((secS & UCOL_REMOVE_CASE) == 1) { | |
8328 | break; | |
8329 | } else { | |
8330 | secS = 0; secT = 0; | |
8331 | continue; | |
8332 | } | |
8333 | } else { | |
8334 | result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; | |
8335 | goto commonReturn; | |
8336 | } | |
8337 | } | |
8338 | } | |
8339 | ||
8340 | ||
8341 | if(qShifted /*checkQuad*/) { | |
8342 | UBool sInShifted = TRUE; | |
8343 | UBool tInShifted = TRUE; | |
8344 | secS = 0; | |
8345 | secT = 0; | |
8346 | sCE = sCEs.buf; | |
8347 | tCE = tCEs.buf; | |
8348 | for(;;) { | |
// NOTE: && binds tighter than ||, so the condition below reads
// (secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted).
// The same applies to the target loop further down.
8349 | while(secS == 0 && secS != UCOL_NO_MORE_CES || (isContinuation(secS) && !sInShifted)) { | |
8350 | secS = *(sCE++); | |
8351 | if(isContinuation(secS)) { | |
8352 | if(!sInShifted) { | |
8353 | continue; | |
8354 | } | |
8355 | } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */ | |
8356 | secS = UCOL_PRIMARYMASK; | |
8357 | sInShifted = FALSE; | |
8358 | } else { | |
8359 | sInShifted = TRUE; | |
8360 | } | |
8361 | } | |
8362 | secS &= UCOL_PRIMARYMASK; | |
8363 | ||
8364 | ||
8365 | while(secT == 0 && secT != UCOL_NO_MORE_CES || (isContinuation(secT) && !tInShifted)) { | |
8366 | secT = *(tCE++); | |
8367 | if(isContinuation(secT)) { | |
8368 | if(!tInShifted) { | |
8369 | continue; | |
8370 | } | |
8371 | } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) { | |
8372 | secT = UCOL_PRIMARYMASK; | |
8373 | tInShifted = FALSE; | |
8374 | } else { | |
8375 | tInShifted = TRUE; | |
8376 | } | |
8377 | } | |
8378 | secT &= UCOL_PRIMARYMASK; | |
8379 | ||
8380 | if(secS == secT) { | |
8381 | if(secS == UCOL_NO_MORE_CES_PRIMARY) { | |
8382 | break; | |
8383 | } else { | |
8384 | secS = 0; secT = 0; | |
8385 | continue; | |
8386 | } | |
8387 | } else { | |
8388 | result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; | |
8389 | goto commonReturn; | |
8390 | } | |
8391 | } | |
8392 | } else if(doHiragana && hirResult != UCOL_EQUAL) { | |
8393 | // If we're fine on quaternaries, we might be different | |
8394 | // on Hiragana. This, however, might fail us in shifted. | |
8395 | result = hirResult; | |
8396 | goto commonReturn; | |
8397 | } | |
8398 | ||
8399 | /* For IDENTICAL comparisons, we use a bitwise character comparison */ | |
8400 | /* as a tiebreaker if all else is equal. */ | |
8401 | /* Getting here should be quite rare - strings are not identical - */ | |
8402 | /* that is checked first, but compared == through all other checks. */ | |
8403 | if(checkIdent) | |
8404 | { | |
8405 | //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON); | |
8406 | result = ucol_checkIdent(sColl, tColl, TRUE, status); | |
8407 | } | |
8408 | ||
// Single exit point for every path that runs after the CE buffers were set up.
// Heap storage is released only if one of the iterators carries the
// UCOL_ITER_ALLOCATED flag (ucol_CEBuf_Expand sets it when a CE buffer grows).
8409 | commonReturn: | |
8410 | if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) { | |
8411 | freeHeapWritableBuffer(sColl); | |
8412 | freeHeapWritableBuffer(tColl); | |
8413 | ||
8414 | if (sCEs.buf != sCEs.localArray ) { | |
8415 | uprv_free(sCEs.buf); | |
8416 | } | |
8417 | if (tCEs.buf != tCEs.localArray ) { | |
8418 | uprv_free(tCEs.buf); | |
8419 | } | |
8420 | } | |
8421 | ||
8422 | return result; | |
8423 | } | |
8424 | ||
8425 | ||
374ca955 A |
8426 | static inline uint32_t |
8427 | ucol_getLatinOneContraction(const UCollator *coll, int32_t strength, | |
b75a7d8f A |
8428 | uint32_t CE, const UChar *s, int32_t *index, int32_t len) { |
8429 | const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); | |
8430 | int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; | |
8431 | int32_t offset = 1; | |
8432 | UChar schar = 0, tchar = 0; | |
8433 | ||
8434 | for(;;) { | |
8435 | if(len == -1) { | |
8436 | if(s[*index] == 0) { // end of string | |
8437 | return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); | |
8438 | } else { | |
8439 | schar = s[*index]; | |
8440 | } | |
8441 | } else { | |
8442 | if(*index == len) { | |
8443 | return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); | |
8444 | } else { | |
8445 | schar = s[*index]; | |
8446 | } | |
8447 | } | |
8448 | ||
8449 | while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ | |
8450 | offset++; | |
8451 | } | |
8452 | ||
8453 | if (schar == tchar) { | |
8454 | (*index)++; | |
8455 | return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]); | |
8456 | } | |
8457 | else | |
8458 | { | |
8459 | if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { | |
8460 | return UCOL_BAIL_OUT_CE; | |
8461 | } | |
8462 | // skip completely ignorables | |
8463 | uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar); | |
8464 | if(isZeroCE == 0) { // we have to ignore completely ignorables | |
8465 | (*index)++; | |
8466 | continue; | |
8467 | } | |
8468 | ||
8469 | return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); | |
8470 | } | |
8471 | } | |
8472 | } | |
8473 | ||
8474 | ||
374ca955 A |
8475 | /** |
8476 | * This is a fast strcoll, geared towards text in Latin-1. | |
b75a7d8f A |
8477 | * It supports contractions of size two, French secondaries |
8478 | * and case switching. You can use it with strengths primary | |
8479 | * to tertiary. It does not support shifted and case level. | |
8480 | * It relies on the table build by setupLatin1Table. If it | |
8481 | * doesn't understand something, it will go to the regular | |
374ca955 | 8482 | * strcoll. |
b75a7d8f | 8483 | */ |
374ca955 | 8484 | static inline UCollationResult |
b75a7d8f A |
8485 | ucol_strcollUseLatin1( const UCollator *coll, |
8486 | const UChar *source, | |
8487 | int32_t sLen, | |
8488 | const UChar *target, | |
8489 | int32_t tLen, | |
374ca955 | 8490 | UErrorCode *status) |
b75a7d8f A |
8491 | { |
8492 | U_ALIGN_CODE(16); | |
8493 | int32_t strength = coll->strength; | |
8494 | ||
8495 | int32_t sIndex = 0, tIndex = 0; | |
8496 | UChar sChar = 0, tChar = 0; | |
8497 | uint32_t sOrder=0, tOrder=0; | |
8498 | ||
8499 | UBool endOfSource = FALSE, endOfTarget = FALSE; | |
8500 | ||
8501 | uint32_t *elements = coll->latinOneCEs; | |
8502 | ||
8503 | UBool haveContractions = FALSE; // if we have contractions in our string | |
8504 | // we cannot do French secondary | |
8505 | ||
8506 | // Do the primary level | |
8507 | for(;;) { | |
8508 | while(sOrder==0) { // this loop skips primary ignorables | |
8509 | // sOrder=getNextlatinOneCE(source); | |
8510 | if(sLen==-1) { // handling zero terminated strings | |
8511 | sChar=source[sIndex++]; | |
8512 | if(sChar==0) { | |
8513 | endOfSource = TRUE; | |
8514 | break; | |
8515 | } | |
8516 | } else { // handling strings with known length | |
8517 | if(sIndex==sLen) { | |
8518 | endOfSource = TRUE; | |
8519 | break; | |
8520 | } | |
8521 | sChar=source[sIndex++]; | |
8522 | } | |
8523 | if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) | |
8524 | //fprintf(stderr, "R"); | |
8525 | goto returnRegular; | |
8526 | //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); | |
8527 | } | |
8528 | sOrder = elements[sChar]; | |
8529 | if(sOrder >= UCOL_NOT_FOUND) { // if we got a special | |
8530 | // specials can basically be either contractions or bail-out signs. If we get anything | |
8531 | // else, we'll bail out anywasy | |
8532 | if(getCETag(sOrder) == CONTRACTION_TAG) { | |
8533 | sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen); | |
8534 | haveContractions = TRUE; // if there are contractions, we cannot do French secondary | |
8535 | // However, if there are contractions in the table, but we always use just one char, | |
8536 | // we might be able to do French. This should be checked out. | |
8537 | } | |
8538 | if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { | |
8539 | //fprintf(stderr, "S"); | |
8540 | goto returnRegular; | |
8541 | //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); | |
8542 | } | |
8543 | } | |
8544 | } | |
8545 | ||
8546 | while(tOrder==0) { // this loop skips primary ignorables | |
8547 | // tOrder=getNextlatinOneCE(target); | |
8548 | if(tLen==-1) { // handling zero terminated strings | |
8549 | tChar=target[tIndex++]; | |
8550 | if(tChar==0) { | |
374ca955 | 8551 | if(endOfSource) { // this is different than source loop, |
b75a7d8f A |
8552 | // as we already know that source loop is done here, |
8553 | // so we can either finish the primary loop if both | |
374ca955 | 8554 | // strings are done or anounce the result if only |
b75a7d8f A |
8555 | // target is done. Same below. |
8556 | goto endOfPrimLoop; | |
8557 | } else { | |
8558 | return UCOL_GREATER; | |
8559 | } | |
8560 | } | |
8561 | } else { // handling strings with known length | |
8562 | if(tIndex==tLen) { | |
8563 | if(endOfSource) { | |
8564 | goto endOfPrimLoop; | |
8565 | } else { | |
8566 | return UCOL_GREATER; | |
8567 | } | |
8568 | } | |
8569 | tChar=target[tIndex++]; | |
8570 | } | |
8571 | if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) | |
8572 | //fprintf(stderr, "R"); | |
8573 | goto returnRegular; | |
8574 | //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); | |
8575 | } | |
8576 | tOrder = elements[tChar]; | |
8577 | if(tOrder >= UCOL_NOT_FOUND) { | |
8578 | // Handling specials, see the comments for source | |
8579 | if(getCETag(tOrder) == CONTRACTION_TAG) { | |
8580 | tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen); | |
8581 | haveContractions = TRUE; | |
8582 | } | |
8583 | if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { | |
8584 | //fprintf(stderr, "S"); | |
8585 | goto returnRegular; | |
8586 | //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); | |
8587 | } | |
8588 | } | |
8589 | } | |
8590 | if(endOfSource) { // source is finished, but target is not, say the result. | |
8591 | return UCOL_LESS; | |
8592 | } | |
8593 | ||
8594 | if(sOrder == tOrder) { // if we have same CEs, we continue the loop | |
8595 | sOrder = 0; tOrder = 0; | |
8596 | continue; | |
8597 | } else { | |
8598 | // compare current top bytes | |
8599 | if(((sOrder^tOrder)&0xFF000000)!=0) { | |
8600 | // top bytes differ, return difference | |
8601 | if(sOrder < tOrder) { | |
8602 | return UCOL_LESS; | |
8603 | } else if(sOrder > tOrder) { | |
8604 | return UCOL_GREATER; | |
8605 | } | |
8606 | // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24); | |
8607 | // since we must return enum value | |
8608 | } | |
8609 | ||
8610 | // top bytes match, continue with following bytes | |
8611 | sOrder<<=8; | |
8612 | tOrder<<=8; | |
374ca955 | 8613 | } |
b75a7d8f A |
8614 | } |
8615 | ||
8616 | endOfPrimLoop: | |
374ca955 | 8617 | // after primary loop, we definitely know the sizes of strings, |
b75a7d8f A |
8618 | // so we set it and use simpler loop for secondaries and tertiaries |
8619 | sLen = sIndex; tLen = tIndex; | |
8620 | if(strength >= UCOL_SECONDARY) { | |
8621 | // adjust the table beggining | |
8622 | elements += coll->latinOneTableLen; | |
8623 | endOfSource = FALSE; endOfTarget = FALSE; | |
8624 | ||
8625 | if(coll->frenchCollation == UCOL_OFF) { // non French | |
8626 | // This loop is a simplified copy of primary loop | |
374ca955 A |
8627 | // at this point we know that whole strings are latin-1, so we don't |
8628 | // check for that. We also know that we only have contractions as | |
b75a7d8f A |
8629 | // specials. |
8630 | sIndex = 0; tIndex = 0; | |
8631 | for(;;) { | |
8632 | while(sOrder==0) { | |
8633 | if(sIndex==sLen) { | |
8634 | endOfSource = TRUE; | |
8635 | break; | |
8636 | } | |
8637 | sChar=source[sIndex++]; | |
8638 | sOrder = elements[sChar]; | |
8639 | if(sOrder > UCOL_NOT_FOUND) { | |
8640 | sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen); | |
8641 | } | |
8642 | } | |
8643 | ||
8644 | while(tOrder==0) { | |
8645 | if(tIndex==tLen) { | |
8646 | if(endOfSource) { | |
8647 | goto endOfSecLoop; | |
8648 | } else { | |
8649 | return UCOL_GREATER; | |
8650 | } | |
8651 | } | |
8652 | tChar=target[tIndex++]; | |
8653 | tOrder = elements[tChar]; | |
8654 | if(tOrder > UCOL_NOT_FOUND) { | |
8655 | tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen); | |
8656 | } | |
8657 | } | |
8658 | if(endOfSource) { | |
8659 | return UCOL_LESS; | |
8660 | } | |
8661 | ||
8662 | if(sOrder == tOrder) { | |
8663 | sOrder = 0; tOrder = 0; | |
8664 | continue; | |
8665 | } else { | |
8666 | // see primary loop for comments on this | |
8667 | if(((sOrder^tOrder)&0xFF000000)!=0) { | |
8668 | if(sOrder < tOrder) { | |
8669 | return UCOL_LESS; | |
8670 | } else if(sOrder > tOrder) { | |
8671 | return UCOL_GREATER; | |
8672 | } | |
8673 | } | |
8674 | sOrder<<=8; | |
8675 | tOrder<<=8; | |
374ca955 | 8676 | } |
b75a7d8f A |
8677 | } |
8678 | } else { // French | |
8679 | if(haveContractions) { // if we have contractions, we have to bail out | |
8680 | // since we don't really know how to handle them here | |
8681 | goto returnRegular; | |
8682 | //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); | |
8683 | } | |
8684 | // For French, we go backwards | |
8685 | sIndex = sLen; tIndex = tLen; | |
8686 | for(;;) { | |
8687 | while(sOrder==0) { | |
8688 | if(sIndex==0) { | |
8689 | endOfSource = TRUE; | |
8690 | break; | |
8691 | } | |
8692 | sChar=source[--sIndex]; | |
8693 | sOrder = elements[sChar]; | |
8694 | // don't even look for contractions | |
8695 | } | |
8696 | ||
8697 | while(tOrder==0) { | |
8698 | if(tIndex==0) { | |
8699 | if(endOfSource) { | |
8700 | goto endOfSecLoop; | |
8701 | } else { | |
8702 | return UCOL_GREATER; | |
8703 | } | |
8704 | } | |
8705 | tChar=target[--tIndex]; | |
8706 | tOrder = elements[tChar]; | |
8707 | // don't even look for contractions | |
8708 | } | |
8709 | if(endOfSource) { | |
8710 | return UCOL_LESS; | |
8711 | } | |
8712 | ||
8713 | if(sOrder == tOrder) { | |
8714 | sOrder = 0; tOrder = 0; | |
8715 | continue; | |
8716 | } else { | |
8717 | // see the primary loop for comments | |
8718 | if(((sOrder^tOrder)&0xFF000000)!=0) { | |
8719 | if(sOrder < tOrder) { | |
8720 | return UCOL_LESS; | |
8721 | } else if(sOrder > tOrder) { | |
8722 | return UCOL_GREATER; | |
8723 | } | |
8724 | } | |
8725 | sOrder<<=8; | |
8726 | tOrder<<=8; | |
374ca955 | 8727 | } |
b75a7d8f A |
8728 | } |
8729 | } | |
374ca955 | 8730 | } |
b75a7d8f A |
8731 | |
8732 | endOfSecLoop: | |
8733 | if(strength >= UCOL_TERTIARY) { | |
8734 | // tertiary loop is the same as secondary (except no French) | |
8735 | elements += coll->latinOneTableLen; | |
8736 | sIndex = 0; tIndex = 0; | |
8737 | endOfSource = FALSE; endOfTarget = FALSE; | |
8738 | for(;;) { | |
8739 | while(sOrder==0) { | |
8740 | if(sIndex==sLen) { | |
8741 | endOfSource = TRUE; | |
8742 | break; | |
8743 | } | |
8744 | sChar=source[sIndex++]; | |
8745 | sOrder = elements[sChar]; | |
8746 | if(sOrder > UCOL_NOT_FOUND) { | |
8747 | sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen); | |
8748 | } | |
8749 | } | |
8750 | while(tOrder==0) { | |
8751 | if(tIndex==tLen) { | |
8752 | if(endOfSource) { | |
8753 | return UCOL_EQUAL; // if both strings are at the end, they are equal | |
8754 | } else { | |
8755 | return UCOL_GREATER; | |
8756 | } | |
8757 | } | |
8758 | tChar=target[tIndex++]; | |
8759 | tOrder = elements[tChar]; | |
8760 | if(tOrder > UCOL_NOT_FOUND) { | |
8761 | tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen); | |
8762 | } | |
8763 | } | |
8764 | if(endOfSource) { | |
8765 | return UCOL_LESS; | |
8766 | } | |
8767 | if(sOrder == tOrder) { | |
8768 | sOrder = 0; tOrder = 0; | |
8769 | continue; | |
8770 | } else { | |
8771 | if(((sOrder^tOrder)&0xff000000)!=0) { | |
8772 | if(sOrder < tOrder) { | |
8773 | return UCOL_LESS; | |
8774 | } else if(sOrder > tOrder) { | |
8775 | return UCOL_GREATER; | |
8776 | } | |
8777 | } | |
8778 | sOrder<<=8; | |
8779 | tOrder<<=8; | |
374ca955 | 8780 | } |
b75a7d8f | 8781 | } |
374ca955 | 8782 | } |
b75a7d8f A |
8783 | return UCOL_EQUAL; |
8784 | ||
8785 | returnRegular: | |
8786 | // Preparing the context objects for iterating over strings | |
8787 | collIterate sColl, tColl; | |
8788 | ||
8789 | IInit_collIterate(coll, source, sLen, &sColl); | |
8790 | IInit_collIterate(coll, target, tLen, &tColl); | |
374ca955 | 8791 | return ucol_strcollRegular(&sColl, &tColl, status); |
b75a7d8f A |
8792 | } |
8793 | ||
8794 | ||
8795 | U_CAPI UCollationResult U_EXPORT2 | |
8796 | ucol_strcollIter( const UCollator *coll, | |
8797 | UCharIterator *sIter, | |
8798 | UCharIterator *tIter, | |
8799 | UErrorCode *status) { | |
374ca955 A |
8800 | if(!status || U_FAILURE(*status)) { |
8801 | return UCOL_EQUAL; | |
8802 | } | |
8803 | ||
8804 | UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER); | |
8805 | UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter); | |
8806 | ||
8807 | if (sIter == tIter) { | |
8808 | UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) | |
b75a7d8f A |
8809 | return UCOL_EQUAL; |
8810 | } | |
8811 | if(sIter == NULL || tIter == NULL || coll == NULL) { | |
8812 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
374ca955 | 8813 | UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) |
b75a7d8f A |
8814 | return UCOL_EQUAL; |
8815 | } | |
8816 | ||
8817 | UCollationResult result = UCOL_EQUAL; | |
8818 | ||
8819 | // Preparing the context objects for iterating over strings | |
8820 | collIterate sColl, tColl; | |
8821 | // The division for the array length may truncate the array size to | |
8822 | // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high | |
8823 | // for all platforms anyway. | |
8824 | UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; | |
8825 | UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; | |
8826 | UNormIterator *sNormIter = NULL, *tNormIter = NULL; | |
8827 | ||
8828 | IInit_collIterate(coll, NULL, -1, &sColl); | |
8829 | sColl.iterator = sIter; | |
8830 | sColl.flags |= UCOL_USE_ITERATOR; | |
8831 | IInit_collIterate(coll, NULL, -1, &tColl); | |
8832 | tColl.flags |= UCOL_USE_ITERATOR; | |
8833 | tColl.iterator = tIter; | |
8834 | ||
8835 | if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { | |
8836 | sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); | |
8837 | sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status); | |
8838 | sColl.flags &= ~UCOL_ITER_NORM; | |
8839 | ||
8840 | tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); | |
8841 | tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status); | |
8842 | tColl.flags &= ~UCOL_ITER_NORM; | |
8843 | } | |
8844 | ||
8845 | UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL; | |
374ca955 A |
8846 | |
8847 | while((sChar = sColl.iterator->next(sColl.iterator)) == | |
b75a7d8f | 8848 | (tChar = tColl.iterator->next(tColl.iterator))) { |
374ca955 A |
8849 | if(UCOL_ISTHAIPREVOWEL(sChar)) { |
8850 | break; | |
8851 | } | |
b75a7d8f A |
8852 | if(sChar == U_SENTINEL) { |
8853 | result = UCOL_EQUAL; | |
8854 | goto end_compare; | |
8855 | } | |
8856 | } | |
8857 | ||
8858 | if(sChar == U_SENTINEL) { | |
8859 | tChar = tColl.iterator->previous(tColl.iterator); | |
8860 | } | |
8861 | ||
8862 | if(tChar == U_SENTINEL) { | |
8863 | sChar = sColl.iterator->previous(sColl.iterator); | |
8864 | } | |
8865 | ||
8866 | sChar = sColl.iterator->previous(sColl.iterator); | |
8867 | tChar = tColl.iterator->previous(tColl.iterator); | |
8868 | ||
8869 | if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll)) | |
8870 | { | |
8871 | // We are stopped in the middle of a contraction. | |
8872 | // Scan backwards through the == part of the string looking for the start of the contraction. | |
8873 | // It doesn't matter which string we scan, since they are the same in this region. | |
8874 | do | |
8875 | { | |
8876 | sChar = sColl.iterator->previous(sColl.iterator); | |
8877 | tChar = tColl.iterator->previous(tColl.iterator); | |
8878 | } | |
8879 | while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll)); | |
8880 | } | |
8881 | ||
8882 | ||
8883 | if(U_SUCCESS(*status)) { | |
8884 | result = ucol_strcollRegular(&sColl, &tColl, status); | |
8885 | } | |
8886 | ||
8887 | end_compare: | |
8888 | if(sNormIter || tNormIter) { | |
8889 | unorm_closeIter(sNormIter); | |
8890 | unorm_closeIter(tNormIter); | |
8891 | } | |
8892 | ||
374ca955 | 8893 | UTRACE_EXIT_VALUE_STATUS(result, *status) |
b75a7d8f A |
8894 | return result; |
8895 | } | |
8896 | ||
8897 | ||
8898 | ||
8899 | /* */ | |
8900 | /* ucol_strcoll Main public API string comparison function */ | |
8901 | /* */ | |
8902 | U_CAPI UCollationResult U_EXPORT2 | |
8903 | ucol_strcoll( const UCollator *coll, | |
8904 | const UChar *source, | |
8905 | int32_t sourceLength, | |
8906 | const UChar *target, | |
8907 | int32_t targetLength) { | |
8908 | U_ALIGN_CODE(16); | |
374ca955 A |
8909 | |
8910 | UTRACE_ENTRY(UTRACE_UCOL_STRCOLL); | |
8911 | if (UTRACE_LEVEL(UTRACE_VERBOSE)) { | |
8912 | UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target); | |
8913 | UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength); | |
8914 | UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength); | |
8915 | } | |
8916 | ||
b75a7d8f A |
8917 | UErrorCode status = U_ZERO_ERROR; |
8918 | if(source == NULL || target == NULL) { | |
374ca955 | 8919 | // do not crash, but return. Should have |
b75a7d8f | 8920 | // status argument to return error. |
374ca955 | 8921 | UTRACE_EXIT_VALUE(UTRACE_UCOL_STRCOLL); |
b75a7d8f A |
8922 | return UCOL_EQUAL; |
8923 | } | |
8924 | collIterate sColl, tColl; | |
8925 | ||
8926 | /* Scan the strings. Find: */ | |
8927 | /* The length of any leading portion that is equal */ | |
8928 | /* Whether they are exactly equal. (in which case we just return) */ | |
8929 | const UChar *pSrc = source; | |
8930 | const UChar *pTarg = target; | |
8931 | int32_t equalLength; | |
8932 | ||
8933 | if (sourceLength == -1 && targetLength == -1) { | |
8934 | // Both strings are null terminated. | |
8935 | // Check for them being the same string, and scan through | |
8936 | // any leading equal portion. | |
8937 | if (source==target) { | |
374ca955 | 8938 | UTRACE_EXIT_VALUE(UCOL_EQUAL); |
b75a7d8f A |
8939 | return UCOL_EQUAL; |
8940 | } | |
8941 | ||
8942 | for (;;) { | |
8943 | if ( *pSrc != *pTarg || *pSrc == 0) { | |
8944 | break; | |
8945 | } | |
374ca955 A |
8946 | if(UCOL_ISTHAIPREVOWEL(*pSrc)) { |
8947 | break; | |
8948 | } | |
b75a7d8f A |
8949 | pSrc++; |
8950 | pTarg++; | |
8951 | } | |
8952 | if (*pSrc == 0 && *pTarg == 0) { | |
374ca955 | 8953 | UTRACE_EXIT_VALUE(UCOL_EQUAL); |
b75a7d8f A |
8954 | return UCOL_EQUAL; |
8955 | } | |
8956 | equalLength = pSrc - source; | |
8957 | } | |
8958 | else | |
8959 | { | |
8960 | // One or both strings has an explicit length. | |
8961 | /* check if source and target are same strings */ | |
8962 | ||
8963 | if (source==target && sourceLength==targetLength) { | |
374ca955 | 8964 | UTRACE_EXIT_VALUE(UCOL_EQUAL); |
b75a7d8f A |
8965 | return UCOL_EQUAL; |
8966 | } | |
8967 | const UChar *pSrcEnd = source + sourceLength; | |
8968 | const UChar *pTargEnd = target + targetLength; | |
8969 | ||
8970 | ||
8971 | // Scan while the strings are bitwise ==, or until one is exhausted. | |
8972 | for (;;) { | |
8973 | if (pSrc == pSrcEnd || pTarg == pTargEnd) { | |
8974 | break; | |
8975 | } | |
8976 | if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) { | |
8977 | break; | |
8978 | } | |
8979 | if (*pSrc != *pTarg) { | |
8980 | break; | |
8981 | } | |
374ca955 A |
8982 | if(UCOL_ISTHAIPREVOWEL(*pSrc)) { // they are the same here, so any will do |
8983 | break; | |
8984 | } | |
b75a7d8f A |
8985 | pSrc++; |
8986 | pTarg++; | |
8987 | } | |
8988 | equalLength = pSrc - source; | |
8989 | ||
8990 | // If we made it all the way through both strings, we are done. They are == | |
8991 | if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. */ | |
8992 | (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) { /* and also at end of dest string */ | |
374ca955 | 8993 | UTRACE_EXIT_VALUE(UCOL_EQUAL); |
b75a7d8f A |
8994 | return UCOL_EQUAL; |
8995 | } | |
8996 | } | |
8997 | if (equalLength > 0) { | |
8998 | /* There is an identical portion at the beginning of the two strings. */ | |
8999 | /* If the identical portion ends within a contraction or a comibining */ | |
9000 | /* character sequence, back up to the start of that sequence. */ | |
9001 | pSrc = source + equalLength; /* point to the first differing chars */ | |
9002 | pTarg = target + equalLength; | |
9003 | if (pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll) || | |
9004 | pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)) | |
9005 | { | |
9006 | // We are stopped in the middle of a contraction. | |
9007 | // Scan backwards through the == part of the string looking for the start of the contraction. | |
9008 | // It doesn't matter which string we scan, since they are the same in this region. | |
9009 | do | |
9010 | { | |
9011 | equalLength--; | |
9012 | pSrc--; | |
9013 | } | |
9014 | while (equalLength>0 && ucol_unsafeCP(*pSrc, coll)); | |
9015 | } | |
9016 | ||
9017 | source += equalLength; | |
9018 | target += equalLength; | |
9019 | if (sourceLength > 0) { | |
9020 | sourceLength -= equalLength; | |
9021 | } | |
9022 | if (targetLength > 0) { | |
9023 | targetLength -= equalLength; | |
9024 | } | |
9025 | } | |
9026 | ||
374ca955 | 9027 | UCollationResult returnVal; |
b75a7d8f A |
9028 | if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) { |
9029 | // Preparing the context objects for iterating over strings | |
9030 | IInit_collIterate(coll, source, sourceLength, &sColl); | |
9031 | IInit_collIterate(coll, target, targetLength, &tColl); | |
374ca955 | 9032 | returnVal = ucol_strcollRegular(&sColl, &tColl, &status); |
b75a7d8f | 9033 | } else { |
374ca955 | 9034 | returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status); |
b75a7d8f | 9035 | } |
374ca955 A |
9036 | UTRACE_EXIT_VALUE(returnVal); |
9037 | return returnVal; | |
b75a7d8f A |
9038 | } |
9039 | ||
9040 | /* convenience function for comparing strings */ | |
9041 | U_CAPI UBool U_EXPORT2 | |
9042 | ucol_greater( const UCollator *coll, | |
9043 | const UChar *source, | |
9044 | int32_t sourceLength, | |
9045 | const UChar *target, | |
9046 | int32_t targetLength) | |
9047 | { | |
9048 | return (ucol_strcoll(coll, source, sourceLength, target, targetLength) | |
9049 | == UCOL_GREATER); | |
9050 | } | |
9051 | ||
9052 | /* convenience function for comparing strings */ | |
9053 | U_CAPI UBool U_EXPORT2 | |
9054 | ucol_greaterOrEqual( const UCollator *coll, | |
9055 | const UChar *source, | |
9056 | int32_t sourceLength, | |
9057 | const UChar *target, | |
9058 | int32_t targetLength) | |
9059 | { | |
9060 | return (ucol_strcoll(coll, source, sourceLength, target, targetLength) | |
9061 | != UCOL_LESS); | |
9062 | } | |
9063 | ||
9064 | /* convenience function for comparing strings */ | |
9065 | U_CAPI UBool U_EXPORT2 | |
9066 | ucol_equal( const UCollator *coll, | |
9067 | const UChar *source, | |
9068 | int32_t sourceLength, | |
9069 | const UChar *target, | |
9070 | int32_t targetLength) | |
9071 | { | |
9072 | return (ucol_strcoll(coll, source, sourceLength, target, targetLength) | |
9073 | == UCOL_EQUAL); | |
9074 | } | |
9075 | ||
9076 | /* returns the locale name the collation data comes from */ | |
9077 | U_CAPI const char * U_EXPORT2 | |
9078 | ucol_getLocale(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) { | |
374ca955 A |
9079 | return ucol_getLocaleByType(coll, type, status); |
9080 | } | |
9081 | ||
9082 | U_CAPI const char * U_EXPORT2 | |
9083 | ucol_getLocaleByType(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) { | |
b75a7d8f A |
9084 | const char *result = NULL; |
9085 | if(status == NULL || U_FAILURE(*status)) { | |
9086 | return NULL; | |
9087 | } | |
374ca955 A |
9088 | UTRACE_ENTRY(UTRACE_UCOL_GETLOCALE); |
9089 | UTRACE_DATA1(UTRACE_INFO, "coll=%p", coll); | |
9090 | ||
b75a7d8f A |
9091 | switch(type) { |
9092 | case ULOC_ACTUAL_LOCALE: | |
9093 | // validLocale is set only if service registration has explicitly set the | |
9094 | // requested and valid locales. if this is the case, the actual locale | |
9095 | // is considered to be the valid locale. | |
9096 | if (coll->validLocale != NULL) { | |
9097 | result = coll->validLocale; | |
9098 | } else if(coll->elements != NULL) { | |
9099 | result = ures_getLocale(coll->elements, status); | |
9100 | } | |
9101 | break; | |
9102 | case ULOC_VALID_LOCALE: | |
9103 | if (coll->validLocale != NULL) { | |
9104 | result = coll->validLocale; | |
9105 | } else if(coll->rb != NULL) { | |
9106 | result = ures_getLocale(coll->rb, status); | |
374ca955 | 9107 | } |
b75a7d8f A |
9108 | break; |
9109 | case ULOC_REQUESTED_LOCALE: | |
9110 | result = coll->requestedLocale; | |
9111 | break; | |
9112 | default: | |
9113 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
9114 | } | |
374ca955 A |
9115 | UTRACE_DATA1(UTRACE_INFO, "result = %s", result); |
9116 | UTRACE_EXIT_STATUS(*status); | |
b75a7d8f A |
9117 | return result; |
9118 | } | |
9119 | ||
9120 | U_CAPI USet * U_EXPORT2 | |
374ca955 | 9121 | ucol_getTailoredSet(const UCollator *coll, UErrorCode *status) |
b75a7d8f A |
9122 | { |
9123 | if(status == NULL || U_FAILURE(*status)) { | |
9124 | return NULL; | |
9125 | } | |
374ca955 | 9126 | if(coll == NULL || coll->UCA == NULL) { |
b75a7d8f A |
9127 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
9128 | } | |
9129 | UParseError parseError; | |
9130 | UColTokenParser src; | |
9131 | int32_t rulesLen = 0; | |
9132 | const UChar *rules = ucol_getRules(coll, &rulesLen); | |
9133 | const UChar *current = NULL; | |
9134 | UBool startOfRules = TRUE; | |
9135 | // we internally use the C++ class, for the following reasons: | |
9136 | // 1. we need to utilize canonical iterator, which is a C++ only class | |
9137 | // 2. canonical iterator returns UnicodeStrings - USet cannot take them | |
9138 | // 3. USet is internally really UnicodeSet, C is just a wrapper | |
9139 | UnicodeSet *tailored = new UnicodeSet(); | |
9140 | UnicodeString pattern; | |
374ca955 A |
9141 | UnicodeString empty; |
9142 | CanonicalIterator it(empty, *status); | |
b75a7d8f A |
9143 | |
9144 | ||
9145 | // The idea is to tokenize the rule set. For each non-reset token, | |
374ca955 A |
9146 | // we add all the canonicaly equivalent FCD sequences |
9147 | ucol_tok_initTokenList(&src, rules, rulesLen, coll->UCA, status); | |
b75a7d8f A |
9148 | while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError, status)) != NULL) { |
9149 | startOfRules = FALSE; | |
9150 | if(src.parsedToken.strength != UCOL_TOK_RESET) { | |
9151 | const UChar *stuff = src.source+(src.parsedToken.charsOffset); | |
9152 | it.setSource(UnicodeString(stuff, src.parsedToken.charsLen), *status); | |
9153 | pattern = it.next(); | |
9154 | while(!pattern.isBogus()) { | |
9155 | if(Normalizer::quickCheck(pattern, UNORM_FCD, *status) != UNORM_NO) { | |
9156 | tailored->add(pattern); | |
9157 | } | |
9158 | pattern = it.next(); | |
9159 | } | |
9160 | } | |
9161 | } | |
9162 | ucol_tok_closeTokenList(&src); | |
9163 | return (USet *)tailored; | |
9164 | } | |
9165 | ||
9166 | U_CAPI UBool U_EXPORT2 | |
9167 | ucol_equals(const UCollator *source, const UCollator *target) { | |
9168 | UErrorCode status = U_ZERO_ERROR; | |
9169 | // if pointers are equal, collators are equal | |
9170 | if(source == target) { | |
9171 | return TRUE; | |
9172 | } | |
9173 | int32_t i = 0, j = 0; | |
9174 | // if any of attributes are different, collators are not equal | |
9175 | for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { | |
9176 | if(ucol_getAttribute(source, (UColAttribute)i, &status) != ucol_getAttribute(target, (UColAttribute)i, &status) || U_FAILURE(status)) { | |
9177 | return FALSE; | |
9178 | } | |
9179 | } | |
9180 | ||
9181 | int32_t sourceRulesLen = 0, targetRulesLen = 0; | |
9182 | const UChar *sourceRules = ucol_getRules(source, &sourceRulesLen); | |
9183 | const UChar *targetRules = ucol_getRules(target, &targetRulesLen); | |
9184 | ||
9185 | if(sourceRulesLen == targetRulesLen && u_strncmp(sourceRules, targetRules, sourceRulesLen) == 0) { | |
9186 | // all the attributes are equal and the rules are equal - collators are equal | |
9187 | return(TRUE); | |
374ca955 | 9188 | } |
b75a7d8f A |
9189 | // hard part, need to construct tree from rules and see if they yield the same tailoring |
9190 | UBool result = TRUE; | |
9191 | UParseError parseError; | |
9192 | UColTokenParser sourceParser, targetParser; | |
9193 | int32_t sourceListLen = 0, targetListLen = 0; | |
374ca955 A |
9194 | ucol_tok_initTokenList(&sourceParser, sourceRules, sourceRulesLen, source->UCA, &status); |
9195 | ucol_tok_initTokenList(&targetParser, targetRules, targetRulesLen, target->UCA, &status); | |
b75a7d8f A |
9196 | sourceListLen = ucol_tok_assembleTokenList(&sourceParser, &parseError, &status); |
9197 | targetListLen = ucol_tok_assembleTokenList(&targetParser, &parseError, &status); | |
9198 | ||
9199 | if(sourceListLen != targetListLen) { | |
9200 | // different number of resets | |
9201 | result = FALSE; | |
9202 | } else { | |
9203 | UColToken *sourceReset = NULL, *targetReset = NULL; | |
9204 | UChar *sourceResetString = NULL, *targetResetString = NULL; | |
9205 | int32_t sourceStringLen = 0, targetStringLen = 0; | |
9206 | for(i = 0; i < sourceListLen; i++) { | |
9207 | sourceReset = sourceParser.lh[i].reset; | |
9208 | sourceResetString = sourceParser.source+(sourceReset->source & 0xFFFFFF); | |
9209 | sourceStringLen = sourceReset->source >> 24; | |
9210 | for(j = 0; j < sourceListLen; j++) { | |
9211 | targetReset = targetParser.lh[j].reset; | |
9212 | targetResetString = targetParser.source+(targetReset->source & 0xFFFFFF); | |
9213 | targetStringLen = targetReset->source >> 24; | |
9214 | if(sourceStringLen == targetStringLen && (u_strncmp(sourceResetString, targetResetString, sourceStringLen) == 0)) { | |
9215 | sourceReset = sourceParser.lh[i].first; | |
9216 | targetReset = targetParser.lh[j].first; | |
9217 | while(sourceReset != NULL && targetReset != NULL) { | |
9218 | sourceResetString = sourceParser.source+(sourceReset->source & 0xFFFFFF); | |
9219 | sourceStringLen = sourceReset->source >> 24; | |
9220 | targetResetString = targetParser.source+(targetReset->source & 0xFFFFFF); | |
9221 | targetStringLen = targetReset->source >> 24; | |
9222 | if(sourceStringLen != targetStringLen || (u_strncmp(sourceResetString, targetResetString, sourceStringLen) != 0)) { | |
9223 | result = FALSE; | |
9224 | goto returnResult; | |
9225 | } | |
9226 | // probably also need to check the expansions | |
9227 | if(sourceReset->expansion) { | |
9228 | if(!targetReset->expansion) { | |
9229 | result = FALSE; | |
9230 | goto returnResult; | |
9231 | } else { | |
9232 | // compare expansions | |
9233 | sourceResetString = sourceParser.source+(sourceReset->expansion& 0xFFFFFF); | |
9234 | sourceStringLen = sourceReset->expansion >> 24; | |
9235 | targetResetString = targetParser.source+(targetReset->expansion & 0xFFFFFF); | |
9236 | targetStringLen = targetReset->expansion >> 24; | |
9237 | if(sourceStringLen != targetStringLen || (u_strncmp(sourceResetString, targetResetString, sourceStringLen) != 0)) { | |
9238 | result = FALSE; | |
9239 | goto returnResult; | |
9240 | } | |
9241 | } | |
9242 | } else { | |
9243 | if(targetReset->expansion) { | |
9244 | result = FALSE; | |
9245 | goto returnResult; | |
9246 | } | |
9247 | } | |
9248 | sourceReset = sourceReset->next; | |
9249 | targetReset = targetReset->next; | |
9250 | } | |
9251 | if(sourceReset != targetReset) { // at least one is not NULL | |
9252 | // there are more tailored elements in one list | |
9253 | result = FALSE; | |
9254 | goto returnResult; | |
9255 | } | |
9256 | ||
9257 | ||
9258 | break; | |
9259 | } | |
9260 | } | |
9261 | // couldn't find the reset anchor, so the collators are not equal | |
9262 | if(j == sourceListLen) { | |
9263 | result = FALSE; | |
9264 | goto returnResult; | |
9265 | } | |
9266 | } | |
9267 | } | |
9268 | ||
9269 | returnResult: | |
9270 | ucol_tok_closeTokenList(&sourceParser); | |
9271 | ucol_tok_closeTokenList(&targetParser); | |
9272 | return result; | |
9273 | ||
9274 | } | |
374ca955 A |
9275 | |
9276 | U_CAPI void U_EXPORT2 | |
9277 | ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) { | |
9278 | if(coll && coll->UCA) { | |
9279 | uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo)); | |
9280 | } | |
9281 | } | |
9282 | ||
9283 | U_CAPI int32_t U_EXPORT2 | |
9284 | ucol_cloneBinary(const UCollator *coll, | |
9285 | uint8_t *buffer, int32_t capacity, | |
9286 | UErrorCode *status) | |
9287 | { | |
9288 | int32_t length = 0; | |
9289 | if(U_FAILURE(*status)) { | |
9290 | return length; | |
9291 | } | |
9292 | if(coll->hasRealData == TRUE) { | |
9293 | length = coll->image->size; | |
9294 | if(length <= capacity) { | |
9295 | uprv_memcpy(buffer, coll->image, length); | |
9296 | } | |
9297 | } else { | |
9298 | length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); | |
9299 | if(length <= capacity) { | |
9300 | /* build the UCATableHeader with minimal entries */ | |
9301 | /* do not copy the header from the UCA file because its values are wrong! */ | |
9302 | /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ | |
9303 | ||
9304 | /* reset everything */ | |
9305 | uprv_memset(buffer, 0, length); | |
9306 | ||
9307 | /* set the tailoring-specific values */ | |
9308 | UCATableHeader *myData = (UCATableHeader *)buffer; | |
9309 | myData->size = length; | |
9310 | ||
9311 | /* offset for the options, the only part of the data that is present after the header */ | |
9312 | myData->options = sizeof(UCATableHeader); | |
9313 | ||
9314 | /* need to always set the expansion value for an upper bound of the options */ | |
9315 | myData->expansion = myData->options + sizeof(UColOptionSet); | |
9316 | ||
9317 | myData->magic = UCOL_HEADER_MAGIC; | |
9318 | myData->isBigEndian = U_IS_BIG_ENDIAN; | |
9319 | myData->charSetFamily = U_CHARSET_FAMILY; | |
9320 | ||
9321 | /* copy UCA's version; genrb will override all but the builder version with tailoring data */ | |
9322 | uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); | |
9323 | ||
9324 | uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); | |
9325 | uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); | |
9326 | uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); | |
9327 | myData->jamoSpecial = coll->image->jamoSpecial; | |
9328 | ||
9329 | /* copy the collator options */ | |
9330 | uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); | |
9331 | } | |
9332 | } | |
9333 | return length; | |
9334 | } | |
9335 | ||
9336 | U_CAPI UCollator* U_EXPORT2 | |
9337 | ucol_openBinary(const uint8_t *bin, int32_t length, | |
9338 | const UCollator *base, | |
9339 | UErrorCode *status) | |
9340 | { | |
9341 | UCollator *result = NULL; | |
9342 | if(U_FAILURE(*status)){ | |
9343 | return NULL; | |
9344 | } | |
9345 | if(base == NULL) { | |
9346 | // we don't support null base yet | |
9347 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
9348 | return NULL; | |
9349 | } | |
9350 | UCATableHeader *colData = (UCATableHeader *)bin; | |
9351 | // do we want version check here? We're trying to figure out whether collators are compatible | |
9352 | if(uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 || | |
9353 | uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0 || | |
9354 | colData->version[0] != UCOL_BUILDER_VERSION) { | |
9355 | *status = U_COLLATOR_VERSION_MISMATCH; | |
9356 | return NULL; | |
9357 | } else { | |
9358 | if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) { | |
9359 | result = ucol_initCollator((const UCATableHeader *)bin, result, base, status); | |
9360 | if(U_FAILURE(*status)){ | |
9361 | return NULL; | |
9362 | } | |
9363 | result->hasRealData = TRUE; | |
9364 | } else { | |
9365 | if(base) { | |
9366 | result = ucol_initCollator(base->image, result, base, status); | |
9367 | ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status); | |
9368 | if(U_FAILURE(*status)){ | |
9369 | return NULL; | |
9370 | } | |
9371 | result->hasRealData = FALSE; | |
9372 | } else { | |
9373 | *status = U_USELESS_COLLATOR_ERROR; | |
9374 | return NULL; | |
9375 | } | |
9376 | } | |
9377 | result->freeImageOnClose = FALSE; | |
9378 | } | |
9379 | result->validLocale = NULL; | |
9380 | result->requestedLocale = NULL; | |
9381 | result->rules = NULL; | |
9382 | result->rulesLength = 0; | |
9383 | result->freeRulesOnClose = FALSE; | |
9384 | result->rb = NULL; | |
9385 | result->elements = NULL; | |
9386 | return result; | |
9387 | } | |
9388 | ||
b75a7d8f | 9389 | #endif /* #if !UCONFIG_NO_COLLATION */ |
374ca955 | 9390 |