]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/ucol.cpp
ICU-6.2.22.tar.gz
[apple/icu.git] / icuSources / i18n / ucol.cpp
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
374ca955 3* Copyright (C) 1996-2004, International Business Machines
b75a7d8f
A
4* Corporation and others. All Rights Reserved.
5*******************************************************************************
6* file name: ucol.cpp
7* encoding: US-ASCII
8* tab size: 8 (not used)
9* indentation:4
10*
11* Modification history
12* Date Name Comments
13* 1996-1999 various members of ICU team maintained C API for collation framework
14* 02/16/2001 synwee Added internal method getPrevSpecialCE
15* 03/01/2001 synwee Added maxexpansion functionality.
16* 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant
17*/
18
19#include "unicode/utypes.h"
374ca955 20#include "ustrenum.h"
b75a7d8f
A
21#include "uassert.h"
22
23#if !UCONFIG_NO_COLLATION
24
25#include "unicode/uloc.h"
26#include "unicode/coll.h"
27#include "unicode/tblcoll.h"
28#include "unicode/coleitr.h"
29#include "unicode/unorm.h"
30#include "unicode/udata.h"
31#include "unicode/uchar.h"
32#include "unicode/caniter.h"
33
34#include "ucol_bld.h"
35#include "ucol_imp.h"
36#include "ucol_tok.h"
37#include "ucol_elm.h"
38#include "bocsu.h"
39
40#include "unormimp.h"
41#include "unorm_it.h"
42#include "uresimp.h"
43#include "umutex.h"
44#include "uhash.h"
374ca955 45#include "ucln_in.h"
b75a7d8f 46#include "cstring.h"
374ca955
A
47#include "utracimp.h"
48#include "putilimp.h"
b75a7d8f
A
49
50#ifdef UCOL_DEBUG
51#include <stdio.h>
52#endif
53
54U_NAMESPACE_USE
55
/* Constants added by synwee for trie manipulation. */
#define STAGE_1_SHIFT_            10
#define STAGE_2_SHIFT_            4
#define STAGE_2_MASK_AFTER_SHIFT_ 0x3F
#define STAGE_3_MASK_             0xF
#define LAST_BYTE_MASK_           0xFF
#define SECOND_LAST_BYTE_SHIFT_   8

/* NOTE(review): presumably the limit below which characters have combining class zero - confirm. */
#define ZERO_CC_LIMIT_            0xC0
65
374ca955
A
66// static UCA. There is only one. Collators don't use it.
67// It is referenced only in ucol_initUCA and ucol_cleanup
68static UCollator* _staticUCA = NULL;
69// static pointer to udata memory. Inited in ucol_initUCA
70// used for cleanup in ucol_cleanup
b75a7d8f
A
71static UDataMemory* UCA_DATA_MEM = NULL;
72
374ca955
A
73// this is static pointer to the normalizer fcdTrieIndex
74// it is always the same between calls to u_cleanup
75// and therefore writing to it is not synchronized.
76// It is cleaned in ucol_cleanup
77static const uint16_t *fcdTrieIndex=NULL;
b75a7d8f
A
78
79U_CDECL_BEGIN
80static UBool U_CALLCONV
81isAcceptableUCA(void * /*context*/,
82 const char * /*type*/, const char * /*name*/,
83 const UDataInfo *pInfo){
84 /* context, type & name are intentionally not used */
85 if( pInfo->size>=20 &&
86 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
87 pInfo->charsetFamily==U_CHARSET_FAMILY &&
374ca955
A
88 pInfo->dataFormat[0]==UCA_DATA_FORMAT_0 && /* dataFormat="UCol" */
89 pInfo->dataFormat[1]==UCA_DATA_FORMAT_1 &&
90 pInfo->dataFormat[2]==UCA_DATA_FORMAT_2 &&
91 pInfo->dataFormat[3]==UCA_DATA_FORMAT_3 &&
92 pInfo->formatVersion[0]==UCA_FORMAT_VERSION_0 &&
93 pInfo->formatVersion[1]>=UCA_FORMAT_VERSION_1// &&
94 //pInfo->formatVersion[1]==UCA_FORMAT_VERSION_1 &&
95 //pInfo->formatVersion[2]==UCA_FORMAT_VERSION_2 && // Too harsh
96 //pInfo->formatVersion[3]==UCA_FORMAT_VERSION_3 && // Too harsh
b75a7d8f
A
97 ) {
98 UVersionInfo UCDVersion;
99 u_getUnicodeVersion(UCDVersion);
100 if(pInfo->dataVersion[0]==UCDVersion[0] &&
101 pInfo->dataVersion[1]==UCDVersion[1]) { // &&
102 //pInfo->dataVersion[2]==ucaDataInfo.dataVersion[2] &&
103 //pInfo->dataVersion[3]==ucaDataInfo.dataVersion[3]) {
104 return TRUE;
105 } else {
106 return FALSE;
107 }
108 } else {
109 return FALSE;
110 }
111}
112
113
114static int32_t U_CALLCONV
115_getFoldingOffset(uint32_t data) {
116 return (int32_t)(data&0xFFFFFF);
117}
118
119U_CDECL_END
120
121static
122inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
123 int32_t sourceLen, collIterate *s) {
124 (s)->string = (s)->pos = (UChar *)(sourceString);
125 (s)->origFlags = 0;
126 (s)->flags = 0;
127 if (sourceLen >= 0) {
128 s->flags |= UCOL_ITER_HASLEN;
129 (s)->endp = (UChar *)sourceString+sourceLen;
130 }
131 else {
132 /* change to enable easier checking for end of string for fcdpositon */
133 (s)->endp = NULL;
134 }
135 (s)->CEpos = (s)->toReturn = (s)->CEs;
136 (s)->writableBuffer = (s)->stackWritableBuffer;
137 (s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
138 (s)->coll = (collator);
139 (s)->fcdPosition = 0;
140 if(collator->normalizationMode == UCOL_ON) {
374ca955 141 (s)->flags |= UCOL_ITER_NORM;
b75a7d8f
A
142 }
143 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
144 (s)->flags |= UCOL_HIRAGANA_Q;
145 }
146 (s)->iterator = NULL;
147 //(s)->iteratorIndex = 0;
148}
149
150U_CAPI void U_EXPORT2
151uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
152 int32_t sourceLen, collIterate *s){
153 /* Out-of-line version for use from other files. */
154 IInit_collIterate(collator, sourceString, sourceLen, s);
155}
156
157
158/**
159* Backup the state of the collIterate struct data
160* @param data collIterate to backup
161* @param backup storage
162*/
163static
164inline void backupState(const collIterate *data, collIterateState *backup)
165{
166 backup->fcdPosition = data->fcdPosition;
167 backup->flags = data->flags;
168 backup->origFlags = data->origFlags;
169 backup->pos = data->pos;
170 backup->bufferaddress = data->writableBuffer;
171 backup->buffersize = data->writableBufSize;
172 if(data->iterator != NULL) {
173 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
174 backup->iteratorIndex = data->iterator->getState(data->iterator);
175 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
176 backup->iteratorMove = 0;
177 if(backup->iteratorIndex == UITER_NO_STATE) {
178 while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
179 backup->iteratorMove++;
180 data->iterator->move(data->iterator, -1, UITER_CURRENT);
181 }
182 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
183 }
184 }
185}
186
187/**
188* Loads the state into the collIterate struct data
189* @param data collIterate to backup
190* @param backup storage
191* @param forwards boolean to indicate if forwards iteration is used,
192* false indicates backwards iteration
193*/
194static
195inline void loadState(collIterate *data, const collIterateState *backup,
196 UBool forwards)
197{
198 UErrorCode status = U_ZERO_ERROR;
199 data->flags = backup->flags;
200 data->origFlags = backup->origFlags;
201 if(data->iterator != NULL) {
202 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
203 data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
204 if(backup->iteratorMove != 0) {
205 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
206 }
207 }
208 data->pos = backup->pos;
209 if ((data->flags & UCOL_ITER_INNORMBUF) &&
210 data->writableBuffer != backup->bufferaddress) {
211 /*
212 this is when a new buffer has been reallocated and we'll have to
213 calculate the new position.
214 note the new buffer has to contain the contents of the old buffer.
215 */
216 if (forwards) {
217 data->pos = data->writableBuffer +
218 (data->pos - backup->bufferaddress);
219 }
220 else {
221 /* backwards direction */
222 uint32_t temp = backup->buffersize -
223 (data->pos - backup->bufferaddress);
224 data->pos = data->writableBuffer + (data->writableBufSize - temp);
225 }
226 }
227 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
228 /*
229 this is alittle tricky.
230 if we are initially not in the normalization buffer, even if we
231 normalize in the later stage, the data in the buffer will be
232 ignored, since we skip back up to the data string.
233 however if we are already in the normalization buffer, any
234 further normalization will pull data into the normalization
235 buffer and modify the fcdPosition.
236 since we are keeping the data in the buffer for use, the
237 fcdPosition can not be reverted back.
238 arrgghh....
239 */
240 data->fcdPosition = backup->fcdPosition;
241 }
242}
243
244
245/*
246* collIter_eos()
247* Checks for a collIterate being positioned at the end of
248* its source string.
249*
250*/
251static
252inline UBool collIter_eos(collIterate *s) {
253 if(s->flags & UCOL_USE_ITERATOR) {
254 return !(s->iterator->hasNext(s->iterator));
255 }
256 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
257 // Null terminated string, but not at null, so not at end.
258 // Whether in main or normalization buffer doesn't matter.
259 return FALSE;
260 }
261
262 // String with length. Can't be in normalization buffer, which is always
263 // null termintated.
264 if (s->flags & UCOL_ITER_HASLEN) {
265 return (s->pos == s->endp);
266 }
267
268 // We are at a null termination, could be either normalization buffer or main string.
269 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
270 // At null at end of main string.
271 return TRUE;
272 }
273
274 // At null at end of normalization buffer. Need to check whether there there are
275 // any characters left in the main buffer.
276 if(s->origFlags & UCOL_USE_ITERATOR) {
277 return !(s->iterator->hasNext(s->iterator));
278 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
279 // Null terminated main string. fcdPosition is the 'return' position into main buf.
280 return (*s->fcdPosition == 0);
281 }
282 else {
283 // Main string with an end pointer.
284 return s->fcdPosition == s->endp;
285 }
286}
287
288/*
289* collIter_bos()
290* Checks for a collIterate being positioned at the start of
291* its source string.
292*
293*/
294static
295inline UBool collIter_bos(collIterate *source) {
296 // if we're going backwards, we need to know whether there is more in the
297 // iterator, even if we are in the side buffer
298 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
299 return !source->iterator->hasPrevious(source->iterator);
300 }
301 if (source->pos <= source->string ||
302 ((source->flags & UCOL_ITER_INNORMBUF) &&
303 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
304 return TRUE;
305 }
306 return FALSE;
307}
308
309static
310inline UBool collIter_SimpleBos(collIterate *source) {
311 // if we're going backwards, we need to know whether there is more in the
312 // iterator, even if we are in the side buffer
313 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
314 return !source->iterator->hasPrevious(source->iterator);
315 }
316 if (source->pos == source->string) {
317 return TRUE;
318 }
319 return FALSE;
320}
321 //return (data->pos == data->string) ||
322
323
324/**
325* Checks and free writable buffer if it is not the original stack buffer
326* in collIterate. This function does not reassign the writable buffer.
327* @param data collIterate struct to determine and free the writable buffer
328*/
329static
330inline void freeHeapWritableBuffer(collIterate *data)
331{
332 if (data->writableBuffer != data->stackWritableBuffer) {
333 uprv_free(data->writableBuffer);
334 }
335}
336
337
338/****************************************************************************/
339/* Following are the open/close functions */
340/* */
341/****************************************************************************/
342static UCollator*
343tryOpeningFromRules(UResourceBundle *collElem, UErrorCode *status) {
344 int32_t rulesLen = 0;
345 const UChar *rules = ures_getStringByKey(collElem, "Sequence", &rulesLen, status);
346 return ucol_openRules(rules, rulesLen, UCOL_DEFAULT, UCOL_DEFAULT, NULL, status);
347
348}
349
350
351U_CAPI UCollator*
352ucol_open(const char *loc,
374ca955 353 UErrorCode *status)
b75a7d8f 354{
374ca955
A
355 UTRACE_ENTRY_OC(UTRACE_UCOL_OPEN);
356 UTRACE_DATA1(UTRACE_INFO, "locale = \"%s\"", loc);
b75a7d8f 357 UCollator *result = NULL;
374ca955
A
358
359 u_init(status);
360#if !UCONFIG_NO_SERVICE
361 result = Collator::createUCollator(loc, status);
362 if (result == NULL)
363#endif
364 {
365 result = ucol_open_internal(loc, status);
b75a7d8f 366 }
374ca955
A
367 UTRACE_EXIT_PTR_STATUS(result, *status);
368 return result;
b75a7d8f
A
369}
370
371// API in ucol_imp.h
372
373U_CFUNC UCollator*
374ucol_open_internal(const char *loc,
374ca955 375 UErrorCode *status)
b75a7d8f 376{
374ca955 377 const UCollator* UCA = ucol_initUCA(status);
b75a7d8f
A
378
379 /* New version */
380 if(U_FAILURE(*status)) return 0;
381
374ca955
A
382
383
b75a7d8f 384 UCollator *result = NULL;
374ca955
A
385 UResourceBundle *b = ures_open(U_ICUDATA_COLL, loc, status);
386
387 /* we try to find stuff from keyword */
388 UResourceBundle *collations = ures_getByKey(b, "collations", NULL, status);
389 UResourceBundle *collElem = NULL;
390 char keyBuffer[256];
391 // if there is a keyword, we pick it up and try to get elements
392 if(!uloc_getKeywordValue(loc, "collation", keyBuffer, 256, status)) {
393 // no keyword. we try to find the default setting, which will give us the keyword value
394 UResourceBundle *defaultColl = ures_getByKeyWithFallback(collations, "default", NULL, status);
395 if(U_SUCCESS(*status)) {
396 int32_t defaultKeyLen = 0;
397 const UChar *defaultKey = ures_getString(defaultColl, &defaultKeyLen, status);
398 u_UCharsToChars(defaultKey, keyBuffer, defaultKeyLen);
399 keyBuffer[defaultKeyLen] = 0;
400 } else {
401 *status = U_INTERNAL_PROGRAM_ERROR;
402 return NULL;
403 }
404 ures_close(defaultColl);
405 }
406 collElem = ures_getByKeyWithFallback(collations, keyBuffer, collElem, status);
407
b75a7d8f 408 UResourceBundle *binary = NULL;
374ca955 409 UErrorCode binaryStatus = U_ZERO_ERROR;
b75a7d8f
A
410
411 if(*status == U_MISSING_RESOURCE_ERROR) { /* We didn't find the tailoring data, we fallback to the UCA */
412 *status = U_USING_DEFAULT_WARNING;
374ca955 413 result = ucol_initCollator(UCA->image, result, UCA, status);
b75a7d8f 414 // if we use UCA, real locale is root
374ca955
A
415 result->rb = ures_open(U_ICUDATA_COLL, "", status);
416 result->elements = ures_open(U_ICUDATA_COLL, "", status);
b75a7d8f
A
417 if(U_FAILURE(*status)) {
418 goto clean;
419 }
420 ures_close(b);
421 result->hasRealData = FALSE;
374ca955 422 } else if(U_SUCCESS(*status)) {
b75a7d8f
A
423 binary = ures_getByKey(collElem, "%%CollationBin", NULL, &binaryStatus);
424
425 if(binaryStatus == U_MISSING_RESOURCE_ERROR) { /* we didn't find the binary image, we should use the rules */
426 binary = NULL;
427 result = tryOpeningFromRules(collElem, status);
428 if(U_FAILURE(*status)) {
429 goto clean;
430 }
431 } else if(U_SUCCESS(*status)) { /* otherwise, we'll pick a collation data that exists */
432 int32_t len = 0;
433 const uint8_t *inData = ures_getBinary(binary, &len, status);
434 UCATableHeader *colData = (UCATableHeader *)inData;
435 if(uprv_memcmp(colData->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
436 uprv_memcmp(colData->UCDVersion, UCA->image->UCDVersion, sizeof(UVersionInfo)) != 0 ||
437 colData->version[0] != UCOL_BUILDER_VERSION) {
438 *status = U_DIFFERENT_UCA_VERSION;
439 result = tryOpeningFromRules(collElem, status);
440 } else {
441 if(U_FAILURE(*status)){
442 goto clean;
443 }
444 if((uint32_t)len > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
374ca955 445 result = ucol_initCollator((const UCATableHeader *)inData, result, UCA, status);
b75a7d8f
A
446 if(U_FAILURE(*status)){
447 goto clean;
448 }
449 result->hasRealData = TRUE;
450 } else {
374ca955 451 result = ucol_initCollator(UCA->image, result, UCA, status);
b75a7d8f
A
452 ucol_setOptionsFromHeader(result, (UColOptionSet *)(inData+((const UCATableHeader *)inData)->options), status);
453 if(U_FAILURE(*status)){
454 goto clean;
455 }
456 result->hasRealData = FALSE;
457 }
458 result->freeImageOnClose = FALSE;
459 }
460 }
461 result->rb = b;
462 result->elements = collElem;
463 } else { /* There is another error, and we're just gonna clean up */
464clean:
465 ures_close(b);
466 ures_close(collElem);
374ca955 467 ures_close(collations);
b75a7d8f
A
468 ures_close(binary);
469 return NULL;
470 }
471
472 result->validLocale = NULL; // default is to use rb info
473
474 if(loc == NULL) {
475 loc = ures_getLocale(result->rb, status);
476 }
477 result->requestedLocale = (char *)uprv_malloc((uprv_strlen(loc)+1)*sizeof(char));
478 /* test for NULL */
479 if (result->requestedLocale == NULL) {
374ca955
A
480 *status = U_MEMORY_ALLOCATION_ERROR;
481 ures_close(b); // ??? appears needed
b75a7d8f 482 ures_close(collElem);
374ca955 483 ures_close(collations);
b75a7d8f 484 ures_close(binary); // ??? appears needed
374ca955 485 return NULL;
b75a7d8f
A
486 }
487 uprv_strcpy(result->requestedLocale, loc);
488
489 ures_close(binary);
374ca955 490 ures_close(collations); //??? we have to decide on that. Probably affects something :)
b75a7d8f
A
491 return result;
492}
493
374ca955 494
b75a7d8f
A
495U_CAPI void U_EXPORT2
496ucol_setReqValidLocales(UCollator *coll, char *requestedLocaleToAdopt, char *validLocaleToAdopt)
497{
498 if (coll) {
499 if (coll->validLocale) {
500 uprv_free(coll->validLocale);
374ca955 501 }
b75a7d8f
A
502 coll->validLocale = validLocaleToAdopt;
503 if (coll->requestedLocale) { // should always have
504 uprv_free(coll->requestedLocale);
374ca955 505 }
b75a7d8f
A
506 coll->requestedLocale = requestedLocaleToAdopt;
507 }
508}
509
510U_CAPI void U_EXPORT2
511ucol_close(UCollator *coll)
512{
374ca955
A
513 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
514 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
b75a7d8f 515 if(coll != NULL) {
374ca955
A
516 // these are always owned by each UCollator struct,
517 // so we always free them
518 if(coll->validLocale != NULL) {
519 uprv_free(coll->validLocale);
520 }
521 if(coll->requestedLocale != NULL) {
522 uprv_free(coll->requestedLocale);
523 }
b75a7d8f 524
374ca955
A
525 /* Here, it would be advisable to close: */
526 /* - UData for UCA (unless we stuff it in the root resb */
527 /* Again, do we need additional housekeeping... HMMM! */
528 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
529 if(coll->freeOnClose){
530 /* for safeClone, if freeOnClose is FALSE,
531 don't free the other instance data */
532 if(coll->freeOptionsOnClose != FALSE) {
533 if(coll->options != NULL) {
534 uprv_free(coll->options);
535 }
536 }
537 if(coll->mapping != NULL) {
538 /*ucmpe32_close(coll->mapping);*/
539 uprv_free(coll->mapping);
540 }
541 if(coll->rules != NULL && coll->freeRulesOnClose) {
542 uprv_free((UChar *)coll->rules);
543 }
544 if(coll->rb != NULL) { /* pointing to read-only memory */
545 ures_close(coll->rb);
546 }
547 if(coll->freeImageOnClose == TRUE) {
548 uprv_free((UCATableHeader *)coll->image);
549 }
550 if(coll->elements != NULL) {
551 ures_close(coll->elements);
552 }
553 if(coll->latinOneCEs != NULL) {
554 uprv_free(coll->latinOneCEs);
555 }
556 uprv_free(coll);
b75a7d8f 557 }
b75a7d8f 558 }
374ca955 559 UTRACE_EXIT();
b75a7d8f
A
560}
561
562U_CAPI UCollator* U_EXPORT2
563ucol_openRules( const UChar *rules,
564 int32_t rulesLength,
565 UColAttributeValue normalizationMode,
566 UCollationStrength strength,
567 UParseError *parseError,
568 UErrorCode *status)
569{
570 uint32_t listLen = 0;
571 UColTokenParser src;
572 UColAttributeValue norm;
573 UParseError tErr;
374ca955 574
b75a7d8f
A
575 if(status == NULL || U_FAILURE(*status)){
576 return 0;
577 }
578
374ca955
A
579 u_init(status);
580 if (U_FAILURE(*status)) {
581 return NULL;
582 }
583
b75a7d8f
A
584 if(rulesLength < -1 || (rules == NULL && rulesLength != 0)) {
585 *status = U_ILLEGAL_ARGUMENT_ERROR;
586 return 0;
587 }
588
589 if(rulesLength == -1) {
590 rulesLength = u_strlen(rules);
591 }
592
593 if(parseError == NULL){
594 parseError = &tErr;
595 }
374ca955 596
b75a7d8f
A
597 switch(normalizationMode) {
598 case UCOL_OFF:
599 case UCOL_ON:
600 case UCOL_DEFAULT:
601 norm = normalizationMode;
602 break;
603 default:
604 *status = U_ILLEGAL_ARGUMENT_ERROR;
605 return 0;
606 }
607
374ca955 608 UCollator *UCA = ucol_initUCA(status);
b75a7d8f
A
609
610 if(U_FAILURE(*status)){
611 return NULL;
612 }
613
614 ucol_tok_initTokenList(&src, rules, rulesLength, UCA, status);
615 listLen = ucol_tok_assembleTokenList(&src,parseError, status);
616
617 if(U_FAILURE(*status)) {
618 /* if status is U_ILLEGAL_ARGUMENT_ERROR, src->current points at the offending option */
619 /* if status is U_INVALID_FORMAT_ERROR, src->current points after the problematic part of the rules */
620 /* so something might be done here... or on lower level */
621#ifdef UCOL_DEBUG
622 if(*status == U_ILLEGAL_ARGUMENT_ERROR) {
623 fprintf(stderr, "bad option starting at offset %i\n", src.current-src.source);
624 } else {
625 fprintf(stderr, "invalid rule just before offset %i\n", src.current-src.source);
626 }
627#endif
628 ucol_tok_closeTokenList(&src);
629 return NULL;
630 }
631 UCollator *result = NULL;
632 UCATableHeader *table = NULL;
633
634 if(src.resultLen > 0 || src.removeSet != NULL) { /* we have a set of rules, let's make something of it */
635 /* also, if we wanted to remove some contractions, we should make a tailoring */
636 table = ucol_assembleTailoringTable(&src, status);
637 if(U_SUCCESS(*status)) {
638 // builder version
639 table->version[0] = UCOL_BUILDER_VERSION;
640 // no tailoring information on this level
641 table->version[1] = table->version[2] = table->version[3] = 0;
642 // set UCD version
643 u_getUnicodeVersion(table->UCDVersion);
644 // set UCA version
645 uprv_memcpy(table->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo));
374ca955 646 result = ucol_initCollator(table, 0, UCA, status);
b75a7d8f
A
647 result->hasRealData = TRUE;
648 result->freeImageOnClose = TRUE;
649 }
650 } else { /* no rules, but no error either */
651 // must be only options
374ca955
A
652 // We will init the collator from UCA
653 result = ucol_initCollator(UCA->image, 0, UCA, status);
b75a7d8f
A
654 // And set only the options
655 UColOptionSet *opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
656 /* test for NULL */
657 if (opts == NULL) {
658 *status = U_MEMORY_ALLOCATION_ERROR;
659 goto cleanup;
660 }
661 uprv_memcpy(opts, src.opts, sizeof(UColOptionSet));
662 ucol_setOptionsFromHeader(result, opts, status);
663 result->freeOptionsOnClose = TRUE;
664 result->hasRealData = FALSE;
665 result->freeImageOnClose = FALSE;
666 }
667
668 if(U_SUCCESS(*status)) {
669 UChar *newRules;
670 result->dataInfo.dataVersion[0] = UCOL_BUILDER_VERSION;
671 if(rulesLength > 0) {
672 newRules = (UChar *)uprv_malloc((rulesLength+1)*U_SIZEOF_UCHAR);
673 /* test for NULL */
674 if (newRules == NULL) {
675 *status = U_MEMORY_ALLOCATION_ERROR;
676 goto cleanup;
677 }
678 uprv_memcpy(newRules, rules, rulesLength*U_SIZEOF_UCHAR);
679 newRules[rulesLength]=0;
680 result->rules = newRules;
681 result->rulesLength = rulesLength;
682 result->freeRulesOnClose = TRUE;
683 }
684 result->rb = NULL;
685 result->elements = NULL;
686 result->validLocale = NULL;
687 result->requestedLocale = NULL;
688 ucol_setAttribute(result, UCOL_STRENGTH, strength, status);
689 ucol_setAttribute(result, UCOL_NORMALIZATION_MODE, norm, status);
690 } else {
691cleanup:
692 if(result != NULL) {
693 ucol_close(result);
694 } else {
695 if(table != NULL) {
696 uprv_free(table);
697 }
698 }
699 result = NULL;
700 }
701
702 ucol_tok_closeTokenList(&src);
703
704 return result;
705}
706
707/* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
708/* you should be able to get the binary chunk to write out... Doesn't look very full now */
709U_CAPI uint8_t* U_EXPORT2
710ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
711{
712 uint8_t *result = NULL;
713 if(U_FAILURE(*status)) {
714 return NULL;
715 }
716 if(coll->hasRealData == TRUE) {
717 *length = coll->image->size;
718 result = (uint8_t *)uprv_malloc(*length);
719 /* test for NULL */
720 if (result == NULL) {
721 *status = U_MEMORY_ALLOCATION_ERROR;
722 return NULL;
723 }
724 uprv_memcpy(result, coll->image, *length);
725 } else {
726 *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
727 result = (uint8_t *)uprv_malloc(*length);
728 /* test for NULL */
729 if (result == NULL) {
730 *status = U_MEMORY_ALLOCATION_ERROR;
731 return NULL;
732 }
374ca955
A
733
734 /* build the UCATableHeader with minimal entries */
735 /* do not copy the header from the UCA file because its values are wrong! */
736 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
737
738 /* reset everything */
739 uprv_memset(result, 0, *length);
740
741 /* set the tailoring-specific values */
742 UCATableHeader *myData = (UCATableHeader *)result;
743 myData->size = *length;
744
745 /* offset for the options, the only part of the data that is present after the header */
746 myData->options = sizeof(UCATableHeader);
747
748 /* need to always set the expansion value for an upper bound of the options */
749 myData->expansion = myData->options + sizeof(UColOptionSet);
750
751 myData->magic = UCOL_HEADER_MAGIC;
752 myData->isBigEndian = U_IS_BIG_ENDIAN;
753 myData->charSetFamily = U_CHARSET_FAMILY;
754
755 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
756 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
757
758 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
759 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
760 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
761 myData->jamoSpecial = coll->image->jamoSpecial;
762
763 /* copy the collator options */
b75a7d8f
A
764 uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
765 }
766 return result;
767}
768
769void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
770 if(U_FAILURE(*status)) {
771 return;
772 }
773 result->caseFirst = (UColAttributeValue)opts->caseFirst;
774 result->caseLevel = (UColAttributeValue)opts->caseLevel;
775 result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
776 result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
777 result->strength = (UColAttributeValue)opts->strength;
778 result->variableTopValue = opts->variableTopValue;
779 result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
780 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
781 result->numericCollation = (UColAttributeValue)opts->numericCollation;
782
783 result->caseFirstisDefault = TRUE;
784 result->caseLevelisDefault = TRUE;
785 result->frenchCollationisDefault = TRUE;
786 result->normalizationModeisDefault = TRUE;
787 result->strengthisDefault = TRUE;
788 result->variableTopValueisDefault = TRUE;
789 result->hiraganaQisDefault = TRUE;
790 result->numericCollationisDefault = TRUE;
791
792 ucol_updateInternalState(result, status);
793
794 result->options = opts;
795}
796
#if 0
// Disabled: doesn't look like anybody is using this.
// Copies the current attribute values of a collator into an option set.
void ucol_putOptionsToHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
    if(U_SUCCESS(*status)) {
        opts->strength = result->strength;
        opts->normalizationMode = result->normalizationMode;
        opts->frenchCollation = result->frenchCollation;
        opts->alternateHandling = result->alternateHandling;
        opts->variableTopValue = result->variableTopValue;
        opts->caseFirst = result->caseFirst;
        opts->caseLevel = result->caseLevel;
        opts->hiraganaQ = result->hiraganaQ;
        opts->numericCollation = result->numericCollation;
    }
}
#endif
814
b75a7d8f
A
815
816/**
817* Approximate determination if a character is at a contraction end.
818* Guaranteed to be TRUE if a character is at the end of a contraction,
819* otherwise it is not deterministic.
820* @param c character to be determined
821* @param coll collator
822*/
823static
824inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
825 if (UTF_IS_TRAIL(c)) {
826 return TRUE;
827 }
828
829 if (c < coll->minContrEndCP) {
830 return FALSE;
831 }
832
833 int32_t hash = c;
834 uint8_t htbyte;
835 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
836 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
837 }
838 htbyte = coll->contrEndCP[hash>>3];
839 return (((htbyte >> (hash & 7)) & 1) == 1);
840}
841
842
843
844/*
845* i_getCombiningClass()
846* A fast, at least partly inline version of u_getCombiningClass()
847* This is a candidate for further optimization. Used heavily
848* in contraction processing.
849*/
850static
851inline uint8_t i_getCombiningClass(UChar c, const UCollator *coll) {
852 uint8_t sCC = 0;
853 if (c >= 0x300 && ucol_unsafeCP(c, coll)) {
854 sCC = u_getCombiningClass(c);
855 }
856 return sCC;
857}
858
859
374ca955 860UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
b75a7d8f
A
861 UChar c;
862 UCollator *result = fillIn;
863 if(U_FAILURE(*status) || image == NULL) {
864 return NULL;
865 }
866
867 if(result == NULL) {
868 result = (UCollator *)uprv_malloc(sizeof(UCollator));
869 if(result == NULL) {
870 *status = U_MEMORY_ALLOCATION_ERROR;
871 return result;
872 }
873 result->freeOnClose = TRUE;
874 } else {
875 result->freeOnClose = FALSE;
876 }
877
878 result->image = image;
879 const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
880 /*CompactEIntArray *newUCAmapping = ucmpe32_openFromData(&mapping, status);*/
374ca955 881 UTrie *newUCAmapping = (UTrie *)uprv_malloc(sizeof(UTrie));
b75a7d8f
A
882 if(newUCAmapping != NULL) {
883 utrie_unserialize(newUCAmapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
884 } else {
885 *status = U_MEMORY_ALLOCATION_ERROR;
886 if(result->freeOnClose == TRUE) {
887 uprv_free(result);
888 result = NULL;
889 }
890 return result;
891 }
892 if(U_SUCCESS(*status)) {
893 result->mapping = newUCAmapping;
894 } else {
895 if(result->freeOnClose == TRUE) {
896 uprv_free(result);
897 result = NULL;
898 }
899 uprv_free(newUCAmapping);
900 return result;
901 }
902
903 /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/
904 result->latinOneMapping = UTRIE_GET32_LATIN1(result->mapping);
905 result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
906 result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
907 result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
908
909 result->options = (UColOptionSet*)((uint8_t*)result->image+result->image->options);
910 result->freeOptionsOnClose = FALSE;
911
912 /* set attributes */
913 result->caseFirst = (UColAttributeValue)result->options->caseFirst;
914 result->caseLevel = (UColAttributeValue)result->options->caseLevel;
915 result->frenchCollation = (UColAttributeValue)result->options->frenchCollation;
916 result->normalizationMode = (UColAttributeValue)result->options->normalizationMode;
917 result->strength = (UColAttributeValue)result->options->strength;
918 result->variableTopValue = result->options->variableTopValue;
919 result->alternateHandling = (UColAttributeValue)result->options->alternateHandling;
920 result->hiraganaQ = (UColAttributeValue)result->options->hiraganaQ;
921 result->numericCollation = (UColAttributeValue)result->options->numericCollation;
922
923 result->caseFirstisDefault = TRUE;
924 result->caseLevelisDefault = TRUE;
925 result->frenchCollationisDefault = TRUE;
926 result->normalizationModeisDefault = TRUE;
927 result->strengthisDefault = TRUE;
928 result->variableTopValueisDefault = TRUE;
929 result->alternateHandlingisDefault = TRUE;
930 result->hiraganaQisDefault = TRUE;
931 result->numericCollationisDefault = TRUE;
932
933 result->scriptOrder = NULL;
934
935 result->rules = NULL;
936 result->rulesLength = 0;
937
938 /* get the version info from UCATableHeader and populate the Collator struct*/
939 result->dataInfo.dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
940 result->dataInfo.dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
941
942 result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
943 result->minUnsafeCP = 0;
944 for (c=0; c<0x300; c++) { // Find the smallest unsafe char.
945 if (ucol_unsafeCP(c, result)) break;
946 }
947 result->minUnsafeCP = c;
948
949 result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
950 result->minContrEndCP = 0;
951 for (c=0; c<0x300; c++) { // Find the Contraction-ending char.
952 if (ucol_contractionEndCP(c, result)) break;
953 }
954 result->minContrEndCP = c;
955
956 /* max expansion tables */
957 result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
958 result->image->endExpansionCE);
959 result->lastEndExpansionCE = result->endExpansionCE +
960 result->image->endExpansionCECount - 1;
961 result->expansionCESize = (uint8_t*)result->image +
962 result->image->expansionCESize;
963
b75a7d8f
A
964
965 //result->errorCode = *status;
966
967 result->latinOneCEs = NULL;
968
969 result->latinOneRegenTable = FALSE;
970 result->latinOneFailed = FALSE;
374ca955 971 result->UCA = UCA;
b75a7d8f
A
972
973 ucol_updateInternalState(result, status);
974
975
976 return result;
977}
978
374ca955 979/* new Mark's code */
b75a7d8f 980
374ca955
A
981/**
982 * For generation of Implicit CEs
983 * @author Davis
984 *
985 * Cleaned up so that changes can be made more easily.
986 * Old values:
987# First Implicit: E26A792D
988# Last Implicit: E3DC70C0
989# First CJK: E0030300
990# Last CJK: E0A9DD00
991# First CJK_A: E0A9DF00
992# Last CJK_A: E0DE3100
993 */
b75a7d8f 994/* Following is a port of Mark's code for new treatment of implicits.
374ca955 995 * It is positioned here, since ucol_initUCA needs to initialize the
b75a7d8f
A
996 * variables below according to the data in the fractional UCA.
997 */
374ca955 998
b75a7d8f 999/**
374ca955
A
1000 * Function used to:
1001 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
1002 * b) bump any non-CJK characters by 10FFFF.
1003 * The relevant blocks are:
1004 * A: 4E00..9FFF; CJK Unified Ideographs
1005 * F900..FAFF; CJK Compatibility Ideographs
1006 * B: 3400..4DBF; CJK Unified Ideographs Extension A
1007 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
1008 * As long as
1009 * no new B characters are allocated between 4E00 and FAFF, and
1010 * no new A characters are outside of this range,
1011 * (very high probability) this simple code will work.
1012 * The reordered blocks are:
1013 * Block1 is CJK
1014 * Block2 is CJK_COMPAT_USED
1015 * Block3 is CJK_A
1016 * (all contiguous)
1017 * Any other CJK gets its normal code point
1018 * Any non-CJK gets +10FFFF
1019 * When we reorder Block1, we make sure that it is at the very start,
1020 * so that it will use a 3-byte form.
1021 * Warning: we only pick up the compatibility characters that are
1022 * NOT decomposed, so that block is smaller!
1023 */
b75a7d8f
A
1024
1025// CONSTANTS
374ca955 1026static const UChar32
b75a7d8f 1027 NON_CJK_OFFSET = 0x110000,
374ca955
A
1028 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
1029
/**
 * Layout parameters for implicit weights.
 * Every value starts at zero and is filled in by initImplicitConstants().
 */
static int32_t final3Multiplier = 0;  /* gap3 + 1: spacing of final bytes in 3-byte forms */
static int32_t final4Multiplier = 0;  /* gap4 + 1: spacing of final bytes in 4-byte forms */
static int32_t final3Count = 0;       /* number of final-byte values used by 3-byte forms */
static int32_t final4Count = 0;       /* number of final-byte values used by 4-byte forms */
static int32_t medialCount = 0;       /* maxTrail - minTrail + 1 */
static int32_t min3Primary = 0;       /* first lead byte (start of the 3-byte forms) */
static int32_t min4Primary = 0;       /* first lead byte of the 4-byte forms */
static int32_t max4Primary = 0;       /* last lead byte */
static int32_t minTrail = 0;          /* smallest trail byte */
static int32_t maxTrail = 0;          /* largest trail byte */
static int32_t max3Trail = 0;         /* largest final byte of a 3-byte form */
static int32_t max4Trail = 0;         /* largest final byte of a 4-byte form */
static int32_t min4Boundary = 0;      /* first raw value that needs a 4-byte form */
b75a7d8f
A
1047
1048static const UChar32
1049 CJK_BASE = 0x4E00,
1050 CJK_LIMIT = 0x9FFF+1,
1051 CJK_COMPAT_USED_BASE = 0xFA0E,
1052 CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
1053 CJK_A_BASE = 0x3400,
1054 CJK_A_LIMIT = 0x4DBF+1,
1055 CJK_B_BASE = 0x20000,
1056 CJK_B_LIMIT = 0x2A6DF+1;
1057
374ca955
A
1058static UChar32 swapCJK(UChar32 i) {
1059
1060 if (i >= CJK_BASE) {
1061 if (i < CJK_LIMIT) return i - CJK_BASE;
1062
1063 if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET;
1064
1065 if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE
1066 + (CJK_LIMIT - CJK_BASE);
1067 if (i < CJK_B_BASE) return i + NON_CJK_OFFSET;
1068
1069 if (i < CJK_B_LIMIT) return i; // non-BMP-CJK
1070
1071 return i + NON_CJK_OFFSET; // non-CJK
1072 }
1073 if (i < CJK_A_BASE) return i + NON_CJK_OFFSET;
1074
1075 if (i < CJK_A_LIMIT) return i - CJK_A_BASE
1076 + (CJK_LIMIT - CJK_BASE)
1077 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1078 return i + NON_CJK_OFFSET; // non-CJK
1079}
1080
1081U_CAPI UChar32 U_EXPORT2
1082uprv_uca_getRawFromCodePoint(UChar32 i) {
1083 return swapCJK(i)+1;
1084}
1085
1086U_CAPI UChar32 U_EXPORT2
1087uprv_uca_getCodePointFromRaw(UChar32 i) {
1088 i--;
1089 UChar32 result = 0;
1090 if(i >= NON_CJK_OFFSET) {
1091 result = i - NON_CJK_OFFSET;
1092 } else if(i >= CJK_B_BASE) {
1093 result = i;
1094 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
1095 if(i < CJK_LIMIT - CJK_BASE) {
1096 result = i + CJK_BASE;
1097 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
1098 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
1099 } else {
1100 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1101 }
1102 } else {
1103 result = -1;
1104 }
1105 return result;
b75a7d8f 1106}
b75a7d8f
A
1107
1108// GET IMPLICIT PRIMARY WEIGHTS
1109// Return value is left justified primary key
374ca955
A
1110U_CAPI uint32_t U_EXPORT2
1111uprv_uca_getImplicitFromRaw(UChar32 cp) {
1112 /*
1113 if (cp < 0 || cp > UCOL_MAX_INPUT) {
1114 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
1115 }
1116 */
1117 int32_t last0 = cp - min4Boundary;
1118 if (last0 < 0) {
1119 int32_t last1 = cp / final3Count;
1120 last0 = cp % final3Count;
b75a7d8f 1121
374ca955
A
1122 int32_t last2 = last1 / medialCount;
1123 last1 %= medialCount;
b75a7d8f 1124
374ca955
A
1125 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
1126 last1 = minTrail + last1; // offset
1127 last2 = min3Primary + last2; // offset
b75a7d8f 1128 /*
374ca955
A
1129 if (last2 >= min4Primary) {
1130 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
1131 }
b75a7d8f 1132 */
374ca955 1133 return (last2 << 24) + (last1 << 16) + (last0 << 8);
b75a7d8f 1134 } else {
374ca955
A
1135 int32_t last1 = last0 / final4Count;
1136 last0 %= final4Count;
b75a7d8f 1137
374ca955
A
1138 int32_t last2 = last1 / medialCount;
1139 last1 %= medialCount;
1140
1141 int32_t last3 = last2 / medialCount;
1142 last2 %= medialCount;
1143
1144 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
1145 last1 = minTrail + last1; // offset
1146 last2 = minTrail + last2; // offset
1147 last3 = min4Primary + last3; // offset
b75a7d8f 1148 /*
374ca955
A
1149 if (last3 > max4Primary) {
1150 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
1151 }
b75a7d8f 1152 */
374ca955 1153 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
b75a7d8f
A
1154 }
1155}
1156
374ca955
A
1157U_CAPI uint32_t U_EXPORT2
1158uprv_uca_getImplicitPrimary(UChar32 cp) {
1159 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1160
1161 cp = swapCJK(cp);
1162 cp++;
1163 // we now have a range of numbers from 0 to 21FFFF.
1164
1165 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1166
1167 return uprv_uca_getImplicitFromRaw(cp);
1168}
1169
1170/**
1171 * Converts implicit CE into raw integer ("code point")
1172 * @param implicit
1173 * @return -1 if illegal format
1174 */
1175U_CAPI UChar32 U_EXPORT2
1176uprv_uca_getRawFromImplicit(uint32_t implicit) {
1177 UChar32 result;
1178 UChar32 b3 = implicit & 0xFF;
1179 implicit >>= 8;
1180 UChar32 b2 = implicit & 0xFF;
1181 implicit >>= 8;
1182 UChar32 b1 = implicit & 0xFF;
1183 implicit >>= 8;
1184 UChar32 b0 = implicit & 0xFF;
1185
1186 // simple parameter checks
1187 if (b0 < min3Primary || b0 > max4Primary
1188 || b1 < minTrail || b1 > maxTrail) return -1;
1189 // normal offsets
1190 b1 -= minTrail;
1191
1192 // take care of the final values, and compose
1193 if (b0 < min4Primary) {
1194 if (b2 < minTrail || b2 > max3Trail || b3 != 0) return -1;
1195 b2 -= minTrail;
1196 UChar32 remainder = b2 % final3Multiplier;
1197 if (remainder != 0) return -1;
1198 b0 -= min3Primary;
1199 b2 /= final3Multiplier;
1200 result = ((b0 * medialCount) + b1) * final3Count + b2;
1201 } else {
1202 if (b2 < minTrail || b2 > maxTrail
1203 || b3 < minTrail || b3 > max4Trail) return -1;
1204 b2 -= minTrail;
1205 b3 -= minTrail;
1206 UChar32 remainder = b3 % final4Multiplier;
1207 if (remainder != 0) return -1;
1208 b3 /= final4Multiplier;
1209 b0 -= min4Primary;
1210 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1211 }
1212 // final check
1213 if (result < 0 || result > UCOL_MAX_INPUT) return -1;
1214 return result;
1215}
1216
1217
/* Returns 1 + (a-1)/b, which is a/b rounded up when a and b are positive. */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int predecessorQuotient = (a - 1) / b;
    return predecessorQuotient + 1;
}
1221
b75a7d8f
A
1222/* this function is either called from initUCA or from genUCA before
1223 * doing canonical closure for the UCA.
1224 */
374ca955
A
1225
1226/**
1227 * Set up to generate implicits.
1228 * @param minPrimary
1229 * @param maxPrimary
1230 * @param minTrail final byte
1231 * @param maxTrail final byte
1232 * @param gap3 the gap we leave for tailoring for 3-byte forms
1233 * @param gap4 the gap we leave for tailoring for 4-byte forms
1234 */
1235static void initImplicitConstants(int minPrimary, int maxPrimary,
1236 int minTrailIn, int maxTrailIn,
1237 int gap3, int primaries3count,
1238 UErrorCode *status) {
1239 // some simple parameter checks
1240 if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) {
1241 *status = U_ILLEGAL_ARGUMENT_ERROR;
1242 return;
1243 };
1244 if (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) {
1245 *status = U_ILLEGAL_ARGUMENT_ERROR;
1246 return;
1247 };
1248 if (primaries3count < 1) {
1249 *status = U_ILLEGAL_ARGUMENT_ERROR;
1250 return;
1251 };
1252
1253 minTrail = minTrailIn;
1254 maxTrail = maxTrailIn;
1255
1256 min3Primary = minPrimary;
1257 max4Primary = maxPrimary;
1258 // compute constants for use later.
1259 // number of values we can use in trailing bytes
1260 // leave room for empty values between AND above, e.g. if gap = 2
1261 // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1262 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1263 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1264 final3Multiplier = gap3 + 1;
1265 final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1266 max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1267
1268 // medials can use full range
1269 medialCount = (maxTrail - minTrail + 1);
1270 // find out how many values fit in each form
1271 int32_t threeByteCount = medialCount * final3Count;
1272 // now determine where the 3/4 boundary is.
1273 // we use 3 bytes below the boundary, and 4 above
1274 int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1275 int32_t primaries4count = primariesAvailable - primaries3count;
1276
1277
1278 int32_t min3ByteCoverage = primaries3count * threeByteCount;
1279 min4Primary = minPrimary + primaries3count;
1280 min4Boundary = min3ByteCoverage;
1281 // Now expand out the multiplier for the 4 bytes, and redo.
1282
1283 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1284 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
1285 //if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte);
1286 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
1287 //if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte);
1288 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1289 //if (DEBUG) System.out.println("expandedGap: " + gap4);
1290 if (gap4 < 1) {
1291 *status = U_ILLEGAL_ARGUMENT_ERROR;
1292 return;
1293 }
1294 final4Multiplier = gap4 + 1;
1295 final4Count = neededPerFinalByte;
1296 max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1297 /*
1298 if (DEBUG) {
1299 System.out.println("final4Count: " + final4Count);
1300 for (int counter = 0; counter <= final4Count; ++counter) {
1301 int value = minTrail + (1 + counter)*final4Multiplier;
1302 System.out.println(counter + "\t" + value + "\t" + Utility.hex(value));
1303 }
1304 }
1305 */
1306}
1307
1308 /**
1309 * Supply parameters for generating implicit CEs
1310 */
1311U_CAPI void U_EXPORT2
1312uprv_uca_initImplicitConstants(int32_t minPrimary, int32_t maxPrimary, UErrorCode *status) {
1313 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1314 initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1315}
1316
1317U_CDECL_BEGIN
1318static UBool U_CALLCONV
1319ucol_cleanup(void)
b75a7d8f 1320{
374ca955
A
1321 if (UCA_DATA_MEM) {
1322 udata_close(UCA_DATA_MEM);
1323 UCA_DATA_MEM = NULL;
1324 }
1325 if (_staticUCA) {
1326 ucol_close(_staticUCA);
1327 _staticUCA = NULL;
1328 }
1329 fcdTrieIndex = NULL;
1330 return TRUE;
b75a7d8f 1331}
374ca955
A
1332U_CDECL_END
1333
b75a7d8f
A
1334/* do not close UCA returned by ucol_initUCA! */
1335UCollator *
1336ucol_initUCA(UErrorCode *status) {
1337 if(U_FAILURE(*status)) {
1338 return NULL;
1339 }
1340 umtx_lock(NULL);
374ca955 1341 UBool f = (_staticUCA == NULL);
b75a7d8f 1342 umtx_unlock(NULL);
374ca955 1343
b75a7d8f
A
1344 if(f) {
1345 UCollator *newUCA = NULL;
1346 UDataMemory *result = udata_openChoice(NULL, UCA_DATA_TYPE, UCA_DATA_NAME, isAcceptableUCA, NULL, status);
374ca955 1347
b75a7d8f
A
1348 if(U_FAILURE(*status)) {
1349 if (result) {
1350 udata_close(result);
1351 }
1352 uprv_free(newUCA);
1353 }
374ca955
A
1354
1355 // init FCD data
1356 if (fcdTrieIndex == NULL) {
1357 fcdTrieIndex = unorm_getFCDTrie(status);
1358 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
1359 }
1360
b75a7d8f 1361 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
374ca955 1362 newUCA = ucol_initCollator((const UCATableHeader *)udata_getMemory(result), newUCA, newUCA, status);
b75a7d8f
A
1363 if(U_SUCCESS(*status)){
1364 newUCA->rb = NULL;
374ca955
A
1365 newUCA->elements = NULL;
1366 newUCA->validLocale = NULL;
1367 newUCA->requestedLocale = NULL;
1368 newUCA->hasRealData = FALSE; // real data lives in .dat file...
b75a7d8f
A
1369 newUCA->freeImageOnClose = FALSE;
1370 umtx_lock(NULL);
374ca955
A
1371 if(_staticUCA == NULL) {
1372 _staticUCA = newUCA;
b75a7d8f
A
1373 UCA_DATA_MEM = result;
1374 result = NULL;
1375 newUCA = NULL;
1376 }
1377 umtx_unlock(NULL);
374ca955 1378
b75a7d8f
A
1379 if(newUCA != NULL) {
1380 udata_close(result);
1381 uprv_free(newUCA);
1382 }
1383 else {
374ca955 1384 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
b75a7d8f
A
1385 }
1386 // Initalize variables for implicit generation
374ca955
A
1387 const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)_staticUCA->image + _staticUCA->image->UCAConsts);
1388 uprv_uca_initImplicitConstants(UCAconsts->UCA_PRIMARY_IMPLICIT_MIN, UCAconsts->UCA_PRIMARY_IMPLICIT_MAX, status);
1389 _staticUCA->mapping->getFoldingOffset = _getFoldingOffset;
b75a7d8f
A
1390 }else{
1391 udata_close(result);
1392 uprv_free(newUCA);
374ca955 1393 _staticUCA= NULL;
b75a7d8f
A
1394 }
1395 }
1396 }
374ca955 1397 return _staticUCA;
b75a7d8f
A
1398}
1399
1400
1401/* collIterNormalize Incremental Normalization happens here. */
1402/* pick up the range of chars identifed by FCD, */
1403/* normalize it into the collIterate's writable buffer, */
1404/* switch the collIterate's state to use the writable buffer. */
1405/* */
1406static
1407void collIterNormalize(collIterate *collationSource)
1408{
1409 UErrorCode status = U_ZERO_ERROR;
1410
1411 int32_t normLen;
1412 UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */
1413 UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */
1414
1415 normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1416 srcP, (int32_t)(endP - srcP),
1417 FALSE, 0,
1418 &status);
1419 if(status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) {
1420 // reallocate and terminate
1421 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1422 &collationSource->writableBuffer,
1423 (int32_t *)&collationSource->writableBufSize, normLen + 1,
1424 0)
1425 ) {
1426#ifdef UCOL_DEBUG
1427 fprintf(stderr, "collIterNormalize(), out of memory\n");
1428#endif
1429 return;
1430 }
1431 status = U_ZERO_ERROR;
1432 normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1433 srcP, (int32_t)(endP - srcP),
1434 FALSE, 0,
1435 &status);
1436 }
1437 if (U_FAILURE(status)) {
1438#ifdef UCOL_DEBUG
1439 fprintf(stderr, "collIterNormalize(), unorm_decompose() failed, status = %s\n", u_errorName(status));
1440#endif
1441 return;
1442 }
1443
1444 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1445 collationSource->flags |= UCOL_ITER_ALLOCATED;
1446 }
1447 collationSource->pos = collationSource->writableBuffer;
1448 collationSource->origFlags = collationSource->flags;
1449 collationSource->flags |= UCOL_ITER_INNORMBUF;
1450 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1451}
1452
1453
1454// This function takes the iterator and extracts normalized stuff up to the next boundary
1455// It is similar in the end results to the collIterNormalize, but for the cases when we
1456// use an iterator
1457static
1458inline void normalizeIterator(collIterate *collationSource) {
1459 UErrorCode status = U_ZERO_ERROR;
1460 UBool wasNormalized = FALSE;
1461 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1462 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
374ca955 1463 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
b75a7d8f
A
1464 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1465 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1466 // reallocate and terminate
1467 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1468 &collationSource->writableBuffer,
1469 (int32_t *)&collationSource->writableBufSize, normLen + 1,
1470 0)
1471 ) {
1472 #ifdef UCOL_DEBUG
1473 fprintf(stderr, "normalizeIterator(), out of memory\n");
1474 #endif
1475 return;
1476 }
1477 status = U_ZERO_ERROR;
1478 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1479 collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
374ca955 1480 normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
b75a7d8f
A
1481 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1482 }
1483 // Terminate the buffer - we already checked that it is big enough
374ca955 1484 collationSource->writableBuffer[normLen] = 0;
b75a7d8f
A
1485 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1486 collationSource->flags |= UCOL_ITER_ALLOCATED;
1487 }
1488 collationSource->pos = collationSource->writableBuffer;
1489 collationSource->origFlags = collationSource->flags;
1490 collationSource->flags |= UCOL_ITER_INNORMBUF;
1491 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1492}
1493
1494
1495/* Incremental FCD check and normalize */
1496/* Called from getNextCE when normalization state is suspect. */
1497/* When entering, the state is known to be this: */
1498/* o We are working in the main buffer of the collIterate, not the side */
1499/* writable buffer. When in the side buffer, normalization mode is always off, */
1500/* so we won't get here. */
1501/* o The leading combining class from the current character is 0 or */
1502/* the trailing combining class of the previous char was zero. */
1503/* True because the previous call to this function will have always exited */
1504/* that way, and we get called for every char where cc might be non-zero. */
1505static
1506inline UBool collIterFCD(collIterate *collationSource) {
1507 UChar c, c2;
1508 const UChar *srcP, *endP;
1509 uint8_t leadingCC;
1510 uint8_t prevTrailingCC = 0;
1511 uint16_t fcd;
1512 UBool needNormalize = FALSE;
1513
1514 srcP = collationSource->pos-1;
1515
1516 if (collationSource->flags & UCOL_ITER_HASLEN) {
1517 endP = collationSource->endp;
1518 } else {
1519 endP = NULL;
1520 }
1521
1522 // Get the trailing combining class of the current character. If it's zero,
1523 // we are OK.
1524 c = *srcP++;
1525 /* trie access */
1526 fcd = unorm_getFCD16(fcdTrieIndex, c);
1527 if (fcd != 0) {
1528 if (UTF_IS_FIRST_SURROGATE(c)) {
1529 if ((endP == NULL || srcP != endP) && UTF_IS_SECOND_SURROGATE(c2=*srcP)) {
1530 ++srcP;
1531 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
1532 } else {
1533 fcd = 0;
1534 }
1535 }
1536
1537 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1538
1539 if (prevTrailingCC != 0) {
1540 // The current char has a non-zero trailing CC. Scan forward until we find
1541 // a char with a leading cc of zero.
1542 while (endP == NULL || srcP != endP)
1543 {
1544 const UChar *savedSrcP = srcP;
1545
1546 c = *srcP++;
1547 /* trie access */
1548 fcd = unorm_getFCD16(fcdTrieIndex, c);
1549 if (fcd != 0 && UTF_IS_FIRST_SURROGATE(c)) {
1550 if ((endP == NULL || srcP != endP) && UTF_IS_SECOND_SURROGATE(c2=*srcP)) {
1551 ++srcP;
1552 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
1553 } else {
1554 fcd = 0;
1555 }
1556 }
1557 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1558 if (leadingCC == 0) {
1559 srcP = savedSrcP; // Hit char that is not part of combining sequence.
1560 // back up over it. (Could be surrogate pair!)
1561 break;
1562 }
1563
1564 if (leadingCC < prevTrailingCC) {
1565 needNormalize = TRUE;
1566 }
1567
1568 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1569 }
1570 }
1571 }
1572
1573 collationSource->fcdPosition = (UChar *)srcP;
1574
1575 return needNormalize;
1576}
1577
1578/****************************************************************************/
1579/* Following are the CE retrieval functions */
1580/* */
1581/****************************************************************************/
1582
1583/* there should be a macro version of this function in the header file */
1584/* This is the first function that tries to fetch a collation element */
1585/* If it's not succesfull or it encounters a more difficult situation */
1586/* some more sofisticated and slower functions are invoked */
1587static
1588inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1589 uint32_t order = 0;
1590 if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */
1591 order = *(collationSource->toReturn++); /* if so, return them */
1592 if(collationSource->CEpos == collationSource->toReturn) {
1593 collationSource->CEpos = collationSource->toReturn = collationSource->CEs;
1594 }
1595 return order;
1596 }
1597
1598 UChar ch = 0;
1599
1600 for (;;) /* Loop handles case when incremental normalize switches */
1601 { /* to or from the side buffer / original string, and we */
1602 /* need to start again to get the next character. */
1603
1604 if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
1605 {
1606 // The source string is null terminated and we're not working from the side buffer,
1607 // and we're not normalizing. This is the fast path.
1608 // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1609 ch = *collationSource->pos++;
1610 if (ch != 0) {
1611 break;
1612 }
1613 else {
1614 return UCOL_NO_MORE_CES;
1615 }
1616 }
1617
1618 if (collationSource->flags & UCOL_ITER_HASLEN) {
1619 // Normal path for strings when length is specified.
1620 // (We can't be in side buffer because it is always null terminated.)
1621 if (collationSource->pos >= collationSource->endp) {
1622 // Ran off of the end of the main source string. We're done.
1623 return UCOL_NO_MORE_CES;
1624 }
1625 ch = *collationSource->pos++;
1626 }
1627 else if(collationSource->flags & UCOL_USE_ITERATOR) {
1628 UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
1629 if(iterCh == U_SENTINEL) {
1630 return UCOL_NO_MORE_CES;
1631 }
1632 ch = (UChar)iterCh;
1633 }
1634 else
1635 {
1636 // Null terminated string.
1637 ch = *collationSource->pos++;
1638 if (ch == 0) {
1639 // Ran off end of buffer.
1640 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1641 // Ran off end of main string. backing up one character.
1642 collationSource->pos--;
1643 return UCOL_NO_MORE_CES;
1644 }
1645 else
1646 {
1647 // Hit null in the normalize side buffer.
1648 // Usually this means the end of the normalized data,
1649 // except for one odd case: a null followed by combining chars,
1650 // which is the case if we are at the start of the buffer.
1651 if (collationSource->pos == collationSource->writableBuffer+1) {
1652 break;
1653 }
1654
1655 // Null marked end of side buffer.
1656 // Revert to the main string and
1657 // loop back to top to try again to get a character.
1658 collationSource->pos = collationSource->fcdPosition;
1659 collationSource->flags = collationSource->origFlags;
1660 continue;
1661 }
1662 }
1663 }
1664
1665 if(collationSource->flags&UCOL_HIRAGANA_Q) {
1666 if((ch>=0x3040 && ch<=0x3094) || ch == 0x309d || ch == 0x309e) {
1667 collationSource->flags |= UCOL_WAS_HIRAGANA;
1668 } else {
1669 collationSource->flags &= ~UCOL_WAS_HIRAGANA;
1670 }
1671 }
1672
1673 // We've got a character. See if there's any fcd and/or normalization stuff to do.
1674 // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1675 if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
1676 break;
1677 }
1678
1679 if (collationSource->fcdPosition >= collationSource->pos) {
1680 // An earlier FCD check has already covered the current character.
1681 // We can go ahead and process this char.
1682 break;
1683 }
1684
1685 if (ch < ZERO_CC_LIMIT_ ) {
1686 // Fast fcd safe path. Trailing combining class == 0. This char is OK.
1687 break;
1688 }
1689
1690 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1691 // We need to peek at the next character in order to tell if we are FCD
1692 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
1693 // We are at the last char of source string.
1694 // It is always OK for FCD check.
1695 break;
1696 }
1697
1698 // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test
1699 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
1700 break;
1701 }
1702 }
1703
1704
1705 // Need a more complete FCD check and possible normalization.
1706 if (collIterFCD(collationSource)) {
1707 collIterNormalize(collationSource);
1708 }
1709 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1710 // No normalization was needed. Go ahead and process the char we already had.
1711 break;
1712 }
1713
1714 // Some normalization happened. Next loop iteration will pick up a char
1715 // from the normalization buffer.
1716
1717 } // end for (;;)
1718
1719
1720 if (ch <= 0xFF) {
1721 /* For latin-1 characters we never need to fall back to the UCA table */
1722 /* because all of the UCA data is replicated in the latinOneMapping array */
1723 order = coll->latinOneMapping[ch];
1724 if (order > UCOL_NOT_FOUND) {
1725 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
1726 }
1727 }
1728 else
1729 {
1730 order = UTRIE_GET32_FROM_LEAD(coll->mapping, ch);
1731 if(order > UCOL_NOT_FOUND) { /* if a CE is special */
1732 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */
1733 }
374ca955 1734 if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */
b75a7d8f 1735 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
374ca955 1736 order = UTRIE_GET32_FROM_LEAD(coll->UCA->mapping, ch);
b75a7d8f
A
1737
1738 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
374ca955 1739 order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
b75a7d8f
A
1740 }
1741 }
1742 }
1743 return order; /* return the CE */
1744}
1745
1746/* ucol_getNextCE, out-of-line version for use from other files. */
1747U_CAPI uint32_t U_EXPORT2
1748ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1749 return ucol_IGetNextCE(coll, collationSource, status);
374ca955 1750}
b75a7d8f
A
1751
1752
1753/**
1754* Incremental previous normalization happens here. Pick up the range of chars
1755* identifed by FCD, normalize it into the collIterate's writable buffer,
1756* switch the collIterate's state to use the writable buffer.
1757* @param data collation iterator data
1758*/
1759static
1760void collPrevIterNormalize(collIterate *data)
1761{
1762 UErrorCode status = U_ZERO_ERROR;
1763 UChar *pEnd = data->pos; /* End normalize + 1 */
1764 UChar *pStart;
1765 uint32_t normLen;
1766 UChar *pStartNorm;
1767
1768 /* Start normalize */
1769 if (data->fcdPosition == NULL) {
1770 pStart = data->string;
1771 }
1772 else {
1773 pStart = data->fcdPosition + 1;
1774 }
1775
1776 normLen = unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0,
1777 data->writableBuffer, 0, &status);
1778
1779 if (data->writableBufSize <= normLen) {
1780 freeHeapWritableBuffer(data);
1781 data->writableBuffer = (UChar *)uprv_malloc((normLen + 1) *
1782 sizeof(UChar));
1783 if(data->writableBuffer == NULL) { // something is wrong here, return
1784 return;
1785 }
1786 data->flags |= UCOL_ITER_ALLOCATED;
1787 /* to handle the zero termination */
1788 data->writableBufSize = normLen + 1;
1789 }
1790 status = U_ZERO_ERROR;
1791 /*
1792 this puts the null termination infront of the normalized string instead
1793 of the end
1794 */
1795 pStartNorm = data->writableBuffer + (data->writableBufSize - normLen);
1796 *(pStartNorm - 1) = 0;
1797 unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm,
1798 normLen, &status);
1799
1800 data->pos = data->writableBuffer + data->writableBufSize;
1801 data->origFlags = data->flags;
1802 data->flags |= UCOL_ITER_INNORMBUF;
1803 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1804}
1805
1806
1807/**
1808* Incremental FCD check for previous iteration and normalize. Called from
1809* getPrevCE when normalization state is suspect.
1810* When entering, the state is known to be this:
1811* o We are working in the main buffer of the collIterate, not the side
1812* writable buffer. When in the side buffer, normalization mode is always
1813* off, so we won't get here.
1814* o The leading combining class from the current character is 0 or the
1815* trailing combining class of the previous char was zero.
1816* True because the previous call to this function will have always exited
1817* that way, and we get called for every char where cc might be non-zero.
1818* @param data collation iterate struct
1819* @return normalization status, TRUE for normalization to be done, FALSE
1820* otherwise
1821*/
1822static
1823inline UBool collPrevIterFCD(collIterate *data)
1824{
1825 const UChar *src, *start;
1826 UChar c, c2;
1827 uint8_t leadingCC;
1828 uint8_t trailingCC = 0;
1829 uint16_t fcd;
1830 UBool result = FALSE;
1831
1832 start = data->string;
1833 src = data->pos + 1;
1834
1835 /* Get the trailing combining class of the current character. */
1836 c = *--src;
1837 if (!UTF_IS_SURROGATE(c)) {
1838 fcd = unorm_getFCD16(fcdTrieIndex, c);
1839 } else if (UTF_IS_SECOND_SURROGATE(c) && start < src && UTF_IS_FIRST_SURROGATE(c2 = *(src - 1))) {
1840 --src;
1841 fcd = unorm_getFCD16(fcdTrieIndex, c2);
1842 if (fcd != 0) {
1843 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
1844 }
1845 } else /* unpaired surrogate */ {
1846 fcd = 0;
1847 }
1848
1849 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1850
1851 if (leadingCC != 0) {
1852 /*
1853 The current char has a non-zero leading combining class.
1854 Scan backward until we find a char with a trailing cc of zero.
1855 */
1856 for (;;)
1857 {
1858 if (start == src) {
1859 data->fcdPosition = NULL;
1860 return result;
1861 }
1862
1863 c = *--src;
1864 if (!UTF_IS_SURROGATE(c)) {
1865 fcd = unorm_getFCD16(fcdTrieIndex, c);
1866 } else if (UTF_IS_SECOND_SURROGATE(c) && start < src && UTF_IS_FIRST_SURROGATE(c2 = *(src - 1))) {
1867 --src;
1868 fcd = unorm_getFCD16(fcdTrieIndex, c2);
1869 if (fcd != 0) {
1870 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
1871 }
1872 } else /* unpaired surrogate */ {
1873 fcd = 0;
1874 }
1875
1876 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1877
1878 if (trailingCC == 0) {
1879 break;
1880 }
1881
1882 if (leadingCC < trailingCC) {
1883 result = TRUE;
1884 }
1885
1886 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1887 }
1888 }
1889
1890 data->fcdPosition = (UChar *)src;
1891
1892 return result;
1893}
1894
1895/** gets a character from the string at a given offset
1896 * Handles both normal and iterative cases.
1897 * No error checking - caller beware!
1898 */
374ca955 1899inline static
b75a7d8f
A
1900UChar peekCharacter(collIterate *source, int32_t offset) {
1901 if(source->pos != NULL) {
1902 return *(source->pos + offset);
1903 } else if(source->iterator != NULL) {
1904 if(offset != 0) {
1905 source->iterator->move(source->iterator, offset, UITER_CURRENT);
1906 UChar toReturn = (UChar)source->iterator->next(source->iterator);
1907 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1908 return toReturn;
1909 } else {
1910 return (UChar)source->iterator->current(source->iterator);
1911 }
1912 } else {
1913 return (UChar)U_SENTINEL;
1914 }
1915}
1916
1917/**
1918* Determines if we are at the start of the data string in the backwards
1919* collation iterator
1920* @param data collation iterator
1921* @return TRUE if we are at the start
1922*/
1923static
1924inline UBool isAtStartPrevIterate(collIterate *data) {
1925 if(data->pos == NULL && data->iterator != NULL) {
1926 return !data->iterator->hasPrevious(data->iterator);
1927 }
1928 //return (collIter_bos(data)) ||
1929 return (data->pos == data->string) ||
1930 ((data->flags & UCOL_ITER_INNORMBUF) &&
1931 *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1932}
1933
374ca955
A
1934static
1935inline void goBackOne(collIterate *data) {
1936# if 0
1937 // somehow, it looks like we need to keep iterator synced up
1938 // at all times, as above.
1939 if(data->pos) {
1940 data->pos--;
1941 }
1942 if(data->iterator) {
1943 data->iterator->previous(data->iterator);
1944 }
1945#endif
1946 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1947 data->iterator->previous(data->iterator);
1948 }
1949 if(data->pos) {
1950 data->pos --;
1951 }
1952}
1953
b75a7d8f
A
1954/**
1955* Inline function that gets a simple CE.
1956* So what it does is that it will first check the expansion buffer. If the
1957* expansion buffer is not empty, ie the end pointer to the expansion buffer
1958* is different from the string pointer, we return the collation element at the
1959* return pointer and decrement it.
1960* For more complicated CEs it resorts to getComplicatedCE.
1961* @param coll collator data
1962* @param data collation iterator struct
1963* @param status error status
1964*/
1965 static
1966 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
1967 UErrorCode *status)
1968 {
    /*
     Overview: returns the previous collation element. Buffered CEs are handed
     out first; otherwise the previous code unit is fetched (from the string,
     the character iterator, or the normalization side buffer), checked for
     FCD/normalization needs, and mapped through the collator's tables with a
     fallback to coll->UCA. Returns UCOL_NO_MORE_CES at the start of the text.
    */
374ca955 1969 uint32_t result = (uint32_t)UCOL_NULLORDER;
b75a7d8f
A
    /* CEs are still pending in the CE buffer: step toReturn back and hand that one out. */
1970 if (data->toReturn > data->CEs) {
1971 data->toReturn --;
1972 result = *(data->toReturn);
    /* buffer drained: bring CEpos back to the start of the buffer as well */
1973 if (data->CEs == data->toReturn) {
1974 data->CEpos = data->toReturn;
1975 }
1976 }
1977 else {
1978 UChar ch = 0;
1979 /*
1980 Loop handles case when incremental normalize switches to or from the
1981 side buffer / original string, and we need to start again to get the
1982 next character.
1983 */
1984 for (;;) {
1985 if (data->flags & UCOL_ITER_HASLEN) {
1986 /*
1987 Normal path for strings when length is specified.
1988 Not in side buffer because it is always null terminated.
1989 */
1990 if (data->pos <= data->string) {
1991 /* End of the main source string */
1992 return UCOL_NO_MORE_CES;
1993 }
1994 data->pos --;
1995 ch = *data->pos;
1996 }
1997 // we are using an iterator to go back. Pray for us!
1998 else if (data->flags & UCOL_USE_ITERATOR) {
1999 UChar32 iterCh = data->iterator->previous(data->iterator);
2000 if(iterCh == U_SENTINEL) {
2001 return UCOL_NO_MORE_CES;
2002 } else {
2003 ch = (UChar)iterCh;
2004 }
2005 }
2006 else {
    /* neither HASLEN nor USE_ITERATOR: read backwards until a 0 code unit is hit */
2007 data->pos --;
2008 ch = *data->pos;
2009 /* we are in the side buffer. */
2010 if (ch == 0) {
2011 /*
2012 At the start of the normalize side buffer.
2013 Go back to string.
2014 Because pointer points to the last accessed character,
2015 hence we have to increment it by one here.
2016 */
2017 if (data->fcdPosition == NULL) {
2018 data->pos = data->string;
2019 return UCOL_NO_MORE_CES;
2020 }
2021 else {
2022 data->pos = data->fcdPosition + 1;
2023 }
    /* restore the flags saved when the side buffer was entered, then fetch again */
2024 data->flags = data->origFlags;
2025 continue;
2026 }
2027 }
2028
    /* when UCOL_HIRAGANA_Q is set, remember whether this code unit is in 0x3040..0x309f */
2029 if(data->flags&UCOL_HIRAGANA_Q) {
2030 if(ch>=0x3040 && ch<=0x309f) {
2031 data->flags |= UCOL_WAS_HIRAGANA;
2032 } else {
2033 data->flags &= ~UCOL_WAS_HIRAGANA;
2034 }
2035 }
374ca955 2036
b75a7d8f 2037 /*
374ca955 2038 * got a character to determine if there's fcd and/or normalization
b75a7d8f
A
2039 * stuff to do.
2040 * if the current character is not fcd.
2041 * if current character is at the start of the string
2042 * Trailing combining class == 0.
2043 * Note if pos is in the writablebuffer, norm is always 0
2044 */
374ca955 2045 if (ch < ZERO_CC_LIMIT_ ||
b75a7d8f
A
2046 // this should propel us out of the loop in the iterator case
2047 (data->flags & UCOL_ITER_NORM) == 0 ||
374ca955 2048 (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
b75a7d8f
A
2049 || data->string == data->pos) {
2050 break;
2051 }
2052
2053 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
2054 /* if next character is FCD */
2055 if (data->pos == data->string) {
2056 /* First char of string is always OK for FCD check */
2057 break;
2058 }
2059
2060 /* Not first char of string, do the FCD fast test */
2061 if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
2062 break;
2063 }
2064 }
2065
2066 /* Need a more complete FCD check and possible normalization. */
2067 if (collPrevIterFCD(data)) {
2068 collPrevIterNormalize(data);
2069 }
2070
2071 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2072 /* No normalization. Go ahead and process the char. */
2073 break;
2074 }
2075
2076 /*
2077 Some normalization happened.
2078 Next loop picks up a char from the normalization buffer.
2079 */
2080 }
2081
    /* from here on ch is the code unit whose CE is looked up */
2082 /* attempt to handle contractions, after removal of the backwards
2083 contraction
2084 */
2085 if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
2086 result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
2087 }
2088 else {
b75a7d8f
A
2089 // TODO: fix me for THAI - I reference *(data->pos-1)
2090 if ((data->flags & UCOL_ITER_INNORMBUF) == 0 &&
2091 /*UCOL_ISTHAIBASECONSONANT(ch) &&*/ // This is from the old specs - we now rearrange unconditionally
374ca955
A
2092 // makes sure that we're not at the beggining of the string
2093 //data->pos > data->string &&
2094 !collIter_bos(data) &&
b75a7d8f
A
2095 UCOL_ISTHAIPREVOWEL(peekCharacter(data, -1)))
2096 //UCOL_ISTHAIPREVOWEL(*(data->pos -1)))
2097 {
374ca955
A
    /* the iterator state is saved here and restored on both branches below,
       so the look-behind does not move the iterator */
2098 collIterateState entryState;
2099 backupState(data, &entryState);
2100 // we have to check if the previous character is also Thai
2101 // if not, we can just set the result
2102 goBackOne(data);
2103 if(collIter_bos(data) || !UCOL_ISTHAIPREVOWEL(peekCharacter(data, -1))) {
2104 loadState(data, &entryState, FALSE);
2105 result = UCOL_THAI;
2106 } else { // previous is also reordered
2107 // we need to go back as long as they are being reordered
2108 // count over the range of reorderable characters and see
2109 // if there is an even or odd number of them
2110 // if even, we should not reorder. If odd we should reorder.
2111 int32_t noReordered = 1; // the one we already detected
2112 while(!collIter_bos(data) && UCOL_ISTHAIPREVOWEL(peekCharacter(data, -1))) {
2113 noReordered++;
2114 goBackOne(data);
2115 }
2116 if(noReordered & 1) { // odd number of reorderables
2117 result = UCOL_THAI;
2118 } else {
2119 result = UTRIE_GET32_FROM_LEAD(coll->mapping, ch);
2120 }
2121 loadState(data, &entryState, FALSE);
2122 }
b75a7d8f 2123 }
374ca955
A
    /* code units up to 0xFF are looked up in the latinOneMapping array */
2124 else if (ch <= 0xFF) {
2125 result = coll->latinOneMapping[ch];
2126 //if (result > UCOL_NOT_FOUND) {
2127 //result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
2128 //}
2129 }
b75a7d8f
A
2130 else {
2131 /*result = ucmpe32_get(coll->mapping, ch);*/
2132 result = UTRIE_GET32_FROM_LEAD(coll->mapping, ch);
2133 }
374ca955
A
    /* values above UCOL_NOT_FOUND are handed to ucol_prv_getSpecialPrevCE for resolution */
2134 if (result > UCOL_NOT_FOUND) {
2135 result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
2136 }
b75a7d8f
A
    /* this collator has no mapping for ch: try a contraction ending in ch, else the UCA table */
2137 if (result == UCOL_NOT_FOUND) {
2138 if (!isAtStartPrevIterate(data) &&
2139 ucol_contractionEndCP(ch, data->coll)) {
2140 result = UCOL_CONTRACTION;
2141 }
2142 else {
2143 /*result = ucmpe32_get(UCA->mapping, ch);*/
374ca955
A
2144 if(coll->UCA) {
2145 result = UTRIE_GET32_FROM_LEAD(coll->UCA->mapping, ch);
2146 }
b75a7d8f
A
2147 }
2148
374ca955
A
2149 if (result > UCOL_NOT_FOUND && coll->UCA) {
2150 result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
b75a7d8f
A
2151 }
2152 }
2153 }
2154 }
b75a7d8f
A
2155 return result;
2156}
2157
2158
2159/* ucol_getPrevCE, out-of-line version for use from other files. */
2160U_CAPI uint32_t U_EXPORT2
2161ucol_getPrevCE(const UCollator *coll, collIterate *data,
2162 UErrorCode *status) {
2163 return ucol_IGetPrevCE(coll, data, status);
2164}
2165
2166
2167/* this should be connected to special Jamo handling */
2168U_CAPI uint32_t U_EXPORT2
2169ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
2170 collIterate colIt;
2171 uint32_t order;
2172 IInit_collIterate(coll, &u, 1, &colIt);
2173 order = ucol_IGetNextCE(coll, &colIt, status);
2174 /*UCOL_GETNEXTCE(order, coll, colIt, status);*/
2175 return order;
2176}
2177
2178/**
2179* Inserts the argument character into the end of the buffer pushing back the
2180* null terminator.
2181* @param data collIterate struct data
2182* @param pNull pointer to the null termination
2183* @param ch character to be appended
2184* @return the position of the new addition
2185*/
2186static
2187inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch)
2188{
2189 uint32_t size = data->writableBufSize;
2190 UChar *newbuffer;
2191 const uint32_t incsize = 5;
2192
2193 if ((data->writableBuffer + size) > (pNull + 1)) {
2194 *pNull = ch;
2195 *(pNull + 1) = 0;
2196 return pNull;
2197 }
2198
2199 /*
2200 buffer will always be null terminated at the end.
2201 giving extra space since it is likely that more characters will be added.
2202 */
2203 size += incsize;
2204 newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
2205 if(newbuffer != NULL) { // something wrong, but no status
2206 uprv_memcpy(newbuffer, data->writableBuffer,
2207 data->writableBufSize * sizeof(UChar));
2208
2209 freeHeapWritableBuffer(data);
2210 data->writableBufSize = size;
2211 data->writableBuffer = newbuffer;
2212
2213 newbuffer = newbuffer + data->writableBufSize;
2214 *newbuffer = ch;
2215 *(newbuffer + 1) = 0;
2216 }
2217 return newbuffer;
2218}
2219
2220/**
2221* Inserts the argument string into the end of the buffer pushing back the
2222* null terminator.
2223* @param data collIterate struct data
2224* @param pNull pointer to the null termination
2225* @param string to be appended
2226* @param length of the string to be appended
2227* @return the position of the new addition
2228*/
2229static
2230inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar *str,
2231 int32_t length)
2232{
2233 uint32_t size = pNull - data->writableBuffer;
2234 UChar *newbuffer;
2235
2236 if (data->writableBuffer + data->writableBufSize > pNull + length + 1) {
2237 uprv_memcpy(pNull, str, length * sizeof(UChar));
2238 *(pNull + length) = 0;
2239 return pNull;
2240 }
2241
2242 /*
2243 buffer will always be null terminated at the end.
2244 giving extra space since it is likely that more characters will be added.
2245 */
2246 newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * (size + length + 1));
2247 if(newbuffer != NULL) {
2248 uprv_memcpy(newbuffer, data->writableBuffer, size * sizeof(UChar));
2249 uprv_memcpy(newbuffer + size, str, length * sizeof(UChar));
2250
2251 freeHeapWritableBuffer(data);
2252 data->writableBufSize = size + length + 1;
2253 data->writableBuffer = newbuffer;
2254 }
2255
2256 return newbuffer;
2257}
2258
2259/**
2260* Special normalization function for contraction in the forwards iterator.
2261* This normalization sequence will place the current character at source->pos
2262* and its following normalized sequence into the buffer.
2263* The fcd position, pos will be changed.
2264* pos will now point to positions in the buffer.
2265* Flags will be changed accordingly.
2266* @param data collation iterator data
2267*/
2268static
2269inline void normalizeNextContraction(collIterate *data)
2270{
2271 UChar *buffer = data->writableBuffer;
2272 uint32_t buffersize = data->writableBufSize;
2273 uint32_t strsize;
2274 UErrorCode status = U_ZERO_ERROR;
2275 /* because the pointer points to the next character */
2276 UChar *pStart = data->pos - 1;
2277 UChar *pEnd;
2278 uint32_t normLen;
2279 UChar *pStartNorm;
2280
2281 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2282 *data->writableBuffer = *(pStart - 1);
2283 strsize = 1;
2284 }
2285 else {
2286 strsize = u_strlen(data->writableBuffer);
2287 }
2288
2289 pEnd = data->fcdPosition;
2290
2291 normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
2292 &status);
2293
2294 if (buffersize <= normLen + strsize) {
2295 uint32_t size = strsize + normLen + 1;
2296 UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
2297 if(temp != NULL) {
2298 uprv_memcpy(temp, buffer, sizeof(UChar) * strsize);
2299 freeHeapWritableBuffer(data);
2300 data->writableBuffer = temp;
2301 data->writableBufSize = size;
2302 data->flags |= UCOL_ITER_ALLOCATED;
2303 }
2304 }
2305
2306 status = U_ZERO_ERROR;
2307 pStartNorm = buffer + strsize;
2308 /* null-termination will be added here */
2309 unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm,
2310 normLen + 1, &status);
2311
2312 data->pos = data->writableBuffer + strsize;
2313 data->origFlags = data->flags;
2314 data->flags |= UCOL_ITER_INNORMBUF;
2315 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2316}
2317
2318/**
2319* Contraction character management function that returns the next character
2320* for the forwards iterator.
2321* Does nothing if the next character is in buffer and not the first character
2322* in it.
2323* Else it checks next character in data string to see if it is normalizable.
2324* If it is not, the character is simply copied into the buffer, else
2325* the whole normalized substring is copied into the buffer, including the
2326* current character.
2327* @param data collation element iterator data
2328* @return next character
2329*/
2330 static
2331 inline UChar getNextNormalizedChar(collIterate *data)
2332 {
    /*
     Overview: hands out the next code unit for contraction matching. Units
     already sitting in the normalization buffer (or in front of fcdPosition in
     the string) are returned directly; otherwise the unit is taken from the
     string, FCD-checked, and either normalized into the buffer or copied to
     the end of the buffer so that the buffer stays contiguous.
    */
2333 UChar nextch;
2334 UChar ch;
2335 // Here we need to add the iterator code. One problem is the way
2336 // end of string is handled. If we just return next char, it could
2337 // be the sentinel. Most of the cases already check for this, but we
2338 // need to be sure.
2339 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2340 /* if no normalization and not in buffer. */
2341 if(data->flags & UCOL_USE_ITERATOR) {
2342 return (UChar)data->iterator->next(data->iterator);
2343 } else {
2344 return *(data->pos ++);
2345 }
2346 }
2347
2348 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2349 //normalizeIterator(data);
2350 //}
2351
    /* set below only when we leave the buffer to read from the string; it then
       marks the buffer's terminator, where later insertions are appended */
2352 UChar *pEndWritableBuffer = NULL;
2353 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2354 if ((innormbuf && *data->pos != 0) ||
2355 (data->fcdPosition != NULL && !innormbuf &&
2356 data->pos < data->fcdPosition)) {
2357 /*
2358 if next character is in normalized buffer, no further normalization
2359 is required
2360 */
2361 return *(data->pos ++);
2362 }
2363
2364 if (data->flags & UCOL_ITER_HASLEN) {
2365 /* in data string */
    /* last code unit of the string: return it without any FCD look-ahead */
2366 if (data->pos + 1 == data->endp) {
2367 return *(data->pos ++);
2368 }
2369 }
2370 else {
2371 if (innormbuf) {
374ca955
A
2372 // inside the normalization buffer, but at the end
2373 // (since we encountered zero). This means, in the
2374 // case we're using char iterator, that we need to
2375 // do another round of normalization.
b75a7d8f
A
2376 //if(data->origFlags & UCOL_USE_ITERATOR) {
2377 // we need to restore original flags,
2378 // otherwise, we'll lose them
2379 //data->flags = data->origFlags;
2380 //normalizeIterator(data);
2381 //return *(data->pos++);
2382 //} else {
2383 /*
2384 in writable buffer, at this point fcdPosition can not be
2385 pointing to the end of the data string. see contracting tag.
2386 */
2387 if(data->fcdPosition) {
2388 if (*(data->fcdPosition + 1) == 0 ||
2389 data->fcdPosition + 1 == data->endp) {
2390 /* at the end of the string, dump it into the normalizer */
    /* NOTE(review): insertBufferEnd can return NULL on allocation failure; that is not checked here */
2391 data->pos = insertBufferEnd(data, data->pos,
2392 *(data->fcdPosition)) + 1;
2393 return *(data->fcdPosition ++);
2394 }
    /* continue reading from the original string at fcdPosition */
2395 pEndWritableBuffer = data->pos;
2396 data->pos = data->fcdPosition;
2397 } else if(data->origFlags & UCOL_USE_ITERATOR) {
2398 // if we are here, we're using a normalizing iterator.
2399 // we should just continue further.
2400 data->flags = data->origFlags;
2401 data->pos = NULL;
2402 return (UChar)data->iterator->next(data->iterator);
2403 }
2404 //}
2405 }
2406 else {
    /* null-terminated string: last code unit, no look-ahead needed */
2407 if (*(data->pos + 1) == 0) {
2408 return *(data->pos ++);
2409 }
2410 }
2411 }
2412
    /* ch is the unit to return, nextch the one after it (look-ahead for the quick FCD test) */
2413 ch = *data->pos ++;
2414 nextch = *data->pos;
2415
2416 /*
2417 * if the current character is not fcd.
2418 * Trailing combining class == 0.
2419 */
2420 if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2421 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2422 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2423 /*
2424 Need a more complete FCD check and possible normalization.
2425 normalize substring will be appended to buffer
2426 */
2427 if (collIterFCD(data)) {
2428 normalizeNextContraction(data);
2429 return *(data->pos ++);
2430 }
2431 else if (innormbuf) {
2432 /* fcdposition shifted even when there's no normalization, if we
2433 don't input the rest into this, we'll get the wrong position when
2434 we reach the end of the writableBuffer */
2435 int32_t length = data->fcdPosition - data->pos + 1;
2436 data->pos = insertBufferEnd(data, pEndWritableBuffer,
2437 data->pos - 1, length);
2438 return *(data->pos ++);
2439 }
2440 }
2441
2442 if (innormbuf) {
2443 /*
2444 no normalization is to be done hence only one character will be
2445 appended to the buffer.
2446 */
2447 data->pos = insertBufferEnd(data, pEndWritableBuffer, ch) + 1;
2448 }
2449
2450 /* points back to the pos in string */
2451 return ch;
2452}
2453
b75a7d8f
A
2454
2455
2456/**
2457* Function to copy the buffer into writableBuffer and sets the fcd position to
2458* the correct position
2459* @param source data string source
2460* @param buffer character buffer
2461* @param tempdb current position in buffer that has been used up
2462*/
2463static
2464inline void setDiscontiguosAttribute(collIterate *source, UChar *buffer,
2465 UChar *tempdb)
2466{
2467 /* okay confusing part here. to ensure that the skipped characters are
2468 considered later, we need to place it in the appropriate position in the
2469 normalization buffer and reassign the pos pointer. simple case if pos
2470 reside in string, simply copy to normalization buffer and
2471 fcdposition = pos, pos = start of normalization buffer. if pos in
2472 normalization buffer, we'll insert the copy infront of pos and point pos
2473 to the start of the normalization buffer. why am i doing these copies?
2474 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2475 not require any changes, which be really painful. */
2476 uint32_t length = u_strlen(buffer);;
2477 if (source->flags & UCOL_ITER_INNORMBUF) {
2478 u_strcpy(tempdb, source->pos);
2479 }
2480 else {
2481 source->fcdPosition = source->pos;
2482 source->origFlags = source->flags;
2483 source->flags |= UCOL_ITER_INNORMBUF;
2484 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2485 }
2486
2487 if (length >= source->writableBufSize) {
2488 freeHeapWritableBuffer(source);
2489 source->writableBuffer =
2490 (UChar *)uprv_malloc((length + 1) * sizeof(UChar));
2491 if(source->writableBuffer == NULL) {
2492 return;
2493 }
2494 source->writableBufSize = length;
2495 }
2496
2497 u_strcpy(source->writableBuffer, buffer);
2498 source->pos = source->writableBuffer;
2499}
2500
2501/**
2502* Function to get the discontiguos collation element within the source.
2503* Note this function will set the position to the appropriate places.
2504* @param coll current collator used
2505* @param source data string source
2506* @param constart index to the start character in the contraction table
2507* @return discontiguos collation element offset
2508*/
2509 static
2510 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
2511 const UChar *constart)
2512 {
    /*
     Overview: tries to match a contraction whose members are separated by
     other combining characters. Code units that do not take part in the match
     are collected in a local buffer; on success that buffer is installed as
     the normalization buffer (setDiscontiguosAttribute) so the skipped units
     are processed afterwards. On failure the iterator state saved on entry is
     restored and the CE stored for constart itself is returned.
    */
2513 /* source->pos currently points to the second combining character after
2514 the start character */
2515 UChar *temppos = source->pos;
    /* NOTE(review): writes through tempdb are not bounds-checked against this array */
2516 UChar buffer[4*UCOL_MAX_BUFFER];
2517 UChar *tempdb = buffer;
2518 const UChar *tempconstart = constart;
2519 uint8_t tempflags = source->flags;
2520 UBool multicontraction = FALSE;
2521 UChar *tempbufferpos = 0;
2522 collIterateState discState;
2523
2524 backupState(source, &discState);
2525
    /* the buffer starts with the code unit just in front of the current position */
2526 //*tempdb = *(source->pos - 1);
2527 *tempdb = peekCharacter(source, -1);
2528 tempdb ++;
2529 while (TRUE) {
2530 UChar *UCharOffset;
2531 UChar schar,
2532 tchar;
2533 uint32_t result;
2534
    /* stop scanning at the end of the text or at a code unit with combining class 0 */
2535 if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
2536 || (peekCharacter(source, 0) == 0 &&
2537 //|| (*source->pos == 0 &&
2538 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
2539 source->fcdPosition == NULL ||
2540 source->fcdPosition == source->endp ||
2541 *(source->fcdPosition) == 0 ||
2542 u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
2543 /* end of string in null terminated string or stopped by a
2544 null character, note fcd does not always point to a base
2545 character after the discontiguos change */
2546 u_getCombiningClass(peekCharacter(source, 0)) == 0) {
2547 //u_getCombiningClass(*(source->pos)) == 0) {
2548 //constart = (UChar *)coll->image + getContractOffset(CE);
2549 if (multicontraction) {
    /* a shorter contraction matched earlier: cut the buffer back to that point,
       reposition, and return the CE stored for tempconstart */
2550 *tempbufferpos = 0;
2551 source->pos = temppos - 1;
2552 setDiscontiguosAttribute(source, buffer, tempdb);
2553 return *(coll->contractionCEs +
2554 (tempconstart - coll->contractionIndex));
2555 }
2556 constart = tempconstart;
2557 break;
2558 }
2559
2560 UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
2561 schar = getNextNormalizedChar(source);
2562
    /* linear search; presumably the table entries are sorted ascending and end
       with an entry that is not smaller than any schar - TODO confirm */
2563 while (schar > (tchar = *UCharOffset)) {
2564 UCharOffset++;
2565 }
2566
2567 if (schar != tchar) {
2568 /* not the correct codepoint. we stuff the current codepoint into
2569 the discontiguos buffer and try the next character */
2570 *tempdb = schar;
2571 tempdb ++;
2572 continue;
2573 }
2574 else {
    /* schar is in the table, but if the unit two positions back has the same
       combining class it is skipped like a non-matching unit */
2575 if (u_getCombiningClass(schar) ==
2576 u_getCombiningClass(peekCharacter(source, -2))) {
2577 //u_getCombiningClass(*(source->pos - 2))) {
2578 *tempdb = schar;
2579 tempdb ++;
2580 continue;
2581 }
2582 result = *(coll->contractionCEs +
2583 (UCharOffset - coll->contractionIndex));
2584 }
    /* terminate the skipped-units buffer; tempdb stays on the terminator */
2585 *tempdb = 0;
2586
2587 if (result == UCOL_NOT_FOUND) {
2588 break;
2589 } else if (isContraction(result)) {
2590 /* this is a multi-contraction*/
2591 tempconstart = (UChar *)coll->image + getContractOffset(result);
2592 if (*(coll->contractionCEs + (constart - coll->contractionIndex))
2593 != UCOL_NOT_FOUND) {
    /* remember this point so the scan can fall back to it if no longer match is found */
2594 multicontraction = TRUE;
2595 temppos = source->pos + 1;
2596 tempbufferpos = buffer + u_strlen(buffer);
2597 }
2598 } else {
    /* complete match: install the skipped units and return the contraction's CE */
2599 setDiscontiguosAttribute(source, buffer, tempdb);
2600 return result;
2601 }
2602 }
2603
2604 /* no problems simply reverting just like that,
2605 if we are in string before getting into this function, points back to
2606 string hence no problem.
2607 if we are in normalization buffer before getting into this function,
2608 since we'll never use another normalization within this function, we
2609 know that fcdposition points to a base character. the normalization buffer
2610 never change, hence this revert works. */
2611 loadState(source, &discState, TRUE);
2612 goBackOne(source);
2613
2614 //source->pos = temppos - 1;
2615 source->flags = tempflags;
2616 return *(coll->contractionCEs + (constart - coll->contractionIndex));
2617}
2618
2619static
2620inline UBool isNonChar(UChar32 cp) {
2621 if ((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF)) {
2622 return TRUE;
2623 }
2624 return FALSE;
2625}
2626
2627/* now uses Mark's getImplicitPrimary code */
2628static
2629inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2630 if(isNonChar(cp)) {
2631 return 0;
2632 }
374ca955 2633 uint32_t r = uprv_uca_getImplicitPrimary(cp);
b75a7d8f
A
2634 *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2635 return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2636}
2637
2638/**
2639* Inserts the argument character into the front of the buffer replacing the
2640* front null terminator.
2641* @param data collation element iterator data
2642* @param pNull pointer to the null terminator
2643* @param ch character to be appended
2644* @return positon of added character
2645*/
2646static
2647inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch)
2648{
2649 uint32_t size = data->writableBufSize;
2650 UChar *end;
2651 UChar *newbuffer;
2652 const uint32_t incsize = 5;
2653
2654 if (pNull > data->writableBuffer + 1) {
2655 *pNull = ch;
2656 *(pNull - 1) = 0;
2657 return pNull;
2658 }
2659
2660 /*
2661 buffer will always be null terminated infront.
2662 giving extra space since it is likely that more characters will be added.
2663 */
2664 size += incsize;
2665 newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
2666 if(newbuffer == NULL) {
2667 return NULL;
2668 }
2669 end = newbuffer + incsize;
2670 uprv_memcpy(end, data->writableBuffer,
2671 data->writableBufSize * sizeof(UChar));
2672 *end = ch;
2673 *(end - 1) = 0;
2674
2675 freeHeapWritableBuffer(data);
2676
2677 data->writableBufSize = size;
2678 data->writableBuffer = newbuffer;
2679 return end;
2680}
2681
2682/**
2683* Special normalization function for contraction in the previous iterator.
2684* This normalization sequence will place the current character at source->pos
2685* and its following normalized sequence into the buffer.
2686* The fcd position, pos will be changed.
2687* pos will now point to positions in the buffer.
2688* Flags will be changed accordingly.
2689* @param data collation iterator data
2690*/
2691static
2692inline void normalizePrevContraction(collIterate *data)
2693{
2694 UChar *buffer = data->writableBuffer;
2695 uint32_t buffersize = data->writableBufSize;
2696 uint32_t nulltermsize;
2697 UErrorCode status = U_ZERO_ERROR;
2698 UChar *pEnd = data->pos + 1; /* End normalize + 1 */
2699 UChar *pStart;
2700 uint32_t normLen;
2701 UChar *pStartNorm;
2702
2703 if (data->flags & UCOL_ITER_HASLEN) {
2704 /*
2705 normalization buffer not used yet, we'll pull down the next
2706 character into the end of the buffer
2707 */
2708 *(buffer + (buffersize - 1)) = *(data->pos + 1);
2709 nulltermsize = buffersize - 1;
2710 }
2711 else {
2712 nulltermsize = buffersize;
2713 UChar *temp = buffer + (nulltermsize - 1);
2714 while (*(temp --) != 0) {
2715 nulltermsize --;
2716 }
2717 }
2718
2719 /* Start normalize */
2720 if (data->fcdPosition == NULL) {
2721 pStart = data->string;
2722 }
2723 else {
2724 pStart = data->fcdPosition + 1;
2725 }
2726
2727 normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
2728 &status);
2729
2730 if (nulltermsize <= normLen) {
2731 uint32_t size = buffersize - nulltermsize + normLen + 1;
2732 UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
2733 if(temp != NULL) {
2734 nulltermsize = normLen + 1;
2735 uprv_memcpy(temp + normLen, buffer,
2736 sizeof(UChar) * (buffersize - nulltermsize));
2737 freeHeapWritableBuffer(data);
2738 data->writableBuffer = temp;
2739 data->writableBufSize = size;
2740 }
2741 }
2742
2743 status = U_ZERO_ERROR;
2744 /*
2745 this puts the null termination infront of the normalized string instead
2746 of the end
2747 */
2748 pStartNorm = buffer + (nulltermsize - normLen);
2749 *(pStartNorm - 1) = 0;
2750 unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen,
2751 &status);
2752
2753 data->pos = data->writableBuffer + nulltermsize;
2754 data->origFlags = data->flags;
2755 data->flags |= UCOL_ITER_INNORMBUF;
2756 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2757}
2758
2759/**
2760* Contraction character management function that returns the previous character
2761* for the backwards iterator.
2762* Does nothing if the previous character is in buffer and not the first
2763* character in it.
2764* Else it checks previous character in data string to see if it is
2765* normalizable.
2766* If it is not, the character is simply copied into the buffer, else
2767* the whole normalized substring is copied into the buffer, including the
2768* current character.
2769* @param data collation element iterator data
2770* @return previous character
2771*/
2772static
2773inline UChar getPrevNormalizedChar(collIterate *data)
2774{
2775 UChar prevch;
2776 UChar ch;
2777 UChar *start;
2778 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2779 UChar *pNull = NULL;
2780 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
2781 (innormbuf && *(data->pos - 1) != 0)) {
2782 /*
2783 if no normalization.
2784 if previous character is in normalized buffer, no further normalization
2785 is required
2786 */
2787 if(data->flags & UCOL_USE_ITERATOR) {
2788 data->iterator->move(data->iterator, -1, UITER_CURRENT);
2789 return (UChar)data->iterator->next(data->iterator);
2790 } else {
2791 return *(data->pos - 1);
2792 }
2793 }
2794
2795 start = data->pos;
2796 if (data->flags & UCOL_ITER_HASLEN) {
2797 /* in data string */
2798 if ((start - 1) == data->string) {
2799 return *(start - 1);
2800 }
2801 start --;
2802 ch = *start;
2803 prevch = *(start - 1);
2804 }
2805 else {
2806 /*
2807 in writable buffer, at this point fcdPosition can not be NULL.
2808 see contracting tag.
2809 */
2810 if (data->fcdPosition == data->string) {
2811 /* at the start of the string, just dump it into the normalizer */
2812 insertBufferFront(data, data->pos - 1, *(data->fcdPosition));
2813 data->fcdPosition = NULL;
2814 return *(data->pos - 1);
2815 }
2816 pNull = data->pos - 1;
2817 start = data->fcdPosition;
2818 ch = *start;
2819 prevch = *(start - 1);
2820 }
2821 /*
2822 * if the current character is not fcd.
2823 * Trailing combining class == 0.
2824 */
2825 if (data->fcdPosition > start &&
2826 (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
2827 {
2828 /*
2829 Need a more complete FCD check and possible normalization.
2830 normalize substring will be appended to buffer
2831 */
2832 UChar *backuppos = data->pos;
2833 data->pos = start;
2834 if (collPrevIterFCD(data)) {
2835 normalizePrevContraction(data);
2836 return *(data->pos - 1);
2837 }
2838 data->pos = backuppos;
2839 data->fcdPosition ++;
2840 }
2841
2842 if (innormbuf) {
2843 /*
2844 no normalization is to be done hence only one character will be
2845 appended to the buffer.
2846 */
2847 insertBufferFront(data, pNull, ch);
2848 data->fcdPosition --;
2849 }
2850
2851 return ch;
2852}
2853
2854/* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2855/* It is called by getNextCE */
2856
2857uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2858 collIterateState entryState;
2859 backupState(source, &entryState);
2860 UChar32 cp = ch;
2861
2862 for (;;) {
2863 // This loop will repeat only in the case of contractions, and only when a contraction
2864 // is found and the first CE resulting from that contraction is itself a special
2865 // (an expansion, for example.) All other special CE types are fully handled the
2866 // first time through, and the loop exits.
2867
2868 const uint32_t *CEOffset = NULL;
2869 switch(getCETag(CE)) {
2870 case NOT_FOUND_TAG:
2871 /* This one is not found, and we'll let somebody else bother about it... no more games */
2872 return CE;
2873 case SURROGATE_TAG:
2874 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
2875 /* two things can happen here: next code point can be a trailing surrogate - we will use it */
2876 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
2877 /* we return 0 (completely ignorable - per UCA specification */
2878 {
2879 UChar trail;
2880 collIterateState state;
2881 backupState(source, &state);
2882 if (collIter_eos(source) || !(UTF16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
374ca955 2883 // we chould have stepped one char forward and it might have turned that it
b75a7d8f
A
2884 // was not a trail surrogate. In that case, we have to backup.
2885 loadState(source, &state, TRUE);
2886 return 0;
2887 } else {
2888 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
2889 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, CE&0xFFFFFF, trail);
2890 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
2891 // We need to backup
2892 loadState(source, &state, TRUE);
2893 return CE;
374ca955 2894 }
b75a7d8f
A
2895 // calculate the supplementary code point value, if surrogate was not tailored
2896 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
2897 }
2898 }
2899 break;
2900 case THAI_TAG:
2901 /* Thai/Lao reordering */
2902 if (((source->flags) & UCOL_ITER_INNORMBUF) /* Already Swapped || */
374ca955 2903 || collIter_eos(source)) /* At end of string. No swap possible */
b75a7d8f
A
2904 {
2905 // Treat Thai as a length one expansion */
2906 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
2907 CE = *CEOffset++;
2908 }
2909 else
2910 {
374ca955
A
2911 // Move the prevowel and the following base Consonant into the normalization buffer
2912 // with their order swapped
2913 // Note: this operation might activate the normalization buffer. We have to check for
2914 // that and act accordingly.
2915 UChar thCh = getNextNormalizedChar(source);
2916 UChar32 cp = 0;
2917 if(U16_IS_LEAD(thCh)) {
2918 if(!collIter_eos(source)) {
2919 collIterateState thaiState;
2920 backupState(source, &thaiState);
2921 UChar trailCh = getNextNormalizedChar(source);
2922 if(U16_IS_TRAIL(trailCh)) {
2923 cp = U16_GET_SUPPLEMENTARY(thCh, trailCh);
2924 } else {
2925 loadState(source, &thaiState, TRUE);
2926 cp = (UChar32)thCh;
2927 }
2928 } else {
2929 cp = (UChar32)thCh;
2930 }
2931 } else {
2932 cp = (UChar32)thCh;
2933 }
2934 // Now we have the character that needs to be decomposed
2935 // if the normalizing buffer was not used, we can just use our structure and be happy.
2936 if((source->flags & UCOL_ITER_INNORMBUF) == 0) {
2937 // decompose into writable buffer
2938 int32_t decompLen = unorm_getDecomposition(cp, FALSE, &(source->writableBuffer[1]), UCOL_WRITABLE_BUFFER_SIZE-1);
2939 if(decompLen < 0) {
2940 decompLen = -decompLen;
2941 }
2942 // reorder Thai and the character after it
2943 if(decompLen >= 2 && U16_IS_LEAD(source->writableBuffer[1]) && U16_IS_TRAIL(source->writableBuffer[2])) {
2944 source->writableBuffer[0] = source->writableBuffer[1];
2945 source->writableBuffer[1] = source->writableBuffer[2];
2946 source->writableBuffer[2] = ch;
2947 } else {
2948 source->writableBuffer[0] = source->writableBuffer[1];
2949 source->writableBuffer[1] = ch;
2950 }
2951 // zero terminate, since normalization buffer is always zero terminated
2952 source->writableBuffer[decompLen+1] = 0; // we added the prevowel
b75a7d8f 2953 if(source->pos) {
374ca955 2954 source->fcdPosition = source->pos; // Indicate where to continue in main input string
b75a7d8f 2955 // after exhausting the writableBuffer
b75a7d8f 2956 }
374ca955 2957 source->pos = source->writableBuffer;
b75a7d8f
A
2958 source->origFlags = source->flags;
2959 source->flags |= UCOL_ITER_INNORMBUF;
2960 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
374ca955
A
2961 }
2962 else {
2963 // stuff is already normalized... what to do here???
2964
2965 // if we are in the normalization buffer, thCh must be in it
2966 // prove by contradiction
2967 // if thCh is not in the normalization buffer,
2968 // that means that trailCh is the normalization buffer
2969 // that means that trailCh is a trail surrogate by the above
2970 // bounding if block, this is a contradiction because there
2971 // are no characters at the moment that decomposes to an
2972 // unmatched surrogate. qed.
2973 if (cp >= 0x10000) {
2974 source->writableBuffer[0] = source->writableBuffer[1];
2975 source->writableBuffer[1] = source->writableBuffer[2];
2976 source->writableBuffer[2] = ch;
2977 }
2978 else {
2979 source->writableBuffer[0] = source->writableBuffer[1];
2980 source->writableBuffer[1] = ch;
2981 }
2982 source->pos = source->writableBuffer;
2983 }
2984 CE = UCOL_IGNORABLE;
b75a7d8f
A
2985 }
2986 break;
2987 case SPEC_PROC_TAG:
2988 {
2989 // Special processing is getting a CE that is preceded by a certain prefix
2990 // Currently this is only needed for optimizing Japanese length and iteration marks.
374ca955
A
2991 // When we encouter a special processing tag, we go backwards and try to see if
2992 // we have a match.
b75a7d8f
A
2993 // Contraction tables are used - so the whole process is not unlike contraction.
2994 // prefix data is stored backwards in the table.
2995 const UChar *UCharOffset;
2996 UChar schar, tchar;
2997 collIterateState prefixState;
2998 backupState(source, &prefixState);
2999 loadState(source, &entryState, TRUE);
3000 goBackOne(source); // We want to look at the point where we entered - actually one
3001 // before that...
3002
3003 for(;;) {
3004 // This loop will run once per source string character, for as long as we
374ca955 3005 // are matching a potential contraction sequence
b75a7d8f 3006
374ca955 3007 // First we position ourselves at the begining of contraction sequence
b75a7d8f
A
3008 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3009 if (collIter_bos(source)) {
3010 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3011 break;
3012 }
3013 schar = getPrevNormalizedChar(source);
3014 goBackOne(source);
3015
3016 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3017 UCharOffset++;
3018 }
3019
3020 if (schar == tchar) {
3021 // Found the source string char in the table.
3022 // Pick up the corresponding CE from the table.
3023 CE = *(coll->contractionCEs +
3024 (UCharOffset - coll->contractionIndex));
3025 }
3026 else
3027 {
374ca955 3028 // if there is a completely ignorable code point in the middle of
b75a7d8f
A
3029 // a prefix, we need to act as if it's not there
3030 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3031 // lone surrogates cannot be set to zero as it would break other processing
3032 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
3033 // it's easy for BMP code points
3034 if(isZeroCE == 0) {
3035 continue;
3036 } else if(UTF_IS_TRAIL(schar) || UTF_IS_LEAD(schar)) {
3037 // for supplementary code points, we have to check the next one
3038 // situations where we are going to ignore
3039 // 1. beginning of the string: schar is a lone surrogate
3040 // 2. schar is a lone surrogate
3041 // 3. schar is a trail surrogate in a valid surrogate sequence
3042 // that is explicitly set to zero.
3043 if (!collIter_bos(source)) {
3044 UChar lead;
3045 if(UTF_IS_LEAD(lead = getPrevNormalizedChar(source))) {
3046 isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, lead);
3047 if(getCETag(isZeroCE) == SURROGATE_TAG) {
3048 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, schar);
3049 if(finalCE == 0) {
3050 // this is a real, assigned completely ignorable code point
3051 goBackOne(source);
3052 continue;
3053 }
3054 }
3055 } else {
3056 // lone surrogate, completely ignorable
3057 continue;
3058 }
3059 } else {
3060 // lone surrogate at the beggining, completely ignorable
3061 continue;
3062 }
3063 }
3064 // Source string char was not in the table.
3065 // We have not found the prefix.
3066 CE = *(coll->contractionCEs +
3067 (ContractionStart - coll->contractionIndex));
3068 }
3069
3070 if(!isPrefix(CE)) {
3071 // The source string char was in the contraction table, and the corresponding
3072 // CE is not a prefix CE. We found the prefix, break
3073 // out of loop, this CE will end up being returned. This is the normal
3074 // way out of prefix handling when the source actually contained
3075 // the prefix.
3076 break;
3077 }
3078 }
3079 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
3080 loadState(source, &prefixState, TRUE);
3081 if(source->origFlags & UCOL_USE_ITERATOR) {
3082 source->flags = source->origFlags;
3083 }
3084 } else { // prefix search was a failure, we have to backup all the way to the start
3085 loadState(source, &entryState, TRUE);
3086 }
3087 break;
3088 }
3089 case CONTRACTION_TAG:
3090 {
3091 /* This should handle contractions */
3092 collIterateState state;
3093 backupState(source, &state);
3094 uint32_t firstCE = UCOL_NOT_FOUND;
3095 const UChar *UCharOffset;
3096 UChar schar, tchar;
3097
3098 for (;;) {
3099 /* This loop will run once per source string character, for as long as we */
3100 /* are matching a potential contraction sequence */
3101
3102 /* First we position ourselves at the begining of contraction sequence */
3103 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3104
3105 if (collIter_eos(source)) {
3106 // Ran off the end of the source string.
3107 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3108 // So we'll pick whatever we have at the point...
3109 if (CE == UCOL_NOT_FOUND) {
3110 // back up the source over all the chars we scanned going into this contraction.
374ca955 3111 CE = firstCE;
b75a7d8f
A
3112 loadState(source, &state, TRUE);
3113 if(source->origFlags & UCOL_USE_ITERATOR) {
374ca955 3114 source->flags = source->origFlags;
b75a7d8f
A
3115 }
3116 }
3117 break;
3118 }
3119
3120 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
3121 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
3122
3123 schar = getNextNormalizedChar(source);
3124 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3125 UCharOffset++;
3126 }
3127
3128 if (schar == tchar) {
3129 // Found the source string char in the contraction table.
3130 // Pick up the corresponding CE from the table.
3131 CE = *(coll->contractionCEs +
3132 (UCharOffset - coll->contractionIndex));
3133 }
3134 else
3135 {
374ca955 3136 // if there is a completely ignorable code point in the middle of
b75a7d8f
A
3137 // contraction, we need to act as if it's not there
3138 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
3139 // it's easy for BMP code points
3140 if(isZeroCE == 0) {
374ca955 3141 continue;
b75a7d8f
A
3142 } else if(UTF_IS_LEAD(schar)) {
3143 if(!collIter_eos(source)) {
3144 backupState(source, &state);
3145 UChar trail = getNextNormalizedChar(source);
3146 if(UTF_IS_TRAIL(trail)) { // do stuff with trail
3147 if(getCETag(isZeroCE) == SURROGATE_TAG) {
3148 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, trail);
3149 if(finalCE == 0) {
3150 continue;
3151 }
3152 }
3153 } else {
3154 // broken surrogate sequence, thus completely ignorable
3155 loadState(source, &state, TRUE);
3156 continue;
3157 }
3158 loadState(source, &state, TRUE);
374ca955 3159 } else { // no more characters, so broken surrogate pair...
b75a7d8f 3160 // this contraction will ultimately fail, but not because of us
374ca955 3161 continue;
b75a7d8f
A
3162 }
3163 } // else if(UTF_IS_LEAD(schar))
3164
3165 // Source string char was not in contraction table.
3166 // Unless we have a discontiguous contraction, we have finished
3167 // with this contraction.
3168 uint8_t sCC;
374ca955 3169 if (schar < 0x300 ||
b75a7d8f
A
3170 maxCC == 0 ||
3171 (sCC = i_getCombiningClass(schar, coll)) == 0 ||
374ca955 3172 sCC>maxCC ||
b75a7d8f
A
3173 (allSame != 0 && sCC == maxCC) ||
3174 collIter_eos(source)) {
374ca955
A
3175 // Contraction can not be discontiguous.
3176 goBackOne(source); // back up the source string by one,
b75a7d8f
A
3177 // because the character we just looked at was
3178 // not part of the contraction. */
3179 CE = *(coll->contractionCEs +
3180 (ContractionStart - coll->contractionIndex));
3181 } else {
3182 //
3183 // Contraction is possibly discontiguous.
3184 // Scan more of source string looking for a match
3185 //
3186 UChar tempchar;
3187 /* find the next character if schar is not a base character
3188 and we are not yet at the end of the string */
3189 tempchar = getNextNormalizedChar(source);
3190 goBackOne(source);
3191 if (i_getCombiningClass(tempchar, coll) == 0) {
3192 goBackOne(source);
3193 /* Spit out the last char of the string, wasn't tasty enough */
3194 CE = *(coll->contractionCEs +
3195 (ContractionStart - coll->contractionIndex));
3196 } else {
3197 CE = getDiscontiguous(coll, source, ContractionStart);
3198 }
3199 }
3200 } // else after if(schar == tchar)
3201
3202 if(CE == UCOL_NOT_FOUND) {
3203 /* The Source string did not match the contraction that we were checking. */
3204 /* Back up the source position to undo the effects of having partially */
3205 /* scanned through what ultimately proved to not be a contraction. */
3206 loadState(source, &state, TRUE);
3207 CE = firstCE;
b75a7d8f
A
3208 break;
3209 }
374ca955 3210
b75a7d8f
A
3211 if(!isContraction(CE)) {
3212 // The source string char was in the contraction table, and the corresponding
3213 // CE is not a contraction CE. We completed the contraction, break
3214 // out of loop, this CE will end up being returned. This is the normal
3215 // way out of contraction handling when the source actually contained
3216 // the contraction.
3217 break;
3218 }
374ca955 3219
b75a7d8f
A
3220
3221 // The source string char was in the contraction table, and the corresponding
3222 // CE is IS a contraction CE. We will continue looping to check the source
3223 // string for the remaining chars in the contraction.
3224 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
3225 if(tempCE != UCOL_NOT_FOUND) {
3226 // We have scanned a a section of source string for which there is a
374ca955 3227 // CE from the contraction table. Remember the CE and scan position, so
b75a7d8f
A
3228 // that we can return to this point if further scanning fails to
3229 // match a longer contraction sequence.
3230 firstCE = tempCE;
3231
3232 goBackOne(source);
3233 backupState(source, &state);
3234 getNextNormalizedChar(source);
3235
3236 // Another way to do this is:
3237 //collIterateState tempState;
3238 //backupState(source, &tempState);
3239 //goBackOne(source);
3240 //backupState(source, &state);
3241 //loadState(source, &tempState, TRUE);
3242
3243 // The problem is that for incomplete contractions we have to remember the previous
374ca955 3244 // position. Before, the only thing I needed to do was state.pos--;
b75a7d8f 3245 // After iterator introduction and especially after introduction of normalizing
374ca955 3246 // iterators, it became much more difficult to decrease the saved state.
b75a7d8f
A
3247 // I'm not yet sure which of the two methods above is faster.
3248 }
3249 } // for(;;)
3250 break;
3251 } // case CONTRACTION_TAG:
3252 case LONG_PRIMARY_TAG:
3253 {
3254 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3255 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3256 return CE;
3257 }
3258 case EXPANSION_TAG:
3259 {
3260 /* This should handle expansion. */
3261 /* NOTE: we can encounter both continuations and expansions in an expansion! */
3262 /* I have to decide where continuations are going to be dealt with */
3263 uint32_t size;
3264 uint32_t i; /* general counter */
3265 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3266 size = getExpansionCount(CE);
3267 CE = *CEOffset++;
3268 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
3269 for(i = 1; i<size; i++) {
3270 *(source->CEpos++) = *CEOffset++;
3271 }
374ca955
A
3272 } else { /* else, we do */
3273 while(*CEOffset != 0) {
3274 *(source->CEpos++) = *CEOffset++;
3275 }
3276 }
3277 return CE;
3278 }
3279 case DIGIT_TAG:
3280 {
3281 /*
3282 We do a check to see if we want to collate digits as numbers; if so we generate
3283 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3284 */
3285 uint32_t size;
3286 uint32_t i; /* general counter */
3287 collIterateState digitState;
3288
3289 if (source->coll->numericCollation == UCOL_ON){
3290 UChar32 char32 = 0;
3291
3292 uint32_t digIndx = 0;
3293 uint32_t endIndex = 0;
3294 uint32_t trailingZeroIndex = 0;
3295
3296 uint32_t primWeight = 0;
3297
3298 int32_t digVal = 0;
3299 uint8_t collateVal = 0;
3300
3301 UBool nonZeroValReached = FALSE;
3302
3303 uint8_t *numTempBuf;
3304 uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs.
3305 uint32_t numTempBufSize = UCOL_MAX_BUFFER;
3306
3307 numTempBuf = stackNumTempBuf;
3308 /*
3309 We parse the source string until we hit a char that's NOT a digit.
3310 Use this u_charDigitValue. This might be slow because we have to
3311 handle surrogates...
3312 */
3313/*
3314 if (U16_IS_LEAD(ch)){
3315 if (!collIter_eos(source)) {
3316 backupState(source, &digitState);
3317 UChar trail = getNextNormalizedChar(source);
3318 if(U16_IS_TRAIL(trail)) {
3319 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3320 } else {
3321 loadState(source, &digitState, TRUE);
3322 char32 = ch;
3323 }
3324 } else {
3325 char32 = ch;
3326 }
3327 } else {
3328 char32 = ch;
3329 }
3330 digVal = u_charDigitValue(char32);
3331*/
3332 digVal = u_charDigitValue(cp); // if we have arrived here, we have
3333 // already processed possible supplementaries that trigered the digit tag -
3334 // all supplementaries are marked in the UCA.
3335 /*
3336 We pad a zero in front of the first element anyways. This takes
3337 care of the (probably) most common case where people are sorting things followed
3338 by a single digit
3339 */
3340 digIndx++;
3341 for(;;){
3342 // Make sure we have enough space.
3343 if (digIndx >= ((numTempBufSize - 2) * 2) + 1)
3344 {
3345 numTempBufSize *= 2;
3346 if (numTempBuf == stackNumTempBuf){
3347 numTempBuf = (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize);
3348 uprv_memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
3349 }else
3350 uprv_realloc(numTempBuf, numTempBufSize);
3351 }
3352
3353 // Skipping over leading zeroes.
3354 if (digVal != 0 || nonZeroValReached){
3355 if (digVal != 0 && !nonZeroValReached)
3356 nonZeroValReached = TRUE;
3357
3358 /*
3359 We parse the digit string into base 100 numbers (this fits into a byte).
3360 We only add to the buffer in twos, thus if we are parsing an odd character,
3361 that serves as the 'tens' digit while the if we are parsing an even one, that
3362 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3363 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3364 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3365 than all the other bytes.
3366 */
3367
3368 if (digIndx % 2 == 1){
3369 collateVal += (uint8_t)digVal;
3370
3371 // We don't enter the low-order-digit case unless we've already seen
3372 // the high order, or for the first digit, which is always non-zero.
3373 if (collateVal != 0)
3374 trailingZeroIndex = 0;
3375
3376 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3377 collateVal = 0;
3378 }
3379 else{
3380 // We drop the collation value into the buffer so if we need to do
3381 // a "front patch" we don't have to check to see if we're hitting the
3382 // last element.
3383 collateVal = (uint8_t)(digVal * 10);
3384
3385 // Check for trailing zeroes.
3386 if (collateVal == 0)
3387 {
3388 if (!trailingZeroIndex)
3389 trailingZeroIndex = (digIndx/2) + 2;
3390 }
3391 else
3392 trailingZeroIndex = 0;
3393
3394 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3395 }
3396 digIndx++;
3397 }
3398
3399 // Get next character.
3400 if (!collIter_eos(source)){
3401 ch = getNextNormalizedChar(source);
3402 if (U16_IS_LEAD(ch)){
3403 if (!collIter_eos(source)) {
3404 backupState(source, &digitState);
3405 UChar trail = getNextNormalizedChar(source);
3406 if(U16_IS_TRAIL(trail)) {
3407 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3408 } else {
3409 loadState(source, &digitState, TRUE);
3410 char32 = ch;
3411 }
3412 }
3413 } else {
3414 char32 = ch;
3415 }
3416
3417 if ((digVal = u_charDigitValue(char32)) == -1){
3418 // Resetting position to point to the next unprocessed char. We
3419 // overshot it when doing our test/set for numbers.
3420 if (char32 > 0xFFFF) { // For surrogates.
3421 loadState(source, &digitState, TRUE);
3422 //goBackOne(source);
3423 }
3424 goBackOne(source);
3425 break;
3426 }
3427 } else {
3428 break;
3429 }
3430 }
3431
3432 if (nonZeroValReached == FALSE){
3433 digIndx = 2;
3434 numTempBuf[2] = 6;
b75a7d8f 3435 }
374ca955
A
3436
3437 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3438 if (digIndx % 2 != 0){
3439 /*
3440 We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3441 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3442 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3443 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3444 */
3445
3446 for(i = 2; i < endIndex; i++){
3447 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3448 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3449 }
3450 --digIndx;
3451 }
3452
3453 // Subtract one off of the last byte.
3454 numTempBuf[endIndex-1] -= 1;
3455
3456 /*
3457 We want to skip over the first two slots in the buffer. The first slot
3458 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3459 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3460 */
3461 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3462 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3463
3464 // Now transfer the collation key to our collIterate struct.
3465 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3466 size = ((endIndex+1) & ~1)/2;
3467 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3468 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3469 UCOL_BYTE_COMMON; // Tertiary weight.
3470 i = 2; // Reset the index into the buffer.
3471 while(i < endIndex)
3472 {
3473 primWeight = numTempBuf[i++] << 8;
3474 if ( i < endIndex)
3475 primWeight |= numTempBuf[i++];
3476 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3477 }
3478
3479 if (numTempBuf != stackNumTempBuf)
3480 uprv_free(numTempBuf);
3481 } else {
3482 // no numeric mode, we'll just switch to whatever we stashed and continue
3483 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3484 CE = *CEOffset++;
3485 break;
3486#if 0
3487 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3488 size = getExpansionCount(CE);
3489 CE = *CEOffset++;
3490 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
3491 for(i = 1; i<size; i++) {
3492 *(source->CEpos++) = *CEOffset++;
3493 }
3494 } else { /* else, we do */
3495 while(*CEOffset != 0) {
3496 *(source->CEpos++) = *CEOffset++;
3497 }
3498 }
3499#endif
b75a7d8f
A
3500 }
3501 return CE;
3502 }
b75a7d8f
A
3503 /* various implicits optimization */
3504 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3505 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3506 //return getImplicit(cp, source, 0x04000000);
3507 return getImplicit(cp, source);
3508 case IMPLICIT_TAG: /* everything that is not defined otherwise */
3509 /* UCA is filled with these. Tailorings are NOT_FOUND */
3510 //return getImplicit(cp, source, 0);
3511 return getImplicit(cp, source);
3512 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3513 return 0; /* broken surrogate sequence */
3514 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
3515 UChar nextChar;
3516 if( source->flags & UCOL_USE_ITERATOR) {
3517 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3518 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3519 source->iterator->next(source->iterator);
3520 return getImplicit(cp, source);
3521 } else {
3522 return 0;
3523 }
3524 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3525 U_IS_TRAIL((nextChar=*source->pos))) {
3526 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3527 source->pos++;
3528 return getImplicit(cp, source);
3529 } else {
3530 return 0; /* completely ignorable */
3531 }
3532 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3533 {
3534 const uint32_t
3535 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3536 //const uint32_t LCount = 19;
374ca955 3537 const uint32_t VCount = 21;
b75a7d8f
A
3538 const uint32_t TCount = 28;
3539 //const uint32_t NCount = VCount * TCount; // 588
3540 //const uint32_t SCount = LCount * NCount; // 11172
3541 uint32_t L = ch - SBase;
3542
3543 // divide into pieces
3544
3545 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3546 L /= TCount;
3547 uint32_t V = L % VCount;
3548 L /= VCount;
3549
3550 // offset them
3551
3552 L += LBase;
3553 V += VBase;
3554 T += TBase;
3555
3556 // return the first CE, but first put the rest into the expansion buffer
3557 if (!source->coll->image->jamoSpecial) { // FAST PATH
3558
3559 /**(source->CEpos++) = ucmpe32_get(UCA->mapping, V);*/
3560 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/
3561 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, V);
3562 if (T != TBase) {
3563 /**(source->CEpos++) = ucmpe32_get(UCA->mapping, T);*/
3564 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/
3565 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, T);
3566 }
3567
3568 /*return ucmpe32_get(UCA->mapping, L);*/ // return first one
3569 /*return UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/
3570 return UTRIE_GET32_FROM_LEAD(coll->mapping, L);
3571
3572 } else { // Jamo is Special
374ca955 3573 // Since Hanguls pass the FCD check, it is
b75a7d8f
A
3574 // guaranteed that we won't be in
3575 // the normalization buffer if something like this happens
3576 // However, if we are using a uchar iterator and normalization
3577 // is ON, the Hangul that lead us here is going to be in that
374ca955 3578 // normalization buffer. Here we want to restore the uchar
b75a7d8f
A
3579 // iterator state and pull out of the normalization buffer
3580 if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3581 source->flags = source->origFlags; // restore the iterator
3582 source->pos = NULL;
3583 }
3584 // Move Jamos into normalization buffer
3585 source->writableBuffer[0] = (UChar)L;
3586 source->writableBuffer[1] = (UChar)V;
3587 if (T != TBase) {
3588 source->writableBuffer[2] = (UChar)T;
3589 source->writableBuffer[3] = 0;
3590 } else {
3591 source->writableBuffer[2] = 0;
3592 }
3593
3594 source->fcdPosition = source->pos; // Indicate where to continue in main input string
3595 // after exhausting the writableBuffer
3596 source->pos = source->writableBuffer;
3597 source->origFlags = source->flags;
3598 source->flags |= UCOL_ITER_INNORMBUF;
3599 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3600
3601 return(UCOL_IGNORABLE);
3602 }
3603 }
3604 case CHARSET_TAG:
3605 /* not yet implemented */
3606 /* probably after 1.8 */
3607 return UCOL_NOT_FOUND;
3608 default:
3609 *status = U_INTERNAL_PROGRAM_ERROR;
3610 CE=0;
3611 break;
3612 }
3613 if (CE <= UCOL_NOT_FOUND) break;
3614 }
3615 return CE;
3616}
3617
3618
3619/* now uses Mark's getImplicitPrimary code */
3620static
3621inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3622 if(isNonChar(cp)) {
3623 return 0;
3624 }
3625
374ca955 3626 uint32_t r = uprv_uca_getImplicitPrimary(cp);
b75a7d8f
A
3627
3628 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3629 collationSource->toReturn = collationSource->CEpos;
3630 return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3631}
3632
3633/**
3634 * This function handles the special CEs like contractions, expansions,
3635 * surrogates, Thai.
3636 * It is called by both getPrevCE
3637 */
3638uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3639 collIterate *source,
3640 UErrorCode *status)
3641{
3642 const uint32_t *CEOffset = NULL;
3643 UChar *UCharOffset = NULL;
3644 UChar schar;
3645 const UChar *constart = NULL;
3646 uint32_t size;
3647 UChar buffer[UCOL_MAX_BUFFER];
3648 uint32_t *endCEBuffer;
3649 UChar *strbuffer;
3650 int32_t noChars = 0;
3651
3652 for(;;)
3653 {
3654 /* the only ces that loops are thai and contractions */
3655 switch (getCETag(CE))
3656 {
3657 case NOT_FOUND_TAG: /* this tag always returns */
3658 return CE;
3659 case SURROGATE_TAG: /* This is a surrogate pair */
3660 /* essentialy an engaged lead surrogate. */
3661 /* if you have encountered it here, it means that a */
3662 /* broken sequence was encountered and this is an error */
3663 return 0;
3664 case THAI_TAG:
3665 if ((source->flags & UCOL_ITER_INNORMBUF) || /* Already Swapped || */
3666 source->string == source->pos || /* At start of string.|| */
3667 /* previous char not Thai prevowel */
3668 /*UCOL_ISTHAIBASECONSONANT(*(source->pos)) == FALSE ||*/ // This is from the old specs - we now rearrange unconditionally
3669 UCOL_ISTHAIPREVOWEL(peekCharacter(source, -1)) == FALSE)
3670 //UCOL_ISTHAIPREVOWEL(*(source->pos - 1)) == FALSE)
3671 {
3672 /* Treat Thai as a length one expansion */
3673 /* find the offset to expansion table */
3674 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE);
3675 CE = *CEOffset ++;
3676 }
3677 else
3678 {
3679 /*
3680 Move the prevowel and the following base Consonant into the
3681 normalization buffer with their order swapped
3682 */
374ca955
A
3683 UChar32 cp = (UChar32)peekCharacter(source, 0);
3684 UBool reorder = TRUE;
3685
3686 int32_t decompLen = unorm_getDecomposition(cp, FALSE, source->writableBuffer, UCOL_WRITABLE_BUFFER_SIZE-1);
3687 if(decompLen < 0) {
3688 decompLen = -decompLen; // there was no decomposition
3689 } else { // we need to check if we will hit a contraction trigger because of decomposition
3690 int32_t i = decompLen;
3691 for(i = 0; i < decompLen; i++) {
3692 if(ucol_contractionEndCP(source->writableBuffer[i], coll)) {
3693 reorder = FALSE;
3694 }
3695 }
3696 }
3697
3698 UChar *tempbuffer = source->writableBuffer +
3699 (source->writableBufSize - 1);
3700 uprv_memcpy(tempbuffer-decompLen + 1, source->writableBuffer, sizeof(UChar)*decompLen);
3701 if(reorder) {
3702 *(tempbuffer - decompLen) = *(tempbuffer - decompLen + 1);
3703 *(tempbuffer - decompLen + 1) = peekCharacter(source, -1);
3704 } else {
3705 *(tempbuffer - decompLen) = peekCharacter(source, -1);
3706 }
3707 *(tempbuffer - decompLen - 1) = 0;
3708
3709
3710/*
b75a7d8f
A
3711 UChar *tempbuffer = source->writableBuffer +
3712 (source->writableBufSize - 1);
3713 *(tempbuffer - 2) = 0;
3714 *(tempbuffer - 1) = peekCharacter(source, 0);
3715 *(tempbuffer) = peekCharacter(source, -1);
374ca955 3716*/
b75a7d8f
A
3717 /*
3718 Indicate where to continue in main input string after exhausting
3719 the writableBuffer
3720 */
3721 if (source->pos - 1 == source->string) {
3722 source->fcdPosition = NULL;
3723 } else {
3724 source->fcdPosition = source->pos-2;
3725 }
3726
374ca955 3727 source->pos = tempbuffer+1; // we're doing predecrement, right?
b75a7d8f
A
3728 source->origFlags = source->flags;
3729 source->flags |= UCOL_ITER_INNORMBUF;
3730 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3731
3732 //CE = UCOL_IGNORABLE;
3733 return(UCOL_IGNORABLE);
3734 }
3735 break;
3736 case SPEC_PROC_TAG:
3737 {
3738 // Special processing is getting a CE that is preceded by a certain prefix
3739 // Currently this is only needed for optimizing Japanese length and iteration marks.
374ca955
A
3740 // When we encouter a special processing tag, we go backwards and try to see if
3741 // we have a match.
b75a7d8f
A
3742 // Contraction tables are used - so the whole process is not unlike contraction.
3743 // prefix data is stored backwards in the table.
3744 const UChar *UCharOffset;
3745 UChar schar, tchar;
3746 collIterateState prefixState;
3747 backupState(source, &prefixState);
3748 for(;;) {
3749 // This loop will run once per source string character, for as long as we
374ca955 3750 // are matching a potential contraction sequence
b75a7d8f 3751
374ca955 3752 // First we position ourselves at the begining of contraction sequence
b75a7d8f
A
3753 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3754
3755 if (collIter_bos(source)) {
3756 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3757 break;
3758 }
3759 schar = getPrevNormalizedChar(source);
3760 goBackOne(source);
3761
3762 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3763 UCharOffset++;
3764 }
3765
3766 if (schar == tchar) {
3767 // Found the source string char in the table.
3768 // Pick up the corresponding CE from the table.
3769 CE = *(coll->contractionCEs +
3770 (UCharOffset - coll->contractionIndex));
3771 }
3772 else
374ca955
A
3773 {
3774 // if there is a completely ignorable code point in the middle of
b75a7d8f
A
3775 // a prefix, we need to act as if it's not there
3776 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3777 // lone surrogates cannot be set to zero as it would break other processing
3778 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
3779 // it's easy for BMP code points
3780 if(isZeroCE == 0) {
3781 continue;
3782 } else if(UTF_IS_TRAIL(schar) || UTF_IS_LEAD(schar)) {
3783 // for supplementary code points, we have to check the next one
3784 // situations where we are going to ignore
3785 // 1. beginning of the string: schar is a lone surrogate
3786 // 2. schar is a lone surrogate
3787 // 3. schar is a trail surrogate in a valid surrogate sequence
3788 // that is explicitly set to zero.
3789 if (!collIter_bos(source)) {
3790 UChar lead;
3791 if(UTF_IS_LEAD(lead = getPrevNormalizedChar(source))) {
3792 isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, lead);
3793 if(getCETag(isZeroCE) == SURROGATE_TAG) {
3794 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, schar);
3795 if(finalCE == 0) {
3796 // this is a real, assigned completely ignorable code point
3797 goBackOne(source);
3798 continue;
3799 }
3800 }
3801 } else {
3802 // lone surrogate, completely ignorable
3803 continue;
3804 }
3805 } else {
3806 // lone surrogate at the beggining, completely ignorable
3807 continue;
3808 }
3809 }
3810 // Source string char was not in the table.
3811 // We have not found the prefix.
3812 CE = *(coll->contractionCEs +
3813 (ContractionStart - coll->contractionIndex));
3814 }
3815
3816 if(!isPrefix(CE)) {
3817 // The source string char was in the contraction table, and the corresponding
3818 // CE is not a prefix CE. We found the prefix, break
3819 // out of loop, this CE will end up being returned. This is the normal
3820 // way out of prefix handling when the source actually contained
3821 // the prefix.
3822 break;
3823 }
3824 }
3825 loadState(source, &prefixState, TRUE);
3826 break;
3827 }
3828
3829 case CONTRACTION_TAG:
3830 /* to ensure that the backwards and forwards iteration matches, we
3831 take the current region of most possible match and pass it through
3832 the forward iteration. this will ensure that the obstinate problem of
3833 overlapping contractions will not occur.
3834 */
3835 schar = peekCharacter(source, 0);
3836 constart = (UChar *)coll->image + getContractOffset(CE);
3837 if (isAtStartPrevIterate(source)
3838 /* commented away contraction end checks after adding the checks
3839 in getPrevCE */) {
3840 /* start of string or this is not the end of any contraction */
3841 CE = *(coll->contractionCEs +
3842 (constart - coll->contractionIndex));
3843 break;
3844 }
3845 strbuffer = buffer;
3846 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3847 *(UCharOffset --) = 0;
3848 noChars = 0;
3849 // have to swap thai characters
374ca955
A
3850 while (ucol_unsafeCP(schar, coll) || UCOL_ISTHAIPREVOWEL(peekCharacter(source, -1))) {
3851 // we might have ended here after trying to reorder Thai, but seeing that there are unsafe points
3852 // in the backward processing
b75a7d8f
A
3853 *(UCharOffset) = schar;
3854 noChars++;
3855 UCharOffset --;
3856 schar = getPrevNormalizedChar(source);
3857 goBackOne(source);
3858 // TODO: when we exhaust the contraction buffer,
3859 // it needs to get reallocated. The problem is
3860 // that the size depends on the string which is
3861 // not iterated over. However, since we're travelling
3862 // backwards, we already had to set the iterator at
3863 // the end - so we might as well know where we are?
3864 if (UCharOffset + 1 == buffer) {
3865 /* we have exhausted the buffer */
3866 int32_t newsize = 0;
3867 if(source->pos) { // actually dealing with a position
3868 newsize = source->pos - source->string + 1;
3869 } else { // iterator
3870 newsize = 4 * UCOL_MAX_BUFFER;
3871 }
3872 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3873 (newsize + UCOL_MAX_BUFFER));
374ca955
A
3874 /* test for NULL */
3875 if (strbuffer == NULL) {
3876 *status = U_MEMORY_ALLOCATION_ERROR;
3877 return UCOL_NO_MORE_CES;
3878 }
b75a7d8f
A
3879 UCharOffset = strbuffer + newsize;
3880 uprv_memcpy(UCharOffset, buffer,
3881 UCOL_MAX_BUFFER * sizeof(UChar));
3882 UCharOffset --;
3883 }
3884 if ((source->pos && (source->pos == source->string ||
3885 ((source->flags & UCOL_ITER_INNORMBUF) &&
3886 *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3887 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3888 break;
3889 }
3890 }
3891 /* adds the initial base character to the string */
3892 *(UCharOffset) = schar;
3893 noChars++;
3894
374ca955 3895 /* a new collIterate is used to simplify things, since using the current
b75a7d8f
A
3896 collIterate will mean that the forward and backwards iteration will
3897 share and change the same buffers. we don't want to get into that. */
3898 collIterate temp;
3899 //IInit_collIterate(coll, UCharOffset, -1, &temp);
3900 IInit_collIterate(coll, UCharOffset, noChars, &temp);
3901 temp.flags &= ~UCOL_ITER_NORM;
3902
3903 CE = ucol_IGetNextCE(coll, &temp, status);
3904 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3905 while (CE != UCOL_NO_MORE_CES) {
3906 *(source->CEpos ++) = CE;
3907 if (source->CEpos == endCEBuffer) {
3908 /* ran out of CE space, bail.
3909 there's no guarantee of the right character position after
3910 this bail*/
3911 *status = U_BUFFER_OVERFLOW_ERROR;
3912 source->CEpos = source->CEs;
3913 freeHeapWritableBuffer(&temp);
3914 if (strbuffer != buffer) {
3915 uprv_free(strbuffer);
3916 }
374ca955 3917 return (uint32_t)UCOL_NULLORDER;
b75a7d8f
A
3918 }
3919 CE = ucol_IGetNextCE(coll, &temp, status);
3920 }
3921 freeHeapWritableBuffer(&temp);
3922 if (strbuffer != buffer) {
3923 uprv_free(strbuffer);
3924 }
3925 source->toReturn = source->CEpos - 1;
3926 if (source->toReturn == source->CEs) {
3927 source->CEpos = source->CEs;
3928 }
3929 return *(source->toReturn);
3930 case LONG_PRIMARY_TAG:
3931 {
3932 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3933 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3934 source->toReturn = source->CEpos - 1;
3935 return *(source->toReturn);
3936 }
3937 case EXPANSION_TAG: /* this tag always returns */
3938 /*
3939 This should handle expansion.
3940 NOTE: we can encounter both continuations and expansions in an expansion!
3941 I have to decide where continuations are going to be dealt with
3942 */
3943 /* find the offset to expansion table */
3944 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3945 size = getExpansionCount(CE);
3946 if (size != 0) {
3947 /*
3948 if there are less than 16 elements in expansion, we don't terminate
3949 */
3950 uint32_t count;
3951 for (count = 0; count < size; count++) {
3952 *(source->CEpos ++) = *CEOffset++;
3953 }
3954 }
3955 else {
3956 /* else, we do */
3957 while (*CEOffset != 0) {
3958 *(source->CEpos ++) = *CEOffset ++;
3959 }
3960 }
3961 source->toReturn = source->CEpos - 1;
374ca955 3962 // in case of one element expansion, we
b75a7d8f
A
3963 // want to immediately return CEpos
3964 if(source->toReturn == source->CEs) {
3965 source->CEpos = source->CEs;
3966 }
3967 return *(source->toReturn);
374ca955 3968 case DIGIT_TAG:
b75a7d8f 3969 {
374ca955
A
3970 /*
3971 We do a check to see if we want to collate digits as numbers; if so we generate
b75a7d8f
A
3972 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3973 */
374ca955 3974 //uint32_t size;
b75a7d8f 3975 uint32_t i; /* general counter */
374ca955
A
3976 collIterateState state;
3977
3978 if (source->coll->numericCollation == UCOL_ON){
3979 UChar32 char32 = 0;
3980
3981 uint32_t digIndx = 0;
3982 uint32_t endIndex = 0;
3983 uint32_t leadingZeroIndex = 0;
3984 uint32_t trailingZeroCount = 0;
3985
3986 uint32_t primWeight = 0;
3987
3988 int32_t digVal = 0;
3989 uint8_t collateVal = 0;
3990
3991 UBool nonZeroValReached = FALSE;
3992
3993 uint8_t *numTempBuf;
3994 uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs.
3995 uint32_t numTempBufSize = UCOL_MAX_BUFFER;
3996
3997 numTempBuf = stackNumTempBuf;
3998 /*
3999 We parse the source string until we hit a char that's NOT a digit.
4000 Use this u_charDigitValue. This might be slow because we have to
4001 handle surrogates...
4002 */
4003
4004 if (U16_IS_TRAIL (ch)){
4005 if (!collIter_bos(source)){
4006 UChar lead = getPrevNormalizedChar(source);
4007 if(U16_IS_LEAD(lead)) {
4008 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
4009 goBackOne(source);
4010 } else {
4011 char32 = ch;
4012 }
4013 } else {
4014 char32 = ch;
4015 }
4016 } else {
4017 char32 = ch;
4018 }
4019 digVal = u_charDigitValue(char32);
4020
4021 for(;;){
4022 // Make sure we have enough space.
4023 if (digIndx >= ((numTempBufSize - 2) * 2) + 1)
4024 {
4025 numTempBufSize *= 2;
4026 if (numTempBuf == stackNumTempBuf){
4027 numTempBuf = (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize);
4028 uprv_memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
4029 }else
4030 uprv_realloc(numTempBuf, numTempBufSize);
4031 }
4032
4033 // Skip over trailing zeroes, and keep a count of them.
4034 if (digVal != 0)
4035 nonZeroValReached = TRUE;
4036 if (nonZeroValReached){
4037 /*
4038 We parse the digit string into base 100 numbers (this fits into a byte).
4039 We only add to the buffer in twos, thus if we are parsing an odd character,
4040 that serves as the 'tens' digit while the if we are parsing an even one, that
4041 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
4042 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
4043 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
4044 than all the other bytes.
4045
4046 Since we're doing in this reverse we want to put the first digit encountered into the
4047 ones place and the second digit encountered into the tens place.
4048 */
4049
4050 if ((digIndx + trailingZeroCount) % 2 == 1){
4051 // High-order digit case (tens place)
4052 collateVal += (uint8_t)(digVal * 10);
4053
4054 // We cannot set leadingZeroIndex unless it has been set for the
4055 // low-order digit. Therefore, all we can do for the high-order
4056 // digit is turn it off, never on.
4057 // The only time we will have a high digit without a low is for
4058 // the very first non-zero digit, so no zero check is necessary.
4059 if (collateVal != 0)
4060 leadingZeroIndex = 0;
4061
4062 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
4063 collateVal = 0;
4064 }
4065 else{
4066 // Low-order digit case (ones place)
4067 collateVal = (uint8_t)digVal;
4068
4069 // Check for leading zeroes.
4070 if (collateVal == 0)
4071 {
4072 if (!leadingZeroIndex)
4073 leadingZeroIndex = (digIndx/2) + 2;
4074 }
4075 else
4076 leadingZeroIndex = 0;
4077
4078 // No need to write to buffer; the case of a last odd digit
4079 // is handled below.
4080 }
4081 ++digIndx;
4082 }
4083 else
4084 ++trailingZeroCount;
4085
4086 if (!collIter_bos(source)){
4087 ch = getPrevNormalizedChar(source);
4088 //goBackOne(source);
4089 if (U16_IS_TRAIL(ch)){
4090 backupState(source, &state);
4091 if (!collIter_bos(source))
4092 {
4093 goBackOne(source);
4094 UChar lead = getPrevNormalizedChar(source);
4095 if(U16_IS_LEAD(lead)) {
4096 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
4097 } else {
4098 loadState(source, &state, FALSE);
4099 char32 = ch;
4100 }
4101 }
4102 }
4103 else
4104 char32 = ch;
4105
4106 if ((digVal = u_charDigitValue(char32)) == -1){
4107 if (char32 > 0xFFFF) {// For surrogates.
4108 loadState(source, &state, FALSE);
4109 }
4110 // Don't need to "reverse" the goBackOne call,
4111 // as this points to the next position to process..
4112 //if (char32 > 0xFFFF) // For surrogates.
4113 //getNextNormalizedChar(source);
4114 break;
4115 }
4116 goBackOne(source);
4117 }else
4118 break;
4119 }
4120
4121 if (nonZeroValReached == FALSE){
4122 digIndx = 2;
4123 trailingZeroCount = 0;
4124 numTempBuf[2] = 6;
4125 }
4126
4127 if ((digIndx + trailingZeroCount) % 2 != 0){
4128 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
4129 digIndx += 1; // The implicit leading zero
4130 }
4131 if (trailingZeroCount % 2 != 0){
4132 // We had to consume one trailing zero for the low digit
4133 // of the least significant byte
4134 digIndx += 1; // The trailing zero not in the exponent
4135 trailingZeroCount -= 1;
4136 }
4137
4138 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
4139
4140 // Subtract one off of the last byte. Really the first byte here, but it's reversed...
4141 numTempBuf[2] -= 1;
4142
4143 /*
4144 We want to skip over the first two slots in the buffer. The first slot
4145 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
4146 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
4147 The exponent must be adjusted by the number of leading zeroes, and the number of
4148 trailing zeroes.
4149 */
4150 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
4151 uint32_t exponent = (digIndx+trailingZeroCount)/2;
4152 if (leadingZeroIndex)
4153 exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
4154 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
4155
4156 // Now transfer the collation key to our collIterate struct.
4157 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
4158 //size = ((endIndex+1) & ~1)/2;
4159 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
4160 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
4161 UCOL_BYTE_COMMON; // Tertiary weight.
4162 i = endIndex - 1; // Reset the index into the buffer.
4163 while(i >= 2)
4164 {
4165 primWeight = numTempBuf[i--] << 8;
4166 if ( i >= 2)
4167 primWeight |= numTempBuf[i--];
4168 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
4169 }
4170 if (numTempBuf != stackNumTempBuf)
4171 uprv_free(numTempBuf);
4172
4173 source->toReturn = source->CEpos -1;
4174 return *(source->toReturn);
b75a7d8f 4175 }
374ca955
A
4176 else {
4177 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
4178 CE = *(CEOffset++);
4179 break;
4180#if 0
4181 /* find the offset to expansion table */
4182 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
4183 size = getExpansionCount(CE);
4184 if (size != 0) {
4185 /*
4186 if there are less than 16 elements in expansion, we don't terminate
4187 */
4188 uint32_t count;
4189 for (count = 0; count < size; count++) {
4190 *(source->CEpos ++) = *CEOffset++;
4191 }
4192 }
4193 else {
4194 /* else, we do */
4195 while (*CEOffset != 0) {
4196 *(source->CEpos ++) = *CEOffset ++;
4197 }
4198 }
4199 source->toReturn = source->CEpos - 1;
4200 // in case of one element expansion, we
b75a7d8f
A
4201 // want to immediately return CEpos
4202 if(source->toReturn == source->CEs) {
4203 source->CEpos = source->CEs;
4204 }
374ca955
A
4205 return *(source->toReturn);
4206#endif
4207 }
4208 }
b75a7d8f
A
4209 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
4210 {
4211 const uint32_t
4212 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
374ca955 4213 //const uint32_t LCount = 19;
b75a7d8f
A
4214 const uint32_t VCount = 21;
4215 const uint32_t TCount = 28;
4216 //const uint32_t NCount = VCount * TCount; /* 588 */
4217 //const uint32_t SCount = LCount * NCount; /* 11172 */
4218
4219 uint32_t L = ch - SBase;
4220 /*
4221 divide into pieces.
4222 we do it in this order since some compilers can do % and / in one
4223 operation
4224 */
4225 uint32_t T = L % TCount;
4226 L /= TCount;
4227 uint32_t V = L % VCount;
4228 L /= VCount;
4229
4230 /* offset them */
4231 L += LBase;
4232 V += VBase;
4233 T += TBase;
4234
4235 /*
4236 return the first CE, but first put the rest into the expansion buffer
4237 */
4238 if (!source->coll->image->jamoSpecial)
4239 {
4240 /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, L);*/
4241 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/
4242 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, L);
4243 /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, V);*/
4244 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/
4245 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, V);
4246 if (T != TBase)
4247 /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, T);*/
4248 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/
4249 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, T);
4250
4251 source->toReturn = source->CEpos - 1;
4252 return *(source->toReturn);
4253 } else {
374ca955 4254 // Since Hanguls pass the FCD check, it is
b75a7d8f
A
4255 // guaranteed that we won't be in
4256 // the normalization buffer if something like this happens
4257 // Move Jamos into normalization buffer
4258 /*
4259 Move the Jamos into the
374ca955 4260 normalization buffer
b75a7d8f
A
4261 */
4262 UChar *tempbuffer = source->writableBuffer +
4263 (source->writableBufSize - 1);
4264 *(tempbuffer) = 0;
4265 if (T != TBase) {
4266 *(tempbuffer - 1) = (UChar)T;
4267 *(tempbuffer - 2) = (UChar)V;
4268 *(tempbuffer - 3) = (UChar)L;
4269 *(tempbuffer - 4) = 0;
4270 } else {
4271 *(tempbuffer - 1) = (UChar)V;
4272 *(tempbuffer - 2) = (UChar)L;
4273 *(tempbuffer - 3) = 0;
4274 }
4275
4276 /*
4277 Indicate where to continue in main input string after exhausting
4278 the writableBuffer
4279 */
4280 if (source->pos == source->string) {
4281 source->fcdPosition = NULL;
4282 } else {
4283 source->fcdPosition = source->pos-1;
4284 }
4285
4286 source->pos = tempbuffer;
4287 source->origFlags = source->flags;
4288 source->flags |= UCOL_ITER_INNORMBUF;
4289 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
4290
4291 return(UCOL_IGNORABLE);
4292 }
4293 }
4294 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
4295 return 0; /* broken surrogate sequence */
4296 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
4297 {
4298 UChar32 cp = 0;
4299 UChar prevChar;
4300 UChar *prev;
4301 if (isAtStartPrevIterate(source)) {
4302 /* we are at the start of the string, wrong place to be at */
4303 return 0;
4304 }
4305 if (source->pos != source->writableBuffer) {
4306 prev = source->pos - 1;
4307 } else {
4308 prev = source->fcdPosition;
4309 }
4310 prevChar = *prev;
4311
4312 /* Handles Han and Supplementary characters here.*/
4313 if (UTF_IS_FIRST_SURROGATE(prevChar)) {
4314 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
4315 source->pos = prev;
4316 } else {
4317 return 0; /* completely ignorable */
4318 }
4319 return getPrevImplicit(cp, source);
4320 }
4321 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
4322 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
4323 return getPrevImplicit(ch, source);
4324 case IMPLICIT_TAG: /* everything that is not defined otherwise */
4325 return getPrevImplicit(ch, source);
4326 /* UCA is filled with these. Tailorings are NOT_FOUND */
4327 /* not yet implemented */
4328 case CHARSET_TAG: /* this tag always returns */
4329 /* probably after 1.8 */
4330 return UCOL_NOT_FOUND;
4331 default: /* this tag always returns */
4332 *status = U_INTERNAL_PROGRAM_ERROR;
4333 CE=0;
4334 break;
4335 }
4336 if (CE <= UCOL_NOT_FOUND) {
4337 break;
4338 }
4339 }
4340 return CE;
4341}
4342
4343/* This should really be a macro */
4344/* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
4345/* anyway */
4346static
4347uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) {
4348#ifdef UCOL_DEBUG
4349 fprintf(stderr, ".");
4350#endif
4351 uint8_t *newStart = NULL;
4352 uint32_t offset = *secondaries-secStart;
4353
4354 if(secStart==second) {
4355 newStart=(uint8_t*)uprv_malloc(newSize);
4356 if(newStart==NULL) {
4357 *status = U_MEMORY_ALLOCATION_ERROR;
4358 return NULL;
4359 }
4360 uprv_memcpy(newStart, secStart, *secondaries-secStart);
4361 } else {
4362 newStart=(uint8_t*)uprv_realloc(secStart, newSize);
4363 if(newStart==NULL) {
4364 *status = U_MEMORY_ALLOCATION_ERROR;
4365 return NULL;
4366 }
4367 }
4368 *secondaries=newStart+offset;
4369 *secSize=newSize;
4370 return newStart;
4371}
4372
4373
/*
 * Reverses, in place, the elements between two pointers (both inclusive).
 * Needed when doing continuation secondaries in French.
 *
 * TYPE  - element type the pointers refer to
 * start - modifiable pointer to the first element
 * end   - modifiable pointer to the last element
 *
 * This is a macro rather than a function so that it works for any element type.
 * Both pointer arguments are evaluated several times and are modified:
 * when the macro finishes they have met or crossed in the middle of the range.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
    while((start)<(end)) { \
        TYPE reverseTmp = *(start); \
        *(start)++ = *(end); \
        *(end)-- = reverseTmp; \
    } \
}
4396
4397/****************************************************************************/
4398/* Following are the sortkey generation functions */
4399/* */
4400/****************************************************************************/
4401
4402/**
4403 * Merge two sort keys.
4404 * This is useful, for example, to combine sort keys from first and last names
4405 * to sort such pairs.
4406 * Merged sort keys consider on each collation level the first part first entirely,
4407 * then the second one.
4408 * It is possible to merge multiple sort keys by consecutively merging
4409 * another one with the intermediate result.
4410 *
4411 * The length of the merge result is the sum of the lengths of the input sort keys
4412 * minus 1.
4413 *
4414 * @param src1 the first sort key
4415 * @param src1Length the length of the first sort key, including the zero byte at the end;
4416 * can be -1 if the function is to find the length
4417 * @param src2 the second sort key
4418 * @param src2Length the length of the second sort key, including the zero byte at the end;
4419 * can be -1 if the function is to find the length
4420 * @param dest the buffer where the merged sort key is written,
4421 * can be NULL if destCapacity==0
4422 * @param destCapacity the number of bytes in the dest buffer
4423 * @return the length of the merged sort key, src1Length+src2Length-1;
4424 * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
4425 * in which cases the contents of dest is undefined
4426 *
4427 * @draft
4428 */
4429U_CAPI int32_t U_EXPORT2
4430ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
4431 const uint8_t *src2, int32_t src2Length,
4432 uint8_t *dest, int32_t destCapacity) {
4433 int32_t destLength;
4434 uint8_t b;
4435
4436 /* check arguments */
4437 if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
4438 src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
4439 destCapacity<0 || (destCapacity>0 && dest==NULL)
4440 ) {
4441 /* error, attempt to write a zero byte and return 0 */
4442 if(dest!=NULL && destCapacity>0) {
4443 *dest=0;
4444 }
4445 return 0;
4446 }
4447
4448 /* check lengths and capacity */
4449 if(src1Length<0) {
4450 src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
4451 }
4452 if(src2Length<0) {
4453 src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
4454 }
4455
4456 destLength=src1Length+src2Length-1;
4457 if(destLength>destCapacity) {
4458 /* the merged sort key does not fit into the destination */
4459 return destLength;
4460 }
4461
4462 /* merge the sort keys with the same number of levels */
4463 while(*src1!=0 && *src2!=0) { /* while both have another level */
4464 /* copy level from src1 not including 00 or 01 */
4465 while((b=*src1)>=2) {
4466 ++src1;
4467 *dest++=b;
4468 }
4469
4470 /* add a 02 merge separator */
4471 *dest++=2;
4472
4473 /* copy level from src2 not including 00 or 01 */
4474 while((b=*src2)>=2) {
4475 ++src2;
4476 *dest++=b;
4477 }
4478
4479 /* if both sort keys have another level, then add a 01 level separator and continue */
4480 if(*src1==1 && *src2==1) {
4481 ++src1;
4482 ++src2;
4483 *dest++=1;
4484 }
4485 }
4486
4487 /*
4488 * here, at least one sort key is finished now, but the other one
4489 * might have some contents left from containing more levels;
4490 * that contents is just appended to the result
4491 */
4492 if(*src1!=0) {
4493 /* src1 is not finished, therefore *src2==0, and src1 is appended */
4494 src2=src1;
4495 }
4496 /* append src2, "the other, unfinished sort key" */
4497 uprv_strcpy((char *)dest, (const char *)src2);
4498
4499 /* trust that neither sort key contained illegally embedded zero bytes */
4500 return destLength;
4501}
4502
4503/* sortkey API */
4504U_CAPI int32_t U_EXPORT2
4505ucol_getSortKey(const UCollator *coll,
4506 const UChar *source,
4507 int32_t sourceLength,
4508 uint8_t *result,
4509 int32_t resultLength)
4510{
374ca955
A
4511 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4512 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4513 int32_t actualSrcLen = sourceLength;
4514 if (actualSrcLen==-1 && source!=NULL) {
4515 actualSrcLen = u_strlen(source);
4516 }
4517 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source, actualSrcLen);
4518 }
4519
b75a7d8f 4520 UErrorCode status = U_ZERO_ERROR;
374ca955 4521 int32_t keySize = 0;
b75a7d8f 4522
374ca955
A
4523 if(source != NULL) {
4524 // source == NULL is actually an error situation, but we would need to
4525 // have an error code to return it. Until we introduce a new
4526 // API, it stays like this
b75a7d8f 4527
374ca955
A
4528 /* this uses the function pointer that is set in updateinternalstate */
4529 /* currently, there are two funcs: */
4530 /*ucol_calcSortKey(...);*/
4531 /*ucol_calcSortKeySimpleTertiary(...);*/
4532
4533 keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status);
4534 //((UCollator *)coll)->errorCode = status; /*semantically const */
4535 }
4536 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4537 UTRACE_EXIT_STATUS(status);
b75a7d8f
A
4538 return keySize;
4539}
4540
4541/* this function is called by the C++ API for sortkey generation */
4542U_CFUNC int32_t
4543ucol_getSortKeyWithAllocation(const UCollator *coll,
4544 const UChar *source, int32_t sourceLength,
4545 uint8_t **pResult,
4546 UErrorCode *pErrorCode) {
4547 *pResult = 0;
4548 return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode);
4549}
4550
4551#define UCOL_FSEC_BUF_SIZE 256
4552
4553/* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0 */
4554/* or if we run out of space while making a sortkey and want to return ASAP */
4555int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) {
4556 UErrorCode status = U_ZERO_ERROR;
374ca955 4557 const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
b75a7d8f
A
4558 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4559 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4560 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4561 UBool compareIdent = (strength == UCOL_IDENTICAL);
4562 UBool doCase = (coll->caseLevel == UCOL_ON);
4563 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
4564 //UBool qShifted = shifted && (compareQuad == 0);
4565 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4566 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4567 uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE];
4568 uint8_t *fSecs = fSecsBuff;
4569 uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE;
4570 uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL;
4571
4572 uint32_t variableTopValue = coll->variableTopValue;
374ca955 4573 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
b75a7d8f
A
4574 if(doHiragana) {
4575 UCOL_COMMON_BOT4++;
4576 /* allocate one more space for hiragana */
4577 }
4578 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4579
4580 uint32_t order = UCOL_NO_MORE_CES;
4581 uint8_t primary1 = 0;
4582 uint8_t primary2 = 0;
4583 uint8_t secondary = 0;
4584 uint8_t tertiary = 0;
4585 int32_t caseShift = 0;
4586 uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */
4587
4588 uint8_t caseSwitch = coll->caseSwitch;
4589 uint8_t tertiaryMask = coll->tertiaryMask;
4590 uint8_t tertiaryCommon = coll->tertiaryCommon;
4591
4592 UBool wasShifted = FALSE;
4593 UBool notIsContinuation = FALSE;
4594 uint8_t leadPrimary = 0;
4595
4596
4597 for(;;) {
4598 order = ucol_IGetNextCE(coll, s, &status);
4599 if(order == UCOL_NO_MORE_CES) {
4600 break;
4601 }
4602
4603 if(order == 0) {
4604 continue;
4605 }
4606
4607 notIsContinuation = !isContinuation(order);
4608
4609
4610 if(notIsContinuation) {
4611 tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK));
4612 } else {
4613 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4614 }
4615 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4616 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4617 primary1 = (uint8_t)(order >> 8);
4618
4619
4620 if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
374ca955 4621 || (!notIsContinuation && wasShifted))
b75a7d8f
A
4622 || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
4623 /* and other ignorables should be removed if following a shifted code point */
4624 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4625 /* we should just completely ignore it */
4626 continue;
4627 }
4628 if(compareQuad == 0) {
4629 if(c4 > 0) {
4630 currentSize += (c2/UCOL_BOT_COUNT4)+1;
4631 c4 = 0;
4632 }
4633 currentSize++;
4634 if(primary2 != 0) {
4635 currentSize++;
4636 }
4637 }
4638 wasShifted = TRUE;
4639 } else {
4640 wasShifted = FALSE;
4641 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4642 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
4643 /* calculate sortkey size */
4644 if(primary1 != UCOL_IGNORABLE) {
4645 if(notIsContinuation) {
4646 if(leadPrimary == primary1) {
4647 currentSize++;
4648 } else {
4649 if(leadPrimary != 0) {
4650 currentSize++;
4651 }
4652 if(primary2 == UCOL_IGNORABLE) {
4653 /* one byter, not compressed */
4654 currentSize++;
4655 leadPrimary = 0;
4656 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4657 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
4658 (primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4659 /* not compressible */
4660 leadPrimary = 0;
4661 currentSize+=2;
4662 } else { /* compress */
4663 leadPrimary = primary1;
4664 currentSize+=2;
4665 }
4666 }
4667 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4668 currentSize++;
4669 if(primary2 != UCOL_IGNORABLE) {
4670 currentSize++;
4671 }
4672 }
4673 }
4674
4675 if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */
4676 if(!isFrenchSec){
4677 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4678 c2++;
4679 } else {
4680 if(c2 > 0) {
4681 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4682 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1;
4683 } else {
4684 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1;
4685 }
4686 c2 = 0;
4687 }
4688 currentSize++;
4689 }
4690 } else {
4691 fSecs[fSecsLen++] = secondary;
4692 if(fSecsLen == fSecsMaxLen) {
4693 if(fSecs == fSecsBuff) {
4694 fSecs = (uint8_t *)uprv_malloc(2*fSecsLen);
4695 } else {
4696 fSecs = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen);
4697 }
4698 if(fSecs == NULL) {
4699 status = U_MEMORY_ALLOCATION_ERROR;
4700 return -1;
4701 }
4702 fSecsMaxLen *= 2;
4703 }
4704 if(notIsContinuation) {
4705 if (frenchStartPtr != NULL) {
4706 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4707 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4708 frenchStartPtr = NULL;
4709 }
4710 } else {
4711 if (frenchStartPtr == NULL) {
4712 frenchStartPtr = fSecs+fSecsLen-2;
4713 }
4714 frenchEndPtr = fSecs+fSecsLen-1;
4715 }
4716 }
4717 }
4718
4719 if(doCase) {
4720 if (caseShift == 0) {
4721 currentSize++;
4722 caseShift = UCOL_CASE_SHIFT_START;
4723 }
4724 if((tertiary&0x3F) > 0 && notIsContinuation) {
4725 caseShift--;
4726 if((tertiary &0xC0) != 0) {
4727 if (caseShift == 0) {
4728 currentSize++;
4729 caseShift = UCOL_CASE_SHIFT_START;
4730 }
4731 caseShift--;
4732 }
4733 }
4734 } else {
4735 if(notIsContinuation) {
4736 tertiary ^= caseSwitch;
4737 }
4738 }
4739
4740 tertiary &= tertiaryMask;
4741 if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */
4742 if (tertiary == tertiaryCommon && notIsContinuation) {
4743 c3++;
4744 } else {
4745 if(c3 > 0) {
4746 if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
4747 || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
4748 currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1;
4749 } else {
4750 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1;
4751 }
4752 c3 = 0;
4753 }
4754 currentSize++;
4755 }
4756 }
4757
4758 if(/*qShifted*/(compareQuad==0) && notIsContinuation) {
4759 if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4760 if(c4>0) { // Close this part
4761 currentSize += (c4/UCOL_BOT_COUNT4)+1;
4762 c4 = 0;
4763 }
4764 currentSize++; // Add the Hiragana
4765 } else { // This wasn't Hiragana, so we can continue adding stuff
4766 c4++;
4767 }
4768 }
4769
4770 }
4771 }
4772
4773 if(!isFrenchSec){
4774 if(c2 > 0) {
4775 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4776 }
4777 } else {
4778 uint32_t i = 0;
4779 if(frenchStartPtr != NULL) {
4780 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4781 }
4782 for(i = 0; i<fSecsLen; i++) {
4783 secondary = *(fSecs+fSecsLen-i-1);
4784 /* This is compression code. */
4785 if (secondary == UCOL_COMMON2) {
4786 ++c2;
4787 } else {
4788 if(c2 > 0) {
4789 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4790 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0);
4791 } else {
4792 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4793 }
4794 c2 = 0;
4795 }
4796 currentSize++;
4797 }
4798 }
4799 if(c2 > 0) {
4800 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4801 }
4802 if(fSecs != fSecsBuff) {
4803 uprv_free(fSecs);
4804 }
4805 }
4806
4807 if(c3 > 0) {
4808 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0);
4809 }
4810
4811 if(c4 > 0 && compareQuad == 0) {
4812 currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0);
4813 }
4814
4815 if(compareIdent) {
4816 currentSize += u_lengthOfIdenticalLevelRun(s->string, len);
4817 }
4818 return currentSize;
4819
4820}
4821
4822static
4823inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) {
4824 if (caseShift == 0) {
4825 *(*cases)++ = UCOL_CASE_BYTE_START;
4826 caseShift = UCOL_CASE_SHIFT_START;
4827 }
4828}
4829
// Bounded append. The counter is bumped on every call, so afterwards it tells how
// many values were requested even when some of them did not fit; the value itself
// is stored (and the write position advanced) only while the position is below limit.
static
inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
    ++size;
    if(primaries >= limit) {
        return;
    }
    *primaries = value;
    ++primaries;
}
4839
374ca955 4840// Packs the secondary buffer when processing French locale. Adds the terminator.
b75a7d8f
A
4841static
4842inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) {
4843 uint8_t secondary;
4844 int32_t count2 = 0;
4845 uint32_t i = 0, size = 0;
4846 // we use i here since the key size already accounts for terminators, so we'll discard the increment
374ca955 4847 addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR);
b75a7d8f
A
4848 /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
4849 if(frenchStartPtr != NULL) {
4850 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4851 }
4852 for(i = 0; i<*secsize; i++) {
4853 secondary = *(secondaries-i-1);
4854 /* This is compression code. */
4855 if (secondary == UCOL_COMMON2) {
4856 ++count2;
4857 } else {
4858 if (count2 > 0) {
4859 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4860 while (count2 > UCOL_TOP_COUNT2) {
4861 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
4862 count2 -= (uint32_t)UCOL_TOP_COUNT2;
4863 }
4864 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
4865 } else {
4866 while (count2 > UCOL_BOT_COUNT2) {
4867 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4868 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4869 }
4870 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4871 }
4872 count2 = 0;
4873 }
4874 addWithIncrement(primaries, primEnd, size, secondary);
4875 }
4876 }
4877 if (count2 > 0) {
4878 while (count2 > UCOL_BOT_COUNT2) {
4879 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4880 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4881 }
4882 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4883 }
4884 *secsize = size;
4885 return primaries;
4886}
4887
4888/* This is the sortkey work horse function */
4889U_CFUNC int32_t U_CALLCONV
4890ucol_calcSortKey(const UCollator *coll,
4891 const UChar *source,
4892 int32_t sourceLength,
4893 uint8_t **result,
4894 uint32_t resultLength,
4895 UBool allocateSKBuffer,
4896 UErrorCode *status)
4897{
374ca955
A
4898 const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4899
b75a7d8f
A
4900 uint32_t i = 0; /* general purpose counter */
4901
4902 /* Stack allocated buffers for buffers we use */
4903 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER];
4904
4905 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad;
4906
4907 if(U_FAILURE(*status)) {
4908 return 0;
4909 }
4910
4911 if(primaries == NULL && allocateSKBuffer == TRUE) {
4912 primaries = *result = prim;
4913 resultLength = UCOL_PRIMARY_MAX_BUFFER;
4914 }
4915
4916 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER,
4917 caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER;
4918
4919 uint32_t sortKeySize = 1; /* it is always \0 terminated */
4920
4921 UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
4922 UChar *normSource = normBuffer;
4923 int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
4924
4925 int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
4926
4927 UColAttributeValue strength = coll->strength;
4928
4929 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4930 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4931 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4932 UBool compareIdent = (strength == UCOL_IDENTICAL);
4933 UBool doCase = (coll->caseLevel == UCOL_ON);
4934 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4935 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
4936 //UBool qShifted = shifted && (compareQuad == 0);
4937 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4938 const uint8_t *scriptOrder = coll->scriptOrder;
4939
4940 uint32_t variableTopValue = coll->variableTopValue;
4941 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4942 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4943 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4944 uint8_t UCOL_HIRAGANA_QUAD = 0;
4945 if(doHiragana) {
4946 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
4947 /* allocate one more space for hiragana, value for hiragana */
4948 }
4949 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4950
4951 /* support for special features like caselevel and funky secondaries */
4952 uint8_t *frenchStartPtr = NULL;
4953 uint8_t *frenchEndPtr = NULL;
4954 uint32_t caseShift = 0;
4955
4956 sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0));
4957
4958 /* If we need to normalize, we'll do it all at once at the beginning! */
4959 UNormalizationMode normMode;
4960 if(compareIdent) {
4961 normMode = UNORM_NFD;
4962 } else if(coll->normalizationMode != UCOL_OFF) {
4963 normMode = UNORM_FCD;
4964 } else {
4965 normMode = UNORM_NONE;
4966 }
4967
4968 if(normMode != UNORM_NONE && UNORM_YES != unorm_quickCheck(source, len, normMode, status)) {
4969 len = unorm_internalNormalize(normSource, normSourceLen,
4970 source, len,
4971 normMode, FALSE,
4972 status);
4973 if(*status == U_BUFFER_OVERFLOW_ERROR) {
4974 normSourceLen = len;
4975 normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
4976 if(normSource == NULL) {
4977 *status = U_MEMORY_ALLOCATION_ERROR;
4978 return 0;
4979 }
4980 *status = U_ZERO_ERROR;
4981 len = unorm_internalNormalize(normSource, normSourceLen,
4982 source, len,
4983 normMode, FALSE,
4984 status);
4985 }
4986
4987 if(U_FAILURE(*status)) {
4988 return 0;
4989 }
4990 source = normSource;
4991 }
4992
4993 collIterate s;
4994 IInit_collIterate(coll, (UChar *)source, len, &s);
4995 if(source == normSource) {
4996 s.flags &= ~UCOL_ITER_NORM;
4997 }
4998
4999 if(resultLength == 0 || primaries == NULL) {
5000 int32_t keyLen = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
5001 if(normSource != normBuffer) {
5002 uprv_free(normSource);
5003 }
5004 return keyLen;
5005 }
5006 uint8_t *primarySafeEnd = primaries + resultLength - 2;
5007
5008 uint32_t minBufferSize = UCOL_MAX_BUFFER;
5009
5010 uint8_t *primStart = primaries;
5011 uint8_t *secStart = secondaries;
5012 uint8_t *terStart = tertiaries;
5013 uint8_t *caseStart = cases;
5014 uint8_t *quadStart = quads;
5015
5016 uint32_t order = 0;
5017
5018 uint8_t primary1 = 0;
5019 uint8_t primary2 = 0;
5020 uint8_t secondary = 0;
5021 uint8_t tertiary = 0;
5022 uint8_t caseSwitch = coll->caseSwitch;
5023 uint8_t tertiaryMask = coll->tertiaryMask;
5024 int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition;
5025 uint8_t tertiaryTop = coll->tertiaryTop;
5026 uint8_t tertiaryBottom = coll->tertiaryBottom;
5027 uint8_t tertiaryCommon = coll->tertiaryCommon;
5028 uint8_t caseBits = 0;
5029
5030 UBool finished = FALSE;
5031 UBool wasShifted = FALSE;
5032 UBool notIsContinuation = FALSE;
5033
5034 uint32_t prevBuffSize = 0;
5035
5036 uint32_t count2 = 0, count3 = 0, count4 = 0;
5037 uint8_t leadPrimary = 0;
5038
5039 for(;;) {
5040 for(i=prevBuffSize; i<minBufferSize; ++i) {
5041
5042 order = ucol_IGetNextCE(coll, &s, status);
5043 if(order == UCOL_NO_MORE_CES) {
5044 finished = TRUE;
5045 break;
5046 }
5047
5048 if(order == 0) {
5049 continue;
5050 }
5051
5052 notIsContinuation = !isContinuation(order);
5053
5054 if(notIsContinuation) {
5055 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
5056 } else {
5057 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5058 }
5059
5060 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5061 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5062 primary1 = (uint8_t)(order >> 8);
5063
5064 if(notIsContinuation) {
5065 if(scriptOrder != NULL) {
5066 primary1 = scriptOrder[primary1];
5067 }
5068 }
5069
5070 if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
5071 || (!notIsContinuation && wasShifted))
5072 || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
5073 /* and other ignorables should be removed if following a shifted code point */
5074 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
5075 /* we should just completely ignore it */
5076 continue;
5077 }
5078 if(compareQuad == 0) {
5079 if(count4 > 0) {
5080 while (count4 > UCOL_BOT_COUNT4) {
5081 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
5082 count4 -= UCOL_BOT_COUNT4;
5083 }
5084 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
5085 count4 = 0;
5086 }
5087 /* We are dealing with a variable and we're treating them as shifted */
5088 /* This is a shifted ignorable */
5089 if(primary1 != 0) { /* we need to check this since we could be in continuation */
5090 *quads++ = primary1;
5091 }
5092 if(primary2 != 0) {
5093 *quads++ = primary2;
5094 }
5095 }
5096 wasShifted = TRUE;
5097 } else {
5098 wasShifted = FALSE;
5099 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5100 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
5101 /* regular and simple sortkey calc */
5102 if(primary1 != UCOL_IGNORABLE) {
5103 if(notIsContinuation) {
5104 if(leadPrimary == primary1) {
5105 *primaries++ = primary2;
5106 } else {
5107 if(leadPrimary != 0) {
5108 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5109 }
5110 if(primary2 == UCOL_IGNORABLE) {
5111 /* one byter, not compressed */
5112 *primaries++ = primary1;
5113 leadPrimary = 0;
5114 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
5115 (primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
5116 /* not compressible */
5117 leadPrimary = 0;
5118 *primaries++ = primary1;
5119 *primaries++ = primary2;
5120 } else { /* compress */
5121 *primaries++ = leadPrimary = primary1;
5122 *primaries++ = primary2;
5123 }
5124 }
5125 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5126 *primaries++ = primary1;
5127 if(primary2 != UCOL_IGNORABLE) {
5128 *primaries++ = primary2; /* second part */
5129 }
5130 }
5131 }
5132
5133 if(secondary > compareSec) {
5134 if(!isFrenchSec) {
5135 /* This is compression code. */
5136 if (secondary == UCOL_COMMON2 && notIsContinuation) {
5137 ++count2;
5138 } else {
5139 if (count2 > 0) {
5140 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5141 while (count2 > UCOL_TOP_COUNT2) {
5142 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5143 count2 -= (uint32_t)UCOL_TOP_COUNT2;
5144 }
5145 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
5146 } else {
5147 while (count2 > UCOL_BOT_COUNT2) {
5148 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5149 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5150 }
5151 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5152 }
5153 count2 = 0;
5154 }
5155 *secondaries++ = secondary;
5156 }
5157 } else {
5158 *secondaries++ = secondary;
5159 /* Do the special handling for French secondaries */
5160 /* We need to get continuation elements and do intermediate restore */
5161 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
5162 if(notIsContinuation) {
5163 if (frenchStartPtr != NULL) {
5164 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
5165 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
5166 frenchStartPtr = NULL;
5167 }
5168 } else {
5169 if (frenchStartPtr == NULL) {
5170 frenchStartPtr = secondaries - 2;
5171 }
5172 frenchEndPtr = secondaries-1;
5173 }
5174 }
5175 }
5176
5177 if(doCase) {
5178 doCaseShift(&cases, caseShift);
5179 if(notIsContinuation) {
5180 caseBits = (uint8_t)(tertiary & 0xC0);
5181
5182 if(tertiary != 0) {
5183 if(coll->caseFirst == UCOL_UPPER_FIRST) {
5184 if((caseBits & 0xC0) == 0) {
5185 *(cases-1) |= 1 << (--caseShift);
5186 } else {
5187 *(cases-1) |= 0 << (--caseShift);
5188 /* second bit */
5189 doCaseShift(&cases, caseShift);
5190 *(cases-1) |= ((caseBits>>6)&1) << (--caseShift);
5191 }
5192 } else {
5193 if((caseBits & 0xC0) == 0) {
5194 *(cases-1) |= 0 << (--caseShift);
5195 } else {
5196 *(cases-1) |= 1 << (--caseShift);
5197 /* second bit */
5198 doCaseShift(&cases, caseShift);
5199 *(cases-1) |= ((caseBits>>7)&1) << (--caseShift);
5200 }
5201 }
5202 }
5203
5204 }
5205 } else {
5206 if(notIsContinuation) {
5207 tertiary ^= caseSwitch;
5208 }
5209 }
5210
5211 tertiary &= tertiaryMask;
5212 if(tertiary > compareTer) {
5213 /* This is compression code. */
5214 /* sequence size check is included in the if clause */
5215 if (tertiary == tertiaryCommon && notIsContinuation) {
5216 ++count3;
5217 } else {
5218 if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
5219 || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
5220 tertiary += tertiaryAddition;
5221 }
5222 if (count3 > 0) {
5223 if ((tertiary > tertiaryCommon)) {
5224 while (count3 > coll->tertiaryTopCount) {
5225 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5226 count3 -= (uint32_t)coll->tertiaryTopCount;
5227 }
5228 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
5229 } else {
5230 while (count3 > coll->tertiaryBottomCount) {
5231 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5232 count3 -= (uint32_t)coll->tertiaryBottomCount;
5233 }
5234 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5235 }
5236 count3 = 0;
5237 }
5238 *tertiaries++ = tertiary;
5239 }
5240 }
5241
5242 if(/*qShifted*/(compareQuad==0) && notIsContinuation) {
5243 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
5244 if(count4>0) { // Close this part
5245 while (count4 > UCOL_BOT_COUNT4) {
5246 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
5247 count4 -= UCOL_BOT_COUNT4;
5248 }
5249 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
5250 count4 = 0;
5251 }
5252 *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana
5253 } else { // This wasn't Hiragana, so we can continue adding stuff
5254 count4++;
5255 }
5256 }
5257 }
5258
5259 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5260 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5261 IInit_collIterate(coll, (UChar *)source, len, &s);
5262 if(source == normSource) {
5263 s.flags &= ~UCOL_ITER_NORM;
5264 }
5265 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
5266 *status = U_BUFFER_OVERFLOW_ERROR;
5267 finished = TRUE;
5268 break;
5269 } else { /* It's much nicer if we can actually reallocate */
5270 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart);
5271 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
5272 if(U_SUCCESS(*status)) {
5273 *result = primStart;
5274 primarySafeEnd = primStart + resultLength - 2;
5275 } else {
5276 IInit_collIterate(coll, (UChar *)source, len, &s);
5277 if(source == normSource) {
5278 s.flags &= ~UCOL_ITER_NORM;
5279 }
5280 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
5281 finished = TRUE;
5282 break;
5283 }
5284 }
5285 }
5286 }
5287 if(finished) {
5288 break;
5289 } else {
5290 prevBuffSize = minBufferSize;
5291 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5292 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5293 caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status);
5294 quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status);
5295 minBufferSize *= 2;
5296 if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5297 IInit_collIterate(coll, (UChar *)source, len, &s);
5298 if(source == normSource) {
5299 s.flags &= ~UCOL_ITER_NORM;
5300 }
5301 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
5302 break;
5303 }
5304 }
5305 }
5306
5307 /* Here, we are generally done with processing */
5308 /* bailing out would not be too productive */
5309
5310 if(U_SUCCESS(*status)) {
5311 sortKeySize += (primaries - primStart);
5312 /* we have done all the CE's, now let's put them together to form a key */
5313 if(compareSec == 0) {
5314 if (count2 > 0) {
5315 while (count2 > UCOL_BOT_COUNT2) {
5316 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5317 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5318 }
5319 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5320 }
5321 uint32_t secsize = secondaries-secStart;
5322 if(!isFrenchSec) { // Regular situation, we know the length of secondaries
5323 sortKeySize += secsize;
5324 if(sortKeySize <= resultLength) {
5325 *(primaries++) = UCOL_LEVELTERMINATOR;
5326 uprv_memcpy(primaries, secStart, secsize);
5327 primaries += secsize;
5328 } else {
5329 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
5330 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5331 if(U_SUCCESS(*status)) {
5332 *result = primStart;
5333 *(primaries++) = UCOL_LEVELTERMINATOR;
5334 uprv_memcpy(primaries, secStart, secsize);
5335 primaries += secsize;
5336 }
5337 } else {
5338 *status = U_BUFFER_OVERFLOW_ERROR;
5339 }
5340 }
5341 } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
5342 uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
5343 sortKeySize += secsize;
5344 if(sortKeySize <= resultLength) { // if we managed to pack fine
5345 primaries = newPrim; // update the primary pointer
5346 } else { // overflow, need to reallocate and redo
5347 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
5348 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5349 if(U_SUCCESS(*status)) {
5350 primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
5351 }
5352 } else {
5353 *status = U_BUFFER_OVERFLOW_ERROR;
5354 }
5355 }
5356 }
5357 }
5358
5359 if(doCase) {
5360 uint32_t casesize = cases - caseStart;
5361 sortKeySize += casesize;
5362 if(sortKeySize <= resultLength) {
5363 *(primaries++) = UCOL_LEVELTERMINATOR;
5364 uprv_memcpy(primaries, caseStart, casesize);
5365 primaries += casesize;
5366 } else {
5367 if(allocateSKBuffer == TRUE) {
5368 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5369 if(U_SUCCESS(*status)) {
5370 *result = primStart;
5371 *(primaries++) = UCOL_LEVELTERMINATOR;
5372 uprv_memcpy(primaries, caseStart, casesize);
5373 }
5374 } else {
5375 *status = U_BUFFER_OVERFLOW_ERROR;
5376 }
5377 }
5378 }
5379
5380 if(compareTer == 0) {
5381 if (count3 > 0) {
5382 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
5383 while (count3 >= coll->tertiaryTopCount) {
5384 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5385 count3 -= (uint32_t)coll->tertiaryTopCount;
5386 }
5387 *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5388 } else {
5389 while (count3 > coll->tertiaryBottomCount) {
5390 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5391 count3 -= (uint32_t)coll->tertiaryBottomCount;
5392 }
5393 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5394 }
5395 }
5396 uint32_t tersize = tertiaries - terStart;
5397 sortKeySize += tersize;
5398 if(sortKeySize <= resultLength) {
5399 *(primaries++) = UCOL_LEVELTERMINATOR;
5400 uprv_memcpy(primaries, terStart, tersize);
5401 primaries += tersize;
5402 } else {
5403 if(allocateSKBuffer == TRUE) {
5404 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5405 if(U_SUCCESS(*status)) {
5406 *result = primStart;
5407 *(primaries++) = UCOL_LEVELTERMINATOR;
5408 uprv_memcpy(primaries, terStart, tersize);
5409 }
5410 } else {
5411 *status = U_BUFFER_OVERFLOW_ERROR;
5412 }
5413 }
5414
5415 if(compareQuad == 0/*qShifted == TRUE*/) {
5416 if(count4 > 0) {
5417 while (count4 > UCOL_BOT_COUNT4) {
5418 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
5419 count4 -= UCOL_BOT_COUNT4;
5420 }
5421 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
5422 }
5423 uint32_t quadsize = quads - quadStart;
5424 sortKeySize += quadsize;
5425 if(sortKeySize <= resultLength) {
5426 *(primaries++) = UCOL_LEVELTERMINATOR;
5427 uprv_memcpy(primaries, quadStart, quadsize);
5428 primaries += quadsize;
5429 } else {
5430 if(allocateSKBuffer == TRUE) {
5431 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5432 if(U_SUCCESS(*status)) {
5433 *result = primStart;
5434 *(primaries++) = UCOL_LEVELTERMINATOR;
5435 uprv_memcpy(primaries, quadStart, quadsize);
5436 }
5437 } else {
5438 *status = U_BUFFER_OVERFLOW_ERROR;
5439 }
5440 }
5441 }
5442
5443 if(compareIdent) {
5444 sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len);
5445 if(sortKeySize <= resultLength) {
5446 *(primaries++) = UCOL_LEVELTERMINATOR;
5447 primaries += u_writeIdenticalLevelRun(s.string, len, primaries);
5448 } else {
5449 if(allocateSKBuffer == TRUE) {
5450 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status);
5451 if(U_SUCCESS(*status)) {
5452 *result = primStart;
5453 *(primaries++) = UCOL_LEVELTERMINATOR;
374ca955 5454 u_writeIdenticalLevelRun(s.string, len, primaries);
b75a7d8f
A
5455 }
5456 } else {
5457 *status = U_BUFFER_OVERFLOW_ERROR;
5458 }
5459 }
5460 }
5461 }
5462 *(primaries++) = '\0';
5463 }
5464
5465 if(terStart != tert) {
5466 uprv_free(terStart);
5467 uprv_free(secStart);
5468 uprv_free(caseStart);
5469 uprv_free(quadStart);
5470 }
5471
5472 if(normSource != normBuffer) {
5473 uprv_free(normSource);
5474 }
5475
5476 if(allocateSKBuffer == TRUE) {
5477 *result = (uint8_t*)uprv_malloc(sortKeySize);
374ca955
A
5478 /* test for NULL */
5479 if (*result == NULL) {
5480 *status = U_MEMORY_ALLOCATION_ERROR;
5481 return sortKeySize;
5482 }
b75a7d8f
A
5483 uprv_memcpy(*result, primStart, sortKeySize);
5484 if(primStart != prim) {
5485 uprv_free(primStart);
5486 }
5487 }
5488
5489 return sortKeySize;
5490}
5491
5492
5493U_CFUNC int32_t U_CALLCONV
5494ucol_calcSortKeySimpleTertiary(const UCollator *coll,
5495 const UChar *source,
5496 int32_t sourceLength,
5497 uint8_t **result,
5498 uint32_t resultLength,
5499 UBool allocateSKBuffer,
5500 UErrorCode *status)
5501{
5502 U_ALIGN_CODE(16);
374ca955
A
5503
5504 const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
b75a7d8f
A
5505 uint32_t i = 0; /* general purpose counter */
5506
5507 /* Stack allocated buffers for buffers we use */
5508 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
5509
5510 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert;
5511
5512 if(U_FAILURE(*status)) {
5513 return 0;
5514 }
5515
5516 if(primaries == NULL && allocateSKBuffer == TRUE) {
5517 primaries = *result = prim;
5518 resultLength = UCOL_PRIMARY_MAX_BUFFER;
5519 }
5520
5521 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER;
5522
5523 uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */
5524
5525 UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
5526 UChar *normSource = normBuffer;
5527 int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
5528
5529 int32_t len = sourceLength;
5530
5531 /* If we need to normalize, we'll do it all at once at the beginning! */
5532 if(coll->normalizationMode != UCOL_OFF && UNORM_YES != unorm_quickCheck(source, len, UNORM_FCD, status)) {
5533 len = unorm_internalNormalize(normSource, normSourceLen,
5534 source, len,
5535 UNORM_FCD, FALSE,
5536 status);
5537 if(*status == U_BUFFER_OVERFLOW_ERROR) {
5538 normSourceLen = len;
5539 normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
5540 if(normSource == NULL) {
5541 *status = U_MEMORY_ALLOCATION_ERROR;
5542 return 0;
5543 }
5544 *status = U_ZERO_ERROR;
5545 len = unorm_internalNormalize(normSource, normSourceLen,
5546 source, len,
5547 UNORM_FCD, FALSE,
5548 status);
5549 }
5550
5551 if(U_FAILURE(*status)) {
5552 return 0;
5553 }
5554 source = normSource;
5555 }
5556
5557 collIterate s;
5558 IInit_collIterate(coll, (UChar *)source, len, &s);
5559 if(source == normSource) {
5560 s.flags &= ~UCOL_ITER_NORM;
5561 }
5562
5563 if(resultLength == 0 || primaries == NULL) {
5564 int32_t t = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5565 if(normSource != normBuffer) {
5566 uprv_free(normSource);
5567 }
5568 return t;
5569 }
5570
5571 uint8_t *primarySafeEnd = primaries + resultLength - 2;
5572
5573 uint32_t minBufferSize = UCOL_MAX_BUFFER;
5574
5575 uint8_t *primStart = primaries;
5576 uint8_t *secStart = secondaries;
5577 uint8_t *terStart = tertiaries;
5578
5579 uint32_t order = 0;
5580
5581 uint8_t primary1 = 0;
5582 uint8_t primary2 = 0;
5583 uint8_t secondary = 0;
5584 uint8_t tertiary = 0;
5585 uint8_t caseSwitch = coll->caseSwitch;
5586 uint8_t tertiaryMask = coll->tertiaryMask;
5587 int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition;
5588 uint8_t tertiaryTop = coll->tertiaryTop;
5589 uint8_t tertiaryBottom = coll->tertiaryBottom;
5590 uint8_t tertiaryCommon = coll->tertiaryCommon;
5591
5592 uint32_t prevBuffSize = 0;
5593
5594 UBool finished = FALSE;
5595 UBool notIsContinuation = FALSE;
5596
5597 uint32_t count2 = 0, count3 = 0;
5598 uint8_t leadPrimary = 0;
5599
5600 for(;;) {
5601 for(i=prevBuffSize; i<minBufferSize; ++i) {
5602
5603 order = ucol_IGetNextCE(coll, &s, status);
5604
5605 if(order == 0) {
5606 continue;
5607 }
5608
5609 if(order == UCOL_NO_MORE_CES) {
5610 finished = TRUE;
5611 break;
5612 }
5613
5614 notIsContinuation = !isContinuation(order);
5615
5616 if(notIsContinuation) {
5617 tertiary = (uint8_t)((order & tertiaryMask));
5618 } else {
5619 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5620 }
5621 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5622 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5623 primary1 = (uint8_t)(order >> 8);
5624
5625 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5626 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
5627 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
5628 /* regular and simple sortkey calc */
5629 if(primary1 != UCOL_IGNORABLE) {
5630 if(notIsContinuation) {
5631 if(leadPrimary == primary1) {
5632 *primaries++ = primary2;
5633 } else {
5634 if(leadPrimary != 0) {
5635 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5636 }
5637 if(primary2 == UCOL_IGNORABLE) {
5638 /* one byter, not compressed */
5639 *primaries++ = primary1;
5640 leadPrimary = 0;
5641 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
374ca955 5642 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24)))
b75a7d8f
A
5643 (primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
5644 /* not compressible */
5645 leadPrimary = 0;
5646 *primaries++ = primary1;
5647 *primaries++ = primary2;
5648 } else { /* compress */
5649 *primaries++ = leadPrimary = primary1;
5650 *primaries++ = primary2;
5651 }
5652 }
5653 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5654 *primaries++ = primary1;
5655 if(primary2 != UCOL_IGNORABLE) {
5656 *primaries++ = primary2; /* second part */
5657 }
5658 }
5659 }
5660
5661 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5662 /* This is compression code. */
5663 if (secondary == UCOL_COMMON2 && notIsContinuation) {
5664 ++count2;
5665 } else {
5666 if (count2 > 0) {
5667 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5668 while (count2 > UCOL_TOP_COUNT2) {
5669 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5670 count2 -= (uint32_t)UCOL_TOP_COUNT2;
5671 }
5672 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
5673 } else {
5674 while (count2 > UCOL_BOT_COUNT2) {
5675 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5676 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5677 }
5678 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5679 }
5680 count2 = 0;
5681 }
5682 *secondaries++ = secondary;
5683 }
5684 }
5685
5686 if(notIsContinuation) {
5687 tertiary ^= caseSwitch;
5688 }
5689
5690 if(tertiary > 0) {
5691 /* This is compression code. */
5692 /* sequence size check is included in the if clause */
5693 if (tertiary == tertiaryCommon && notIsContinuation) {
5694 ++count3;
5695 } else {
5696 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5697 tertiary += tertiaryAddition;
5698 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5699 tertiary -= tertiaryAddition;
5700 }
5701 if (count3 > 0) {
5702 if ((tertiary > tertiaryCommon)) {
5703 while (count3 > coll->tertiaryTopCount) {
5704 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5705 count3 -= (uint32_t)coll->tertiaryTopCount;
5706 }
5707 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
5708 } else {
5709 while (count3 > coll->tertiaryBottomCount) {
5710 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5711 count3 -= (uint32_t)coll->tertiaryBottomCount;
5712 }
5713 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5714 }
5715 count3 = 0;
5716 }
5717 *tertiaries++ = tertiary;
5718 }
5719 }
5720
5721 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5722 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5723 IInit_collIterate(coll, (UChar *)source, len, &s);
5724 if(source == normSource) {
5725 s.flags &= ~UCOL_ITER_NORM;
5726 }
5727 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5728 *status = U_BUFFER_OVERFLOW_ERROR;
5729 finished = TRUE;
5730 break;
5731 } else { /* It's much nicer if we can actually reallocate */
5732 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart);
5733 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
5734 if(U_SUCCESS(*status)) {
5735 *result = primStart;
5736 primarySafeEnd = primStart + resultLength - 2;
5737 } else {
5738 IInit_collIterate(coll, (UChar *)source, len, &s);
5739 if(source == normSource) {
5740 s.flags &= ~UCOL_ITER_NORM;
5741 }
5742 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5743 finished = TRUE;
5744 break;
5745 }
5746 }
5747 }
5748 }
5749 if(finished) {
5750 break;
5751 } else {
5752 prevBuffSize = minBufferSize;
5753 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5754 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5755 minBufferSize *= 2;
5756 if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5757 IInit_collIterate(coll, (UChar *)source, len, &s);
5758 if(source == normSource) {
5759 s.flags &= ~UCOL_ITER_NORM;
5760 }
5761 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5762 break;
5763 }
5764 }
5765 }
5766
5767 if(U_SUCCESS(*status)) {
5768 sortKeySize += (primaries - primStart);
5769 /* we have done all the CE's, now let's put them together to form a key */
5770 if (count2 > 0) {
5771 while (count2 > UCOL_BOT_COUNT2) {
5772 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5773 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5774 }
5775 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5776 }
5777 uint32_t secsize = secondaries-secStart;
5778 sortKeySize += secsize;
5779 if(sortKeySize <= resultLength) {
5780 *(primaries++) = UCOL_LEVELTERMINATOR;
5781 uprv_memcpy(primaries, secStart, secsize);
5782 primaries += secsize;
5783 } else {
5784 if(allocateSKBuffer == TRUE) {
5785 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5786 if(U_SUCCESS(*status)) {
5787 *(primaries++) = UCOL_LEVELTERMINATOR;
5788 *result = primStart;
5789 uprv_memcpy(primaries, secStart, secsize);
5790 }
5791 } else {
5792 *status = U_BUFFER_OVERFLOW_ERROR;
5793 }
5794 }
5795
5796 if (count3 > 0) {
5797 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5798 while (count3 >= coll->tertiaryTopCount) {
5799 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5800 count3 -= (uint32_t)coll->tertiaryTopCount;
5801 }
5802 *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5803 } else {
5804 while (count3 > coll->tertiaryBottomCount) {
5805 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5806 count3 -= (uint32_t)coll->tertiaryBottomCount;
5807 }
5808 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5809 }
5810 }
5811 uint32_t tersize = tertiaries - terStart;
5812 sortKeySize += tersize;
5813 if(sortKeySize <= resultLength) {
5814 *(primaries++) = UCOL_LEVELTERMINATOR;
5815 uprv_memcpy(primaries, terStart, tersize);
5816 primaries += tersize;
5817 } else {
5818 if(allocateSKBuffer == TRUE) {
5819 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5820 if(U_SUCCESS(*status)) {
5821 *result = primStart;
5822 *(primaries++) = UCOL_LEVELTERMINATOR;
5823 uprv_memcpy(primaries, terStart, tersize);
5824 }
5825 } else {
5826 *status = U_MEMORY_ALLOCATION_ERROR;
5827 }
5828 }
5829
5830 *(primaries++) = '\0';
5831 }
5832
5833 if(terStart != tert) {
5834 uprv_free(terStart);
5835 uprv_free(secStart);
5836 }
5837
5838 if(normSource != normBuffer) {
5839 uprv_free(normSource);
5840 }
5841
5842 if(allocateSKBuffer == TRUE) {
5843 *result = (uint8_t*)uprv_malloc(sortKeySize);
374ca955
A
5844 /* test for NULL */
5845 if (*result == NULL) {
5846 *status = U_MEMORY_ALLOCATION_ERROR;
5847 return sortKeySize;
5848 }
b75a7d8f
A
5849 uprv_memcpy(*result, primStart, sortKeySize);
5850 if(primStart != prim) {
5851 uprv_free(primStart);
5852 }
5853 }
5854
5855 return sortKeySize;
5856}
5857
5858static inline
5859UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5860 UBool notIsContinuation = !isContinuation(CE);
374ca955 5861 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
b75a7d8f
A
5862 if(LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5863 || (!notIsContinuation && *wasShifted))
5864 || (*wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
5865 // The stuff below should probably be in the sortkey code... maybe not...
5866 if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
5867 /* we should just completely ignore it */
5868 *wasShifted = TRUE;
5869 //continue;
5870 }
5871 //*wasShifted = TRUE;
5872 return TRUE;
5873 } else {
5874 *wasShifted = FALSE;
5875 return FALSE;
5876 }
5877}
5878static inline
5879void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5880 if(level < maxLevel) {
5881 dest[i++] = UCOL_LEVELTERMINATOR;
5882 } else {
5883 dest[i++] = 0;
5884 }
5885}
5886
/** Level identifiers for partial sort key generation. */
enum {
    UCOL_PSK_PRIMARY    = 0,
    UCOL_PSK_SECONDARY  = 1,
    UCOL_PSK_CASE       = 2,
    UCOL_PSK_TERTIARY   = 3,
    UCOL_PSK_QUATERNARY = 4,
    /* Extra level, not used - but we have three bits to blow. */
    UCOL_PSK_QUIN       = 5,
    UCOL_PSK_IDENTICAL  = 6,
    /* Level for the end of the sort key. Will just produce zeros. */
    UCOL_PSK_NULL       = 7,
    /* One past the last level identifier. */
    UCOL_PSK_LIMIT      = 8
};
5899
374ca955
A
/**
 * Layout of the collation state word kept in state[1].
 * To read a field, shift the state right by its *_SHIFT value and then
 * AND the result with the matching *_MASK value.
 */
enum {
    /* Level identifier: holds one of the UCOL_PSK_* level values. Three bits. */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK = 0x7,

    /*
     * Number of bytes of a primary or quaternary already written. Can be only
     * 0 or 1, since we get up to two bytes from primary or quaternary.
     * This field is also used to denote that the French secondary level is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 0x1,

    /* Was the last value shifted? 0 or 1 (Boolean). */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK = 0x1,

    /*
     * How many French bytes have we already written (two-bit field).
     * When we do French we need to reverse secondary values. However, continuations
     * need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba.
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK = 0x3,

    /* Used elements counter (ten-bit field). */
    UCOL_PSK_USED_ELEMENTS_SHIFT = 7,
    UCOL_PSK_USED_ELEMENTS_MASK = 0x3FF,

    /* Iterator skip counter (fifteen-bit field). */
    UCOL_PSK_ITER_SKIP_SHIFT = 17,
    UCOL_PSK_ITER_SKIP_MASK = 0x7FFF
};
5925
5926
374ca955
A
5927/** main sortkey part procedure. On the first call,
5928 * you should pass in a collator, an iterator, empty state
b75a7d8f
A
5929 * state[0] == state[1] == 0, a buffer to hold results
5930 * number of bytes you need and an error code pointer.
5931 * Make sure your buffer is big enough to hold the wanted
374ca955
A
5932 * number of sortkey bytes. I don't check.
5933 * The only meaningful status you can get back is
5934 * U_BUFFER_OVERFLOW_ERROR, which basically means that you
b75a7d8f
A
5935 * have been dealt a raw deal and that you probably won't
5936 * be able to use partial sortkey generation for this
5937 * particular combination of string and collator. This
5938 * is highly unlikely, but you should still check the error code.
374ca955
A
5939 * Any other status means that you're not in a sane situation
5940 * anymore. After the first call, preserve state values and
b75a7d8f
A
5941 * use them on subsequent calls to obtain more bytes of a sortkey.
5942 * Use until the number of bytes written is smaller than the requested
5943 * number of bytes. Generated sortkey is not compatible with the
5944 * one generated by ucol_getSortKey, as we don't do any compression.
5945 * However, levels are still terminated by a 1 (one) and the sortkey
5946 * is terminated by a 0 (zero). Identical level is the same as in the
374ca955
A
5947 * regular sortkey - internal bocu-1 implementation is used.
5948 * For curious, although you cannot do much about this, here is
b75a7d8f
A
5949 * the structure of state words.
5950 * state[0] - iterator state. Depends on the iterator implementation,
5951 * but allows the iterator to continue where it stopped in
5952 * the last iteration.
5953 * state[1] - collation processing state. Here is the distribution
5954 * of the bits:
5955 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5956 * quaternary, quin (we don't use this one), identical and
5957 * null (producing only zeroes - first one to terminate the
5958 * sortkey and subsequent to fill the buffer).
5959 * 3 - byte count. Number of bytes written on the primary level.
5960 * 4 - was shifted. Whether the previous iteration finished in the
5961 * shifted state.
5962 * 5, 6 - French continuation bytes written. See the comment in the enum
374ca955
A
5963 * 7..16 - Used elements. Number of CEs that were already used from the
5964 * expansion buffer or number of bytes from a bocu sequence on
b75a7d8f 5965 * the identical level.
374ca955 5966 * 17..31 - iterator skip. Number of move operations iterator needs to
b75a7d8f
A
5967 * skip from the current state in order to continue. This is used
5968 * only if normalization is turned on, since the normalizing iterator
374ca955 5969 * can return undefined state, which means that it's in the middle
b75a7d8f
A
5970 * of normalizing sequence.
5971 */
374ca955 5972U_CAPI int32_t U_EXPORT2
b75a7d8f
A
5973ucol_nextSortKeyPart(const UCollator *coll,
5974 UCharIterator *iter,
5975 uint32_t state[2],
5976 uint8_t *dest, int32_t count,
5977 UErrorCode *status) {
5978 /* error checking */
5979 if(status==NULL || U_FAILURE(*status)) {
5980 return 0;
5981 }
374ca955 5982 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
b75a7d8f
A
5983 if( coll==NULL || iter==NULL ||
5984 state==NULL ||
5985 count<0 || (count>0 && dest==NULL)
5986 ) {
5987 *status=U_ILLEGAL_ARGUMENT_ERROR;
5988 }
5989
374ca955
A
5990 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5991 coll, iter, state[0], state[1], dest, count);
b75a7d8f
A
5992
5993 if(count==0) {
5994 /* nothing to do */
374ca955 5995 UTRACE_EXIT_VALUE(0);
b75a7d8f
A
5996 return 0;
5997 }
5998
5999 /** Setting up situation according to the state we got from the previous iteration */
6000 // The state of the iterator from the previous invocation
6001 uint32_t iterState = state[0];
6002 // Has the last iteration ended in the shifted state
6003 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
6004 // What is the current level of the sortkey?
6005 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
6006 // Have we written only one byte from a two byte primary in the previous iteration?
6007 // Also on secondary level - have we finished with the French secondary?
374ca955 6008 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
b75a7d8f
A
6009 // number of bytes in the continuation buffer for French
6010 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
6011 // Skip the CEs that we got from an extraction
6012 // and delivered in the previous call
6013 int32_t usedElements = (state[1] >> UCOL_PSK_USED_ELEMENTS_SHIFT) & UCOL_PSK_USED_ELEMENTS_MASK;
6014 // Number of times to skip because the iterator returned
374ca955 6015 // UITER_NO_STATE when it was stopped in the last iteration, so we had to save the
b75a7d8f
A
6016 // last valid state.
6017 int32_t iterSkips = (state[1] >> UCOL_PSK_ITER_SKIP_SHIFT) & UCOL_PSK_ITER_SKIP_MASK;
6018
6019 /** values that depend on the collator attributes */
374ca955 6020 // strength of the collator.
b75a7d8f
A
6021 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
6022 // maximal level of the partial sortkey. Need to take whether case level is done
6023 int32_t maxLevel = 0;
6024 if(strength < UCOL_TERTIARY) {
6025 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
6026 maxLevel = UCOL_PSK_CASE;
6027 } else {
6028 maxLevel = strength;
6029 }
6030 } else {
6031 if(strength == UCOL_TERTIARY) {
6032 maxLevel = UCOL_PSK_TERTIARY;
6033 } else if(strength == UCOL_QUATERNARY) {
6034 maxLevel = UCOL_PSK_QUATERNARY;
6035 } else { // identical
6036 maxLevel = UCOL_IDENTICAL;
6037 }
6038 }
6039 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
374ca955 6040 uint8_t UCOL_HIRAGANA_QUAD =
b75a7d8f
A
6041 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
6042 // Boundary value that decides whether a CE is shifted or not
6043 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
6044 // Are we doing French collation?
6045 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
6046
6047 /** initializing the collation state */
6048 UBool notIsContinuation = FALSE;
6049 uint32_t CE = UCOL_NO_MORE_CES;
6050
6051 collIterate s;
6052 IInit_collIterate(coll, NULL, -1, &s);
6053 s.iterator = iter;
6054 s.flags |= UCOL_USE_ITERATOR;
6055 // This variable tells us whether we have produced some other levels in this iteration
374ca955 6056 // before we moved to the identical level. In that case, we need to switch the
b75a7d8f
A
6057 // type of the iterator.
6058 UBool doingIdenticalFromStart = FALSE;
6059 // Normalizing iterator
6060 // The division for the array length may truncate the array size to
6061 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
6062 // for all platforms anyway.
6063 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6064 UNormIterator *normIter = NULL;
6065 // If the normalization is turned on for the collator and we are below identical level
6066 // we will use a FCD normalizing iterator
6067 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
6068 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6069 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
6070 s.flags &= ~UCOL_ITER_NORM;
6071 if(U_FAILURE(*status)) {
374ca955 6072 UTRACE_EXIT_STATUS(*status);
b75a7d8f
A
6073 return 0;
6074 }
6075 } else if(level == UCOL_PSK_IDENTICAL) {
374ca955 6076 // for identical level, we need a NFD iterator. We need to instantiate it here, since we
b75a7d8f
A
6077 // will be updating the state - and this cannot be done on an ordinary iterator.
6078 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6079 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6080 s.flags &= ~UCOL_ITER_NORM;
6081 if(U_FAILURE(*status)) {
374ca955 6082 UTRACE_EXIT_STATUS(*status);
b75a7d8f
A
6083 return 0;
6084 }
6085 doingIdenticalFromStart = TRUE;
6086 }
6087
6088 // This is the tentative new state of the iterator. The problem
6089 // is that the iterator might return an undefined state, in
6090 // which case we should save the last valid state and increase
6091 // the iterator skip value.
6092 uint32_t newState = 0;
6093
6094 // First, we set the iterator to the last valid position
6095 // from the last iteration. This was saved in state[0].
6096 if(iterState == 0) {
6097 /* initial state */
6098 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
6099 s.iterator->move(s.iterator, 0, UITER_LIMIT);
6100 } else {
6101 s.iterator->move(s.iterator, 0, UITER_START);
6102 }
6103 } else {
6104 /* reset to previous state */
6105 s.iterator->setState(s.iterator, iterState, status);
6106 if(U_FAILURE(*status)) {
374ca955 6107 UTRACE_EXIT_STATUS(*status);
b75a7d8f
A
6108 return 0;
6109 }
6110 }
6111
6112 // Then, we may have to move more, if the normalizing iterator
6113 // was going through a normalizing sequence.
6114 if(iterSkips) {
6115 // if we are on secondary level AND we do French, we need to go backward instead of forward
6116 if(level == UCOL_PSK_SECONDARY && doingFrench) {
6117 s.iterator->move(s.iterator, -iterSkips, UITER_CURRENT);
6118 } else {
6119 s.iterator->move(s.iterator, iterSkips, UITER_CURRENT);
6120 }
6121 }
6122
6123
6124 // Number of expansion CEs that were already consumed in the
6125 // previous iteration for the last code point processed. We
374ca955 6126 // want to clean out the expansion buffer, so that we can
b75a7d8f
A
6127 // get correct CEs. This value is persistent over iterations,
6128 // since we can have several iterations on the one expansion
6129 // buffer.
6130 int32_t consumedExpansionCEs = usedElements;
6131 // Number of bytes already writted from a bocsu sequence. Since
6132 // the longes bocsu sequence is 4 long, this can be up to 3. It
6133 // shares the state field with consumedExpansionCEs value, since
6134 // they cannot simultanously appear on the same level
6135 int32_t bocsuBytesUsed = 0;
374ca955 6136 // Clean out the expansion buffer unless we are on
b75a7d8f
A
6137 // identical level. In that case we use this field
6138 // to store the number of bytes already written
6139 // from the previous bocsu sequence.
6140 if(level < UCOL_PSK_IDENTICAL && usedElements != 0) {
6141 while(usedElements-->0) {
374ca955 6142 // If we're doing French and we are on the secondary level,
b75a7d8f
A
6143 // we go backwards.
6144 if(level == UCOL_PSK_SECONDARY && doingFrench) {
6145 CE = ucol_IGetPrevCE(coll, &s, status);
6146 } else {
6147 CE = ucol_IGetNextCE(coll, &s, status);
6148 }
6149 if(CE==UCOL_NO_MORE_CES) {
6150 /* should not happen */
6151 *status=U_INTERNAL_PROGRAM_ERROR;
374ca955 6152 UTRACE_EXIT_STATUS(*status);
b75a7d8f
A
6153 return 0;
6154 }
6155 }
6156 } else {
6157 bocsuBytesUsed = usedElements;
6158 }
6159
6160 // This variable prevents the adjusting of iterator
374ca955 6161 // skip variable when we are the first time on a
b75a7d8f
A
6162 // level. I hope there is a better way to do it, but
6163 // I could not think of it.
6164 UBool firstTimeOnLevel = TRUE;
6165 // French secondary needs to know whether the iterator state of zero came from previous level OR
6166 // from a new invocation...
6167 UBool wasDoingPrimary = FALSE;
374ca955 6168 // Case level is kind of goofy. This variable tells us that
b75a7d8f 6169 // we are still not done with the case level.
374ca955 6170 UBool dontAdvanceIteratorBecauseWeNeedALevelTerminator = FALSE;
b75a7d8f
A
6171 // destination buffer byte counter. When this guy
6172 // gets to count, we're done with the iteration
374ca955
A
6173 int32_t i = 0;
6174 // used to count the zero bytes written after we
b75a7d8f
A
6175 // have finished with the sort key
6176 int32_t j = 0;
6177
6178
6179 // Hm.... I think we're ready to plunge in. Basic story is as following:
6180 // we have a fall through case based on level. This is used for initial
6181 // positioning on iteration start. Every level processor contains a
6182 // for(;;) which will be broken when we exhaust all the CEs. Other
6183 // way to exit is a goto saveState, which happens when we have filled
6184 // out our buffer.
6185 switch(level) {
374ca955 6186 case UCOL_PSK_PRIMARY:
b75a7d8f
A
6187 wasDoingPrimary = TRUE;
6188 for(;;) {
6189 if(i==count) {
6190 goto saveState;
6191 }
6192 // We should save the state only if we
6193 // are sure that we are done with the
6194 // previous iterator state
6195 if(consumedExpansionCEs == 0 && byteCountOrFrenchDone == 0) {
6196 newState = s.iterator->getState(s.iterator);
6197 if(newState != UITER_NO_STATE) {
6198 iterState = newState;
6199 iterSkips = 0;
6200 } else {
6201 if(!firstTimeOnLevel && !byteCountOrFrenchDone) {
6202 iterSkips++;
6203 }
6204 }
6205 }
6206 firstTimeOnLevel = FALSE;
6207 CE = ucol_IGetNextCE(coll, &s, status);
6208 if(CE==UCOL_NO_MORE_CES) {
6209 // Add the level separator
6210 terminatePSKLevel(level, maxLevel, i, dest);
6211 byteCountOrFrenchDone=0;
6212 // Restart the iteration an move to the
6213 // second level
6214 s.iterator->move(s.iterator, 0, UITER_START);
6215 level = UCOL_PSK_SECONDARY;
6216 break;
6217 }
6218 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6219 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
6220 if(CE != 0) {
6221 if(byteCountOrFrenchDone == 0) {
6222 // get the second byte of primary
6223 dest[i++]=(uint8_t)(CE >> 8);
6224 } else {
6225 byteCountOrFrenchDone = 0;
6226 }
6227 if((CE &=0xff)!=0) {
6228 if(i==count) {
6229 /* overflow */
6230 byteCountOrFrenchDone=1;
6231 goto saveState;
6232 }
6233 dest[i++]=(uint8_t)CE;
6234 }
6235 }
6236 }
374ca955 6237 if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
b75a7d8f
A
6238 // s.pos != NULL means there is a normalization buffer in effect
6239 // in iterative case, this means that we are doing Thai (maybe discontiguos)
6240 consumedExpansionCEs++;
6241 } else {
6242 consumedExpansionCEs = 0;
6243 }
374ca955 6244 if(s.pos && *s.pos == 0) {
b75a7d8f
A
6245 // maybe it is the end of Thai - we have to have
6246 // an extra skip
6247 iterSkips++;
6248 }
6249 }
6250 /* fall through to next level */
6251 case UCOL_PSK_SECONDARY:
6252 if(strength >= UCOL_SECONDARY) {
6253 if(!doingFrench) {
6254 for(;;) {
6255 if(i == count) {
6256 goto saveState;
6257 }
6258 // We should save the state only if we
6259 // are sure that we are done with the
6260 // previous iterator state
6261 if(consumedExpansionCEs == 0) {
6262 newState = s.iterator->getState(s.iterator);
6263 if(newState != UITER_NO_STATE) {
6264 iterState = newState;
6265 iterSkips = 0;
6266 } else {
6267 if(!firstTimeOnLevel) {
6268 iterSkips++;
6269 }
6270 }
6271 }
6272 firstTimeOnLevel = FALSE;
6273 CE = ucol_IGetNextCE(coll, &s, status);
6274 if(CE==UCOL_NO_MORE_CES) {
6275 // Add the level separator
6276 terminatePSKLevel(level, maxLevel, i, dest);
6277 byteCountOrFrenchDone=0;
6278 // Restart the iteration an move to the
6279 // second level
374ca955 6280 s.iterator->move(s.iterator, 0, UITER_START);
b75a7d8f
A
6281 level = UCOL_PSK_CASE;
6282 break;
6283 }
6284 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6285 CE >>= 8; /* get secondary */
6286 if(CE != 0) {
6287 dest[i++]=(uint8_t)CE;
6288 }
6289 }
6290 if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
6291 consumedExpansionCEs++;
6292 } else {
6293 consumedExpansionCEs = 0;
6294 }
374ca955 6295 if(s.pos && *s.pos == 0) {
b75a7d8f
A
6296 iterSkips++;
6297 }
6298 }
6299 } else { // French secondary processing
6300 uint8_t frenchBuff[UCOL_MAX_BUFFER];
6301 int32_t frenchIndex = 0;
6302 // Here we are going backwards.
374ca955
A
6303 // If the iterator is at the beggining, it should be
6304 // moved to end.
b75a7d8f
A
6305 if(wasDoingPrimary) {
6306 s.iterator->move(s.iterator, 0, UITER_LIMIT);
6307 }
6308 for(;;) {
6309 if(i == count) {
6310 goto saveState;
6311 }
6312 if(consumedExpansionCEs == 0) {
6313 newState = s.iterator->getState(s.iterator);
6314 if(newState != UITER_NO_STATE) {
6315 iterState = newState;
6316 iterSkips = 0;
374ca955 6317 } else {
b75a7d8f
A
6318 if(!firstTimeOnLevel) {
6319 iterSkips++;
6320 }
6321 }
6322 }
6323 firstTimeOnLevel = FALSE;
6324 CE = ucol_IGetPrevCE(coll, &s, status);
6325 if(CE==UCOL_NO_MORE_CES) {
6326 // Add the level separator
6327 terminatePSKLevel(level, maxLevel, i, dest);
6328 byteCountOrFrenchDone=0;
6329 // Restart the iteration an move to the next level
6330 s.iterator->move(s.iterator, 0, UITER_START);
6331 level = UCOL_PSK_CASE;
6332 break;
6333 }
374ca955 6334 if(isContinuation(CE)) { // if it's a continuation, we want to save it and
b75a7d8f
A
6335 // reverse when we get a first non-continuation CE.
6336 CE >>= 8;
6337 frenchBuff[frenchIndex++] = (uint8_t)CE;
374ca955 6338 } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
b75a7d8f
A
6339 CE >>= 8; /* get secondary */
6340 if(!frenchIndex) {
6341 if(CE != 0) {
6342 dest[i++]=(uint8_t)CE;
6343 }
6344 } else {
6345 frenchBuff[frenchIndex++] = (uint8_t)CE;
6346 frenchIndex -= usedFrench;
6347 usedFrench = 0;
6348 while(i < count && frenchIndex) {
6349 dest[i++] = frenchBuff[--frenchIndex];
6350 usedFrench++;
6351 }
6352 }
6353 }
6354 if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
6355 consumedExpansionCEs++;
6356 } else {
6357 consumedExpansionCEs = 0;
6358 }
6359 if(s.pos && *s.pos == 0) {
6360 iterSkips++;
6361 }
6362 }
6363 }
6364 } else {
6365 level = UCOL_PSK_CASE;
6366 }
6367 /* fall through to next level */
6368 case UCOL_PSK_CASE:
6369 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
6370 uint32_t caseShift = UCOL_CASE_SHIFT_START;
6371 uint8_t caseByte = UCOL_CASE_BYTE_START;
6372 uint8_t caseBits = 0;
6373
6374 for(;;) {
6375 if(i == count) {
6376 goto saveState;
6377 }
6378 // We should save the state only if we
6379 // are sure that we are done with the
6380 // previous iterator state
6381 if(consumedExpansionCEs == 0) {
6382 newState = s.iterator->getState(s.iterator);
6383 if(newState != UITER_NO_STATE) {
6384 iterState = newState;
6385 iterSkips = 0;
6386 } else {
6387 if(!firstTimeOnLevel) {
6388 iterSkips++;
6389 }
6390 }
6391 }
6392 firstTimeOnLevel = FALSE;
6393 CE = ucol_IGetNextCE(coll, &s, status);
6394 if(CE==UCOL_NO_MORE_CES) {
6395 // On the case level we might have an unfinished
6396 // case byte. Add one if it's started.
6397 if(caseShift != UCOL_CASE_SHIFT_START) {
6398 dest[i++] = caseByte;
6399 }
6400 // This is kind of tricky - situation where
374ca955 6401 // we need to keep the iterator in the old
b75a7d8f
A
6402 // state, but don't need to bring anything
6403 // to the next invocation
6404 if(i < count) {
6405 // Add the level separator
6406 terminatePSKLevel(level, maxLevel, i, dest);
6407 // Restart the iteration and move to the
6408 // next level
6409 s.iterator->move(s.iterator, 0, UITER_START);
6410 level = UCOL_PSK_TERTIARY;
6411 } else {
6412 dontAdvanceIteratorBecauseWeNeedALevelTerminator = TRUE;
6413 }
6414 break;
6415 }
6416
6417 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6418 if(!isContinuation(CE)) {
6419 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6420 caseBits = (uint8_t)(CE & 0xC0);
374ca955 6421 // this copies the case level logic from the
b75a7d8f
A
6422 // sort key generation code
6423 if(CE != 0) {
6424 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6425 if((caseBits & 0xC0) == 0) {
6426 caseByte |= 1 << (--caseShift);
6427 } else {
6428 caseByte |= 0 << (--caseShift);
6429 /* second bit */
6430 if(caseShift == 0) {
6431 dest[i++] = caseByte;
6432 caseShift = UCOL_CASE_SHIFT_START;
6433 caseByte = UCOL_CASE_BYTE_START;
6434 }
6435 caseByte |= ((caseBits>>6)&1) << (--caseShift);
6436 }
6437 } else {
6438 if((caseBits & 0xC0) == 0) {
6439 caseByte |= 0 << (--caseShift);
6440 } else {
6441 caseByte |= 1 << (--caseShift);
6442 /* second bit */
6443 if(caseShift == 0) {
6444 dest[i++] = caseByte;
6445 caseShift = UCOL_CASE_SHIFT_START;
6446 caseByte = UCOL_CASE_BYTE_START;
6447 }
6448 caseByte |= ((caseBits>>7)&1) << (--caseShift);
6449 }
6450 }
6451 }
6452
6453 }
6454 }
6455 // Not sure this is correct for the case level - revisit
6456 if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
6457 consumedExpansionCEs++;
6458 } else {
6459 consumedExpansionCEs = 0;
6460 }
6461 if(s.pos && *s.pos == 0) {
6462 iterSkips++;
6463 }
6464 }
6465 } else {
6466 level = UCOL_PSK_TERTIARY;
6467 }
6468 /* fall through to next level */
6469 case UCOL_PSK_TERTIARY:
6470 if(strength >= UCOL_TERTIARY) {
6471 for(;;) {
6472 if(i == count) {
6473 goto saveState;
6474 }
6475 // We should save the state only if we
6476 // are sure that we are done with the
6477 // previous iterator state
6478 if(consumedExpansionCEs == 0) {
6479 newState = s.iterator->getState(s.iterator);
6480 if(newState != UITER_NO_STATE) {
6481 iterState = newState;
6482 iterSkips = 0;
6483 } else {
6484 if(!firstTimeOnLevel) {
6485 iterSkips++;
6486 }
6487 }
6488 }
6489 firstTimeOnLevel = FALSE;
6490 CE = ucol_IGetNextCE(coll, &s, status);
6491 if(CE==UCOL_NO_MORE_CES) {
6492 // Add the level separator
6493 terminatePSKLevel(level, maxLevel, i, dest);
6494 byteCountOrFrenchDone=0;
6495 // Restart the iteration an move to the
6496 // second level
6497 s.iterator->move(s.iterator, 0, UITER_START);
6498 level = UCOL_PSK_QUATERNARY;
6499 break;
6500 }
6501 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6502 notIsContinuation = !isContinuation(CE);
6503
6504 if(notIsContinuation) {
6505 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6506 CE ^= coll->caseSwitch;
6507 CE &= coll->tertiaryMask;
6508 } else {
6509 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6510 }
6511
6512 if(CE != 0) {
6513 dest[i++]=(uint8_t)CE;
6514 }
6515 }
6516 if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
6517 consumedExpansionCEs++;
6518 } else {
6519 consumedExpansionCEs = 0;
6520 }
6521 if(s.pos && *s.pos == 0) {
6522 iterSkips++;
6523 }
6524 }
6525 } else {
6526 // if we're not doing tertiary
6527 // skip to the end
6528 level = UCOL_PSK_NULL;
6529 }
6530 /* fall through to next level */
6531 case UCOL_PSK_QUATERNARY:
6532 if(strength >= UCOL_QUATERNARY) {
6533 for(;;) {
6534 if(i == count) {
6535 goto saveState;
6536 }
6537 // We should save the state only if we
6538 // are sure that we are done with the
6539 // previous iterator state
6540 if(consumedExpansionCEs == 0) {
6541 newState = s.iterator->getState(s.iterator);
6542 if(newState != UITER_NO_STATE) {
6543 iterState = newState;
6544 iterSkips = 0;
6545 } else {
6546 if(!firstTimeOnLevel) {
6547 iterSkips++;
6548 }
6549 }
6550 }
6551 firstTimeOnLevel = FALSE;
6552 CE = ucol_IGetNextCE(coll, &s, status);
6553 if(CE==UCOL_NO_MORE_CES) {
6554 // Add the level separator
6555 terminatePSKLevel(level, maxLevel, i, dest);
374ca955 6556 //dest[i++] = UCOL_LEVELTERMINATOR;
b75a7d8f
A
6557 byteCountOrFrenchDone=0;
6558 // Restart the iteration an move to the
6559 // second level
6560 s.iterator->move(s.iterator, 0, UITER_START);
6561 level = UCOL_PSK_QUIN;
6562 break;
6563 }
6564 if(isShiftedCE(CE, LVT, &wasShifted)) {
6565 CE >>= 16; /* get primary */
6566 if(CE != 0) {
6567 if(byteCountOrFrenchDone == 0) {
6568 dest[i++]=(uint8_t)(CE >> 8);
6569 } else {
6570 byteCountOrFrenchDone = 0;
6571 }
6572 if((CE &=0xff)!=0) {
6573 if(i==count) {
6574 /* overflow */
6575 byteCountOrFrenchDone=1;
6576 goto saveState;
6577 }
6578 dest[i++]=(uint8_t)CE;
6579 }
6580 }
6581 } else {
6582 notIsContinuation = !isContinuation(CE);
6583 if(notIsContinuation) {
6584 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
6585 dest[i++] = UCOL_HIRAGANA_QUAD;
6586 } else {
6587 dest[i++] = 0xFF;
6588 }
6589 }
6590 }
6591 if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
6592 consumedExpansionCEs++;
6593 } else {
6594 consumedExpansionCEs = 0;
6595 }
6596 if(s.pos && *s.pos == 0) {
6597 iterSkips++;
6598 }
6599 }
6600 } else {
6601 // if we're not doing quaternary
6602 // skip to the end
6603 level = UCOL_PSK_NULL;
6604 }
6605 /* fall through to next level */
6606 case UCOL_PSK_QUIN:
6607 level = UCOL_PSK_IDENTICAL;
6608 /* fall through to next level */
6609 case UCOL_PSK_IDENTICAL:
6610 if(strength >= UCOL_IDENTICAL) {
6611 UChar32 first, second;
6612 int32_t bocsuBytesWritten = 0;
374ca955 6613 // We always need to do identical on
b75a7d8f
A
6614 // the NFD form of the string.
6615 if(normIter == NULL) {
6616 // we arrived from the level below and
6617 // normalization was not turned on.
6618 // therefore, we need to make a fresh NFD iterator
6619 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6620 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
374ca955 6621 } else if(!doingIdenticalFromStart) {
b75a7d8f 6622 // there is an iterator, but we did some other levels.
374ca955
A
6623 // therefore, we have a FCD iterator - need to make
6624 // a NFD one.
b75a7d8f
A
6625 // normIter being at the beginning does not guarantee
6626 // that the underlying iterator is at the beginning
6627 iter->move(iter, 0, UITER_START);
6628 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6629 }
6630 // At this point we have a NFD iterator that is positioned
6631 // in the right place
6632 if(U_FAILURE(*status)) {
374ca955 6633 UTRACE_EXIT_STATUS(*status);
b75a7d8f
A
6634 return 0;
6635 }
6636 first = uiter_previous32(s.iterator);
6637 // maybe we're at the start of the string
6638 if(first == U_SENTINEL) {
6639 first = 0;
6640 } else {
6641 uiter_next32(s.iterator);
6642 }
6643
6644 j = 0;
6645 for(;;) {
6646 if(i == count) {
6647 if(j+1 < bocsuBytesWritten) {
6648 bocsuBytesUsed = j+1;
6649 }
6650 goto saveState;
6651 }
6652
374ca955 6653 // On identical level, we will always save
b75a7d8f
A
6654 // the state if we reach this point, since
6655 // we don't depend on getNextCE for content
6656 // all the content is in our buffer and we
6657 // already either stored the full buffer OR
6658 // otherwise we won't arrive here.
6659 newState = s.iterator->getState(s.iterator);
6660 if(newState != UITER_NO_STATE) {
6661 iterState = newState;
6662 iterSkips = 0;
6663 } else {
6664 iterSkips++;
6665 }
6666
6667 uint8_t buff[4];
6668 second = uiter_next32(s.iterator);
6669
6670 // end condition for identical level
6671 if(second == U_SENTINEL) {
6672 terminatePSKLevel(level, maxLevel, i, dest);
6673 level = UCOL_PSK_NULL;
6674 break;
6675 }
6676 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
6677 first = second;
6678
6679 j = 0;
6680 if(bocsuBytesUsed != 0) {
6681 while(bocsuBytesUsed-->0) {
6682 j++;
6683 }
6684 }
6685
6686 while(i < count && j < bocsuBytesWritten) {
6687 dest[i++] = buff[j++];
6688 }
6689 }
6690
6691 } else {
6692 level = UCOL_PSK_NULL;
6693 }
6694 /* fall through to next level */
6695 case UCOL_PSK_NULL:
6696 j = i;
6697 while(j<count) {
6698 dest[j++]=0;
6699 }
6700 break;
6701 default:
6702 *status = U_INTERNAL_PROGRAM_ERROR;
374ca955 6703 UTRACE_EXIT_STATUS(*status);
b75a7d8f
A
6704 return 0;
6705 }
6706
6707saveState:
6708 // Now we need to return stuff. First we want to see whether we have
6709 // done everything for the current state of iterator.
374ca955 6710 if(consumedExpansionCEs || byteCountOrFrenchDone
b75a7d8f 6711 || dontAdvanceIteratorBecauseWeNeedALevelTerminator) {
374ca955
A
6712 // Any of above mean that the previous transaction
6713 // wasn't finished and that we should store the
b75a7d8f
A
6714 // previous iterator state.
6715 state[0] = iterState;
6716 } else {
374ca955 6717 // The transaction is complete. We will continue in
b75a7d8f
A
6718 // next iteration.
6719 if((newState = s.iterator->getState(s.iterator))!= UITER_NO_STATE) {
6720 state[0] = s.iterator->getState(s.iterator);
6721 iterSkips = 0;
6722 } else {
6723 state[0] = iterState;
6724 iterSkips++;
6725 }
6726 }
6727 // Store the number of elements processed. On CE levels, this is
6728 // the number of expansion CEs processed. On identical level, this
6729 // is the number of bocsu bytes written.
6730 if(level < UCOL_PSK_IDENTICAL) {
6731 if((consumedExpansionCEs & UCOL_PSK_USED_ELEMENTS_MASK) != consumedExpansionCEs) {
6732 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6733 }
6734 state[1] = (consumedExpansionCEs & UCOL_PSK_USED_ELEMENTS_MASK) << UCOL_PSK_USED_ELEMENTS_SHIFT;
6735 } else {
6736 if((bocsuBytesUsed & UCOL_PSK_USED_ELEMENTS_MASK) != bocsuBytesUsed) {
6737 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6738 }
6739 state[1] = (bocsuBytesUsed & UCOL_PSK_USED_ELEMENTS_MASK) << UCOL_PSK_USED_ELEMENTS_SHIFT;
6740 }
6741
6742 // Next we put in the level of comparison
374ca955 6743 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
b75a7d8f
A
6744
6745 // If we are doing French, we need to store whether we have just finished the French level
6746 if(level == UCOL_PSK_SECONDARY && doingFrench) {
6747 state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6748 } else {
6749 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6750 }
6751
6752 // Was the latest CE shifted
6753 if(wasShifted) {
6754 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6755 }
6756 // Check for iterSkips overflow
6757 if((iterSkips & UCOL_PSK_ITER_SKIP_MASK) != iterSkips) {
6758 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6759 }
6760 // Store iterSkips
6761 state[1] |= ((iterSkips & UCOL_PSK_ITER_SKIP_MASK) << UCOL_PSK_ITER_SKIP_SHIFT);
6762
6763 // Check for French overflow
6764 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6765 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6766 }
6767 // Store number of bytes written in the French secondary continuation sequence
6768 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6769
6770
6771 // If we have used normalizing iterator, get rid of it
6772 if(normIter != NULL) {
6773 unorm_closeIter(normIter);
6774 }
6775
6776 // Return number of meaningful sortkey bytes.
374ca955
A
6777 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6778 dest,i, state[0], state[1]);
6779 UTRACE_EXIT_VALUE(i);
b75a7d8f
A
6780 return i;
6781}
6782
6783/**
6784 * Produce a bound for a given sortkey and a number of levels.
6785 */
374ca955 6786U_CAPI int32_t U_EXPORT2
b75a7d8f
A
6787ucol_getBound(const uint8_t *source,
6788 int32_t sourceLength,
6789 UColBoundMode boundType,
6790 uint32_t noOfLevels,
6791 uint8_t *result,
6792 int32_t resultLength,
6793 UErrorCode *status) {
374ca955 6794 // consistency checks
b75a7d8f
A
6795 if(status == NULL || U_FAILURE(*status)) {
6796 return 0;
6797 }
6798 if(source == NULL) {
6799 *status = U_ILLEGAL_ARGUMENT_ERROR;
6800 return 0;
6801 }
6802
6803 int32_t sourceIndex = 0;
6804 // Scan the string until we skip enough of the key OR reach the end of the key
6805 do {
6806 sourceIndex++;
6807 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6808 noOfLevels--;
6809 }
374ca955 6810 } while (noOfLevels > 0
b75a7d8f
A
6811 && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6812
6813 if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6814 && noOfLevels > 0) {
6815 *status = U_SORT_KEY_TOO_SHORT_WARNING;
6816 }
6817
6818
6819 // READ ME: this code assumes that the values for boundType
6820 // enum will not changes. They are set so that the enum value
374ca955 6821 // corresponds to the number of extra bytes each bound type
b75a7d8f
A
6822 // needs.
6823 if(result != NULL && resultLength >= sourceIndex+boundType) {
6824 uprv_memcpy(result, source, sourceIndex);
6825 switch(boundType) {
6826 // Lower bound just gets terminated. No extra bytes
6827 case UCOL_BOUND_LOWER: // = 0
6828 break;
6829 // Upper bound needs one extra byte
6830 case UCOL_BOUND_UPPER: // = 1
6831 result[sourceIndex++] = 2;
6832 break;
6833 // Upper long bound needs two extra bytes
6834 case UCOL_BOUND_UPPER_LONG: // = 2
6835 result[sourceIndex++] = 0xFF;
6836 result[sourceIndex++] = 0xFF;
6837 break;
6838 default:
6839 *status = U_ILLEGAL_ARGUMENT_ERROR;
6840 return 0;
6841 }
6842 result[sourceIndex++] = 0;
6843
6844 return sourceIndex;
6845 } else {
374ca955 6846 return sourceIndex+boundType+1;
b75a7d8f
A
6847 }
6848}
6849
6850static
6851inline void uprv_appendByteToHexString(char *dst, uint8_t val) {
6852 uint32_t len = (uint32_t)uprv_strlen(dst);
6853 *(dst+len) = T_CString_itosOffset((val >> 4));
6854 *(dst+len+1) = T_CString_itosOffset((val & 0xF));
6855 *(dst+len+2) = 0;
6856}
6857
6858/* this function makes a string with representation of a sortkey */
6859U_CAPI char* U_EXPORT2 ucol_sortKeyToString(const UCollator *coll, const uint8_t *sortkey, char *buffer, uint32_t *len) {
6860 int32_t strength = UCOL_PRIMARY;
6861 uint32_t res_size = 0;
6862 UBool doneCase = FALSE;
6863
6864 char *current = buffer;
6865 const uint8_t *currentSk = sortkey;
6866
6867 uprv_strcpy(current, "[");
6868
6869 while(strength <= UCOL_QUATERNARY && strength <= coll->strength) {
6870 if(strength > UCOL_PRIMARY) {
374ca955 6871 uprv_strcat(current, " . ");
b75a7d8f
A
6872 }
6873 while(*currentSk != 0x01 && *currentSk != 0x00) { /* print a level */
6874 uprv_appendByteToHexString(current, *currentSk++);
6875 uprv_strcat(current, " ");
6876 }
6877 if(coll->caseLevel == UCOL_ON && strength == UCOL_SECONDARY && doneCase == FALSE) {
6878 doneCase = TRUE;
6879 } else if(coll->caseLevel == UCOL_OFF || doneCase == TRUE || strength != UCOL_SECONDARY) {
6880 strength ++;
6881 }
6882 uprv_appendByteToHexString(current, *currentSk++); /* This should print '01' */
6883 if(strength == UCOL_QUATERNARY && coll->alternateHandling == UCOL_NON_IGNORABLE) {
6884 break;
6885 }
6886 }
6887
6888 if(coll->strength == UCOL_IDENTICAL) {
6889 uprv_strcat(current, " . ");
6890 while(*currentSk != 0) {
6891 uprv_appendByteToHexString(current, *currentSk++);
6892 uprv_strcat(current, " ");
6893 }
6894
6895 uprv_appendByteToHexString(current, *currentSk++);
6896 }
6897 uprv_strcat(current, "]");
6898
6899 if(res_size > *len) {
6900 return NULL;
6901 }
6902
6903 return buffer;
6904}
6905
6906
6907/****************************************************************************/
6908/* Following are the functions that deal with the properties of a collator */
6909/* there are new APIs and some compatibility APIs */
6910/****************************************************************************/
6911
6912static inline void
6913ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6914 int32_t *primShift, int32_t *secShift, int32_t *terShift) {
6915 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6916 UBool reverseSecondary = FALSE;
6917 if(!isContinuation(CE)) {
6918 tertiary = (uint8_t)((CE & coll->tertiaryMask));
6919 tertiary ^= coll->caseSwitch;
6920 reverseSecondary = TRUE;
6921 } else {
6922 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6923 tertiary &= UCOL_REMOVE_CASE;
6924 reverseSecondary = FALSE;
6925 }
6926
6927 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6928 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6929 primary1 = (uint8_t)(CE >> 8);
6930
6931 if(primary1 != 0) {
6932 coll->latinOneCEs[ch] |= (primary1 << *primShift);
6933 *primShift -= 8;
6934 }
6935 if(primary2 != 0) {
6936 if(*primShift < 0) {
6937 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6938 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6939 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6940 return;
6941 }
6942 coll->latinOneCEs[ch] |= (primary2 << *primShift);
6943 *primShift -= 8;
6944 }
6945 if(secondary != 0) {
6946 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6947 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6948 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
374ca955 6949 } else { // normal case
b75a7d8f
A
6950 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6951 }
6952 *secShift -= 8;
6953 }
6954 if(tertiary != 0) {
6955 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6956 *terShift -= 8;
6957 }
6958}
6959
6960static inline UBool
6961ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6962 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6963 if(newTable == NULL) {
6964 *status = U_MEMORY_ALLOCATION_ERROR;
6965 coll->latinOneFailed = TRUE;
6966 return FALSE;
6967 }
6968 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6969 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6970 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6971 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6972 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6973 coll->latinOneTableLen = size;
6974 uprv_free(coll->latinOneCEs);
6975 coll->latinOneCEs = newTable;
6976 return TRUE;
6977}
6978
6979static UBool
6980ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
6981 UBool result = TRUE;
6982 if(coll->latinOneCEs == NULL) {
6983 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
6984 if(coll->latinOneCEs == NULL) {
6985 *status = U_MEMORY_ALLOCATION_ERROR;
6986 return FALSE;
6987 }
6988 coll->latinOneTableLen = UCOL_LATINONETABLELEN;
6989 }
6990 UChar ch = 0;
6991 UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
6992 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
6993
6994 int32_t primShift = 24, secShift = 24, terShift = 24;
6995 uint32_t CE = 0;
6996 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
6997
6998 // TODO: make safe if you get more than you wanted...
6999 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
7000 primShift = 24; secShift = 24; terShift = 24;
7001 if(ch < 0x100) {
7002 CE = coll->latinOneMapping[ch];
7003 } else {
7004 CE = UTRIE_GET32_FROM_LEAD(coll->mapping, ch);
374ca955
A
7005 if(CE == UCOL_NOT_FOUND && coll->UCA) {
7006 CE = UTRIE_GET32_FROM_LEAD(coll->UCA->mapping, ch);
b75a7d8f
A
7007 }
7008 }
7009 if(CE < UCOL_NOT_FOUND) {
7010 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
7011 } else {
7012 switch (getCETag(CE)) {
7013 case EXPANSION_TAG:
374ca955 7014 case DIGIT_TAG:
b75a7d8f 7015 ucol_setText(it, &ch, 1, status);
374ca955 7016 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
b75a7d8f
A
7017 if(primShift < 0 || secShift < 0 || terShift < 0) {
7018 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
7019 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
7020 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
7021 break;
7022 }
7023 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
7024 }
7025 break;
7026 case CONTRACTION_TAG:
7027 // here is the trick
7028 // F2 is contraction. We do something very similar to contractions
7029 // but have two indices, one in the real contraction table and the
7030 // other to where we stuffed things. This hopes that we don't have
7031 // many contractions (this should work for latin-1 tables).
7032 {
7033 if((CE & 0x00FFF000) != 0) {
7034 *status = U_UNSUPPORTED_ERROR;
374ca955 7035 coll->latinOneFailed = TRUE;
b75a7d8f
A
7036 return FALSE;
7037 }
7038
7039 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
7040
7041 CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
374ca955 7042
b75a7d8f
A
7043 coll->latinOneCEs[ch] = CE;
7044 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
7045 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
7046
7047 // We're going to jump into contraction table, pick the elements
7048 // and use them
7049 do {
7050 CE = *(coll->contractionCEs +
7051 (UCharOffset - coll->contractionIndex));
374ca955 7052 if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
b75a7d8f
A
7053 uint32_t size;
7054 uint32_t i; /* general counter */
7055 uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
7056 size = getExpansionCount(CE);
7057 //CE = *CEOffset++;
7058 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
7059 for(i = 0; i<size; i++) {
7060 if(primShift < 0 || secShift < 0 || terShift < 0) {
7061 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
7062 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
7063 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
7064 break;
7065 }
7066 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
7067 }
7068 } else { /* else, we do */
7069 while(*CEOffset != 0) {
7070 if(primShift < 0 || secShift < 0 || terShift < 0) {
7071 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
7072 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
7073 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
7074 break;
7075 }
7076 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
7077 }
7078 }
7079 contractionOffset++;
7080 } else if(CE < UCOL_NOT_FOUND) {
7081 ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
7082 } else {
7083 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
7084 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
7085 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
7086 contractionOffset++;
374ca955 7087 }
b75a7d8f
A
7088 UCharOffset++;
7089 primShift = 24; secShift = 24; terShift = 24;
7090 if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
7091 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
374ca955 7092 coll->latinOneFailed = TRUE;
b75a7d8f
A
7093 return FALSE;
7094 }
7095 }
7096 } while(*UCharOffset != 0xFFFF);
7097 }
7098 break;
7099 default:
7100 coll->latinOneFailed = TRUE;
7101 result = FALSE;
7102 break;
7103 }
7104 }
7105 }
7106 ucol_closeElements(it);
7107 // compact table
7108 if(contractionOffset < coll->latinOneTableLen) {
7109 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
374ca955
A
7110 coll->latinOneFailed = TRUE;
7111 return FALSE;
b75a7d8f
A
7112 }
7113 }
7114 return result;
7115}
7116
7117void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
7118 if(U_SUCCESS(*status)) {
7119 if(coll->caseFirst == UCOL_UPPER_FIRST) {
7120 coll->caseSwitch = UCOL_CASE_SWITCH;
7121 } else {
7122 coll->caseSwitch = UCOL_NO_CASE_SWITCH;
7123 }
7124
7125 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
7126 coll->tertiaryMask = UCOL_REMOVE_CASE;
7127 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
7128 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_OFF;
7129 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
7130 coll->tertiaryBottom = UCOL_COMMON_BOT3;
7131 } else {
7132 coll->tertiaryMask = UCOL_KEEP_CASE;
7133 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
7134 if(coll->caseFirst == UCOL_UPPER_FIRST) {
7135 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
7136 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
7137 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
7138 } else {
7139 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
7140 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
7141 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
7142 }
7143 }
7144
7145 /* Set the compression values */
7146 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1);
7147 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
7148 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
7149
7150 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
7151 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) {
7152 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
7153 } else {
7154 coll->sortKeyGen = ucol_calcSortKey;
7155 }
374ca955 7156 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
b75a7d8f
A
7157 && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) {
7158 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
7159 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
7160 //fprintf(stderr, "F");
7161 coll->latinOneUse = TRUE;
7162 } else {
7163 coll->latinOneUse = FALSE;
7164 }
374ca955
A
7165 if(*status == U_UNSUPPORTED_ERROR) {
7166 *status = U_ZERO_ERROR;
7167 }
b75a7d8f
A
7168 } else { // latin1Table exists and it doesn't need to be regenerated, just use it
7169 coll->latinOneUse = TRUE;
7170 }
7171 } else {
7172 coll->latinOneUse = FALSE;
374ca955 7173 }
b75a7d8f
A
7174 }
7175
7176}
7177
7178U_CAPI uint32_t U_EXPORT2
7179ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
7180 if(U_FAILURE(*status) || coll == NULL) {
7181 return 0;
7182 }
7183 if(len == -1) {
7184 len = u_strlen(varTop);
7185 }
7186 if(len == 0) {
7187 *status = U_ILLEGAL_ARGUMENT_ERROR;
7188 return 0;
7189 }
7190
7191 collIterate s;
7192 IInit_collIterate(coll, varTop, len, &s);
7193
7194 uint32_t CE = ucol_IGetNextCE(coll, &s, status);
7195
7196 /* here we check if we have consumed all characters */
7197 /* you can put in either one character or a contraction */
7198 /* you shouldn't put more... */
7199 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
7200 *status = U_CE_NOT_FOUND_ERROR;
7201 return 0;
7202 }
7203
7204 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
7205
7206 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
7207 *status = U_PRIMARY_TOO_LONG_ERROR;
7208 return 0;
7209 }
374ca955
A
7210 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
7211 coll->variableTopValueisDefault = FALSE;
7212 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
7213 }
b75a7d8f
A
7214
7215 return CE & UCOL_PRIMARYMASK;
7216}
7217
7218U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
7219 if(U_FAILURE(*status) || coll == NULL) {
7220 return 0;
7221 }
7222 return coll->variableTopValue<<16;
7223}
7224
7225U_CAPI void U_EXPORT2
7226ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
7227 if(U_FAILURE(*status) || coll == NULL) {
7228 return;
7229 }
374ca955
A
7230
7231 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
7232 coll->variableTopValueisDefault = FALSE;
7233 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
7234 }
b75a7d8f
A
7235}
7236/* Attribute setter API */
7237U_CAPI void U_EXPORT2
7238ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
7239 if(U_FAILURE(*status) || coll == NULL) {
7240 return;
7241 }
7242 UColAttributeValue oldFrench = coll->frenchCollation;
7243 UColAttributeValue oldCaseFirst = coll->caseFirst;
7244 switch(attr) {
7245 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
7246 if(value == UCOL_ON) {
7247 coll->numericCollation = UCOL_ON;
7248 coll->numericCollationisDefault = FALSE;
7249 } else if (value == UCOL_OFF) {
7250 coll->numericCollation = UCOL_OFF;
7251 coll->numericCollationisDefault = FALSE;
7252 } else if (value == UCOL_DEFAULT) {
7253 coll->numericCollationisDefault = TRUE;
7254 coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
7255 } else {
7256 *status = U_ILLEGAL_ARGUMENT_ERROR;
7257 }
7258 break;
7259 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
7260 if(value == UCOL_ON) {
7261 coll->hiraganaQ = UCOL_ON;
7262 coll->hiraganaQisDefault = FALSE;
7263 } else if (value == UCOL_OFF) {
7264 coll->hiraganaQ = UCOL_OFF;
7265 coll->hiraganaQisDefault = FALSE;
7266 } else if (value == UCOL_DEFAULT) {
7267 coll->hiraganaQisDefault = TRUE;
7268 coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
7269 } else {
7270 *status = U_ILLEGAL_ARGUMENT_ERROR;
7271 }
7272 break;
7273 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
7274 if(value == UCOL_ON) {
7275 coll->frenchCollation = UCOL_ON;
7276 coll->frenchCollationisDefault = FALSE;
7277 } else if (value == UCOL_OFF) {
7278 coll->frenchCollation = UCOL_OFF;
7279 coll->frenchCollationisDefault = FALSE;
7280 } else if (value == UCOL_DEFAULT) {
7281 coll->frenchCollationisDefault = TRUE;
7282 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
7283 } else {
7284 *status = U_ILLEGAL_ARGUMENT_ERROR ;
7285 }
7286 break;
7287 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
7288 if(value == UCOL_SHIFTED) {
7289 coll->alternateHandling = UCOL_SHIFTED;
7290 coll->alternateHandlingisDefault = FALSE;
7291 } else if (value == UCOL_NON_IGNORABLE) {
7292 coll->alternateHandling = UCOL_NON_IGNORABLE;
7293 coll->alternateHandlingisDefault = FALSE;
7294 } else if (value == UCOL_DEFAULT) {
7295 coll->alternateHandlingisDefault = TRUE;
7296 coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
7297 } else {
7298 *status = U_ILLEGAL_ARGUMENT_ERROR ;
7299 }
7300 break;
7301 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
7302 if(value == UCOL_LOWER_FIRST) {
7303 coll->caseFirst = UCOL_LOWER_FIRST;
7304 coll->caseFirstisDefault = FALSE;
7305 } else if (value == UCOL_UPPER_FIRST) {
7306 coll->caseFirst = UCOL_UPPER_FIRST;
7307 coll->caseFirstisDefault = FALSE;
7308 } else if (value == UCOL_OFF) {
7309 coll->caseFirst = UCOL_OFF;
7310 coll->caseFirstisDefault = FALSE;
7311 } else if (value == UCOL_DEFAULT) {
7312 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
7313 coll->caseFirstisDefault = TRUE;
7314 } else {
7315 *status = U_ILLEGAL_ARGUMENT_ERROR ;
7316 }
7317 break;
7318 case UCOL_CASE_LEVEL: /* do we have an extra case level */
7319 if(value == UCOL_ON) {
7320 coll->caseLevel = UCOL_ON;
7321 coll->caseLevelisDefault = FALSE;
7322 } else if (value == UCOL_OFF) {
7323 coll->caseLevel = UCOL_OFF;
7324 coll->caseLevelisDefault = FALSE;
7325 } else if (value == UCOL_DEFAULT) {
7326 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
7327 coll->caseLevelisDefault = TRUE;
7328 } else {
7329 *status = U_ILLEGAL_ARGUMENT_ERROR ;
7330 }
7331 break;
7332 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
7333 if(value == UCOL_ON) {
7334 coll->normalizationMode = UCOL_ON;
7335 coll->normalizationModeisDefault = FALSE;
7336 } else if (value == UCOL_OFF) {
7337 coll->normalizationMode = UCOL_OFF;
7338 coll->normalizationModeisDefault = FALSE;
7339 } else if (value == UCOL_DEFAULT) {
7340 coll->normalizationModeisDefault = TRUE;
7341 coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
7342 } else {
7343 *status = U_ILLEGAL_ARGUMENT_ERROR ;
7344 }
7345 break;
7346 case UCOL_STRENGTH: /* attribute for strength */
7347 if (value == UCOL_DEFAULT) {
7348 coll->strengthisDefault = TRUE;
7349 coll->strength = (UColAttributeValue)coll->options->strength;
7350 } else if (value <= UCOL_IDENTICAL) {
7351 coll->strengthisDefault = FALSE;
7352 coll->strength = value;
7353 } else {
7354 *status = U_ILLEGAL_ARGUMENT_ERROR ;
7355 }
7356 break;
7357 case UCOL_ATTRIBUTE_COUNT:
7358 default:
7359 *status = U_ILLEGAL_ARGUMENT_ERROR;
7360 break;
7361 }
7362 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
7363 coll->latinOneRegenTable = TRUE;
374ca955 7364 } else {
b75a7d8f
A
7365 coll->latinOneRegenTable = FALSE;
7366 }
7367 ucol_updateInternalState(coll, status);
7368}
7369
7370U_CAPI UColAttributeValue U_EXPORT2
7371ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
7372 if(U_FAILURE(*status) || coll == NULL) {
7373 return UCOL_DEFAULT;
7374 }
7375 switch(attr) {
7376 case UCOL_NUMERIC_COLLATION:
374ca955 7377 return coll->numericCollation;
b75a7d8f
A
7378 case UCOL_HIRAGANA_QUATERNARY_MODE:
7379 return coll->hiraganaQ;
7380 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
7381 return coll->frenchCollation;
7382 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
7383 return coll->alternateHandling;
7384 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
7385 return coll->caseFirst;
7386 case UCOL_CASE_LEVEL: /* do we have an extra case level */
7387 return coll->caseLevel;
7388 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
7389 return coll->normalizationMode;
7390 case UCOL_STRENGTH: /* attribute for strength */
7391 return coll->strength;
7392 case UCOL_ATTRIBUTE_COUNT:
7393 default:
7394 *status = U_ILLEGAL_ARGUMENT_ERROR;
7395 break;
7396 }
7397 return UCOL_DEFAULT;
7398}
7399
7400U_CAPI void U_EXPORT2
7401ucol_setStrength( UCollator *coll,
7402 UCollationStrength strength)
7403{
7404 UErrorCode status = U_ZERO_ERROR;
7405 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
7406}
7407
7408U_CAPI UCollationStrength U_EXPORT2
7409ucol_getStrength(const UCollator *coll)
7410{
7411 UErrorCode status = U_ZERO_ERROR;
7412 return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
7413}
7414
7415/****************************************************************************/
7416/* Following are misc functions */
7417/* there are new APIs and some compatibility APIs */
7418/****************************************************************************/
7419
7420U_CAPI UCollator* U_EXPORT2
7421ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
7422{
7423 UCollator * localCollator;
7424 int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
7425 char *stackBufferChars = (char *)stackBuffer;
7426
7427 if (status == NULL || U_FAILURE(*status)){
7428 return 0;
7429 }
7430 if ((stackBuffer && !pBufferSize) || !coll){
7431 *status = U_ILLEGAL_ARGUMENT_ERROR;
7432 return 0;
7433 }
7434 /* Pointers on 64-bit platforms need to be aligned
7435 * on a 64-bit boundry in memory.
7436 */
7437 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
7438 int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
7439 *pBufferSize -= offsetUp;
7440 stackBufferChars += offsetUp;
7441 }
7442 stackBuffer = (void *)stackBufferChars;
7443
7444 if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
7445 *pBufferSize = bufferSizeNeeded;
7446 return 0;
7447 }
7448 if (!stackBuffer || *pBufferSize < bufferSizeNeeded) {
7449 /* allocate one here...*/
7450 int32_t length;
7451 const UChar * rules = ucol_getRules(coll, &length);
7452
7453 localCollator = ucol_openRules(rules,
7454 length,
7455 ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status),
7456 ucol_getStrength(coll),
7457 NULL,
7458 status);
7459 if (U_SUCCESS(*status))
7460 {
7461 *status = U_SAFECLONE_ALLOCATED_WARNING;
7462 }
7463 } else {
7464 localCollator = (UCollator *)stackBuffer;
374ca955 7465 uprv_memcpy(localCollator, coll, sizeof(UCollator));
b75a7d8f 7466 localCollator->freeOnClose = FALSE;
374ca955
A
7467 localCollator->requestedLocale = NULL; // zero copies of pointers
7468 localCollator->validLocale = NULL;
b75a7d8f
A
7469 }
7470 return localCollator;
7471}
7472
7473U_CAPI int32_t U_EXPORT2
7474ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen) {
7475 UErrorCode status = U_ZERO_ERROR;
7476 int32_t len = 0;
7477 int32_t UCAlen = 0;
7478 const UChar* ucaRules = 0;
7479 const UChar *rules = ucol_getRules(coll, &len);
7480 if(delta == UCOL_FULL_RULES) {
7481 /* take the UCA rules and append real rules at the end */
7482 /* UCA rules will be probably coming from the root RB */
7483 ucaRules = ures_getStringByKey(coll->rb,"%%UCARULES",&UCAlen,&status);
374ca955
A
7484 /*
7485 UResourceBundle* cresb = ures_getByKeyWithFallback(coll->rb, "collations", NULL, &status);
7486 UResourceBundle* uca = ures_getByKeyWithFallback(cresb, "UCA", NULL, &status);
7487 ucaRules = ures_getStringByKey(uca,"Sequence",&UCAlen,&status);
7488 ures_close(uca);
7489 ures_close(cresb);
7490 */
b75a7d8f
A
7491 }
7492 if(U_FAILURE(status)) {
7493 return 0;
7494 }
7495 if(buffer!=0 && bufferLen>0){
7496 *buffer=0;
7497 if(UCAlen > 0) {
7498 u_memcpy(buffer, ucaRules, uprv_min(UCAlen, bufferLen));
7499 }
7500 if(len > 0 && bufferLen > UCAlen) {
7501 u_memcpy(buffer+UCAlen, rules, uprv_min(len, bufferLen-UCAlen));
7502 }
7503 }
7504 return u_terminateUChars(buffer, bufferLen, len+UCAlen, &status);
7505}
7506
7507static const UChar _NUL = 0;
7508
7509U_CAPI const UChar* U_EXPORT2
7510ucol_getRules( const UCollator *coll,
7511 int32_t *length)
7512{
7513 if(coll->rules != NULL) {
7514 *length = coll->rulesLength;
7515 return coll->rules;
7516 } else {
7517 UErrorCode status = U_ZERO_ERROR;
374ca955 7518 if(coll->elements != NULL) {
b75a7d8f
A
7519 if(U_SUCCESS(status)) {
7520 /*Semantic const */
374ca955 7521 ((UCollator *)coll)->rules = ures_getStringByKey(coll->elements, "Sequence", length, &status);
b75a7d8f
A
7522 ((UCollator *)coll)->rulesLength = *length;
7523 ((UCollator *)coll)->freeRulesOnClose = FALSE;
b75a7d8f
A
7524 return coll->rules;
7525 }
7526 }
7527 *length = 0;
7528 return &_NUL;
7529 }
7530}
7531
7532U_CAPI int32_t U_EXPORT2
7533ucol_getDisplayName( const char *objLoc,
7534 const char *dispLoc,
7535 UChar *result,
7536 int32_t resultLength,
7537 UErrorCode *status)
7538{
7539
7540 if(U_FAILURE(*status)) return -1;
7541 UnicodeString dst;
7542 if(!(result==NULL && resultLength==0)) {
7543 // NULL destination for pure preflighting: empty dummy string
7544 // otherwise, alias the destination buffer
7545 dst.setTo(result, 0, resultLength);
7546 }
7547 Collator::getDisplayName(Locale(objLoc), Locale(dispLoc), dst);
7548 return dst.extract(result, resultLength, *status);
7549}
7550
7551U_CAPI const char* U_EXPORT2
7552ucol_getAvailable(int32_t index)
7553{
7554 return uloc_getAvailable(index);
7555}
7556
7557U_CAPI int32_t U_EXPORT2
7558ucol_countAvailable()
7559{
7560 return uloc_countAvailable();
7561}
7562
374ca955
A
7563#if !UCONFIG_NO_SERVICE
7564U_CAPI UEnumeration* U_EXPORT2
7565ucol_openAvailableLocales(UErrorCode *status) {
7566 // This is a wrapper over Collator::getAvailableLocales()
7567 if (U_FAILURE(*status)) {
7568 return NULL;
7569 }
7570 StringEnumeration *s = Collator::getAvailableLocales();
7571 if (s == NULL) {
7572 *status = U_MEMORY_ALLOCATION_ERROR;
7573 return NULL;
7574 }
7575 return uenum_openStringEnumeration(s, status);
7576}
7577#endif
7578
/*
 * Names used by the keyword APIs below.
 * Note: KEYWORDS[0] != RESOURCE_NAME - alan
 * (the resource is called "collations", the keyword "collation").
 */

/* name of the resource that holds the collation data */
static const char* RESOURCE_NAME = "collations";

/* the keywords this service understands; currently exactly one */
static const char* KEYWORDS[] = {
    "collation"
};

/* number of entries in KEYWORDS */
#define KEYWORD_COUNT (sizeof(KEYWORDS) / sizeof(KEYWORDS[0]))
7586
7587U_CAPI UEnumeration* U_EXPORT2
7588ucol_getKeywords(UErrorCode *status) {
7589 UEnumeration *result = NULL;
7590 if (U_SUCCESS(*status)) {
7591 return uenum_openCharStringsEnumeration(KEYWORDS, KEYWORD_COUNT, status);
7592 }
7593 return result;
7594}
7595
7596U_CAPI UEnumeration* U_EXPORT2
7597ucol_getKeywordValues(const char *keyword, UErrorCode *status) {
7598 // hard-coded to accept exactly one collation keyword
7599 // modify if additional collation keyword is added later
7600 if (U_SUCCESS(*status) &&
7601 keyword==NULL || uprv_strcmp(keyword, KEYWORDS[0])!=0) {
7602 *status = U_ILLEGAL_ARGUMENT_ERROR;
7603 return NULL;
7604 }
7605 return ures_getKeywordValues(U_ICUDATA_COLL, RESOURCE_NAME, status);
7606}
7607
7608U_CAPI int32_t U_EXPORT2
7609ucol_getFunctionalEquivalent(char* result, int32_t resultCapacity,
7610 const char* keyword, const char* locale,
7611 UBool* isAvailable, UErrorCode* status) {
7612 // N.B.: Resource name is "collations" but keyword is "collation"
7613 return ures_getFunctionalEquivalent(result, resultCapacity, U_ICUDATA_COLL,
7614 "collations", keyword, locale,
7615 isAvailable, TRUE, status);
7616}
7617
b75a7d8f
A
7618U_CAPI void U_EXPORT2
7619ucol_getVersion(const UCollator* coll,
7620 UVersionInfo versionInfo)
7621{
7622 /* RunTime version */
7623 uint8_t rtVersion = UCOL_RUNTIME_VERSION;
7624 /* Builder version*/
7625 uint8_t bdVersion = coll->image->version[0];
7626
7627 /* Charset Version. Need to get the version from cnv files
7628 * makeconv should populate cnv files with version and
7629 * an api has to be provided in ucnv.h to obtain this version
7630 */
7631 uint8_t csVersion = 0;
7632
7633 /* combine the version info */
7634 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
7635
7636 /* Tailoring rules */
7637 versionInfo[0] = (uint8_t)(cmbVersion>>8);
7638 versionInfo[1] = (uint8_t)cmbVersion;
7639 versionInfo[2] = coll->image->version[1];
374ca955
A
7640 if(coll->UCA) {
7641 versionInfo[3] = coll->UCA->image->UCAVersion[0];
7642 } else {
7643 versionInfo[3] = 0;
7644 }
b75a7d8f
A
7645}
7646
7647
7648/* This internal API checks whether a character is tailored or not */
7649U_CAPI UBool U_EXPORT2
7650ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
7651 uint32_t CE = UCOL_NOT_FOUND;
7652 const UChar *ContractionStart = NULL;
7653 if(U_SUCCESS(*status) && coll != NULL) {
374ca955 7654 if(coll == coll->UCA) {
b75a7d8f
A
7655 return FALSE;
7656 } else if(u < 0x100) { /* latin-1 */
7657 CE = coll->latinOneMapping[u];
374ca955 7658 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
b75a7d8f
A
7659 return FALSE;
7660 }
7661 } else { /* regular */
7662 /*CE = ucmpe32_get(coll->mapping, u);*/
7663 CE = UTRIE_GET32_FROM_LEAD(coll->mapping, u);
7664
7665 }
7666
7667 if(isContraction(CE)) {
7668 ContractionStart = (UChar *)coll->image+getContractOffset(CE);
7669 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
7670 }
7671
7672 if(CE == UCOL_NOT_FOUND) {
7673 return FALSE;
7674 } else {
7675 return TRUE;
7676 }
7677 } else {
7678 return FALSE;
7679 }
7680}
7681
7682
7683/****************************************************************************/
7684/* Following are the string compare functions */
7685/* */
7686/****************************************************************************/
7687
7688
7689/* ucol_checkIdent internal function. Does byte level string compare. */
7690/* Used by strcoll if strength == identical and strings */
7691/* are otherwise equal. Moved out-of-line because this */
7692/* is a rare case. */
7693/* */
7694/* Comparison must be done on NFD normalized strings. */
7695/* FCD is not good enough. */
7696/* */
7697/* TODO: make an incremental NFD Comparison function, which could */
7698/* be of general use */
7699
7700static
7701UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
7702{
7703
374ca955 7704 // TODO: When we have an UChar iterator, we need to access the whole string. One
b75a7d8f
A
7705 // useful modification would be a UChar iterator extract API, since reset next next...
7706 // is not optimal.
7707 // TODO: Handle long strings. Do the same in compareUsingSortKeys.
7708
7709 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
374ca955 7710 // of same type, but that doesn't really mean that it will stay that way.
b75a7d8f
A
7711
7712 // The division for the array length may truncate the array size to
7713 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
7714 // for all platforms anyway.
7715 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7716 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7717 //UChar sStackBuf[256], tStackBuf[256];
7718 //int32_t sBufSize = 256, tBufSize = 256;
7719 int32_t comparison;
7720 int32_t sLen = 0;
7721 UChar *sBuf = NULL;
7722 int32_t tLen = 0;
7723 UChar *tBuf = NULL;
7724 UBool freeSBuf = FALSE, freeTBuf = FALSE;
7725
7726 if (sColl->flags & UCOL_USE_ITERATOR) {
7727 UNormIterator *sNIt = NULL, *tNIt = NULL;
7728 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
7729 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
7730 sColl->iterator->move(sColl->iterator, 0, UITER_START);
7731 tColl->iterator->move(tColl->iterator, 0, UITER_START);
7732 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
7733 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
7734 comparison = u_strCompareIter(sIt, tIt, TRUE);
7735 unorm_closeIter(sNIt);
7736 unorm_closeIter(tNIt);
7737 } else {
7738 sLen = (sColl->flags & UCOL_ITER_HASLEN) ? sColl->endp - sColl->string : -1;
7739 sBuf = sColl->string;
7740 tLen = (tColl->flags & UCOL_ITER_HASLEN) ? tColl->endp - tColl->string : -1;
7741 tBuf = tColl->string;
7742
7743 if (normalize) {
7744 *status = U_ZERO_ERROR;
7745 if (unorm_quickCheck(sBuf, sLen, UNORM_NFD, status) != UNORM_YES) {
7746 sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
7747 sBuf, sLen,
7748 FALSE, 0,
7749 status);
7750 if(*status == U_BUFFER_OVERFLOW_ERROR) {
7751 if(!u_growBufferFromStatic(sColl->stackWritableBuffer,
7752 &sColl->writableBuffer,
7753 (int32_t *)&sColl->writableBufSize, sLen,
7754 0)
7755 ) {
7756 *status = U_MEMORY_ALLOCATION_ERROR;
7757 return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7758 }
7759 *status = U_ZERO_ERROR;
7760 sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
7761 sBuf, sLen,
7762 FALSE, 0,
7763 status);
7764 }
7765 if(freeSBuf) {
7766 uprv_free(sBuf);
7767 freeSBuf = FALSE;
7768 }
7769 sBuf = sColl->writableBuffer;
7770 if (sBuf != sColl->stackWritableBuffer) {
7771 sColl->flags |= UCOL_ITER_ALLOCATED;
7772 }
7773 }
7774
7775 *status = U_ZERO_ERROR;
7776 if (unorm_quickCheck(tBuf, tLen, UNORM_NFD, status) != UNORM_YES) {
7777 tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
7778 tBuf, tLen,
7779 FALSE, 0,
7780 status);
7781 if(*status == U_BUFFER_OVERFLOW_ERROR) {
7782 if(!u_growBufferFromStatic(tColl->stackWritableBuffer,
7783 &tColl->writableBuffer,
7784 (int32_t *)&tColl->writableBufSize, tLen,
7785 0)
7786 ) {
7787 *status = U_MEMORY_ALLOCATION_ERROR;
7788 return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7789 }
7790 *status = U_ZERO_ERROR;
7791 tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
7792 tBuf, tLen,
7793 FALSE, 0,
7794 status);
7795 }
7796 if(freeTBuf) {
7797 uprv_free(tBuf);
7798 freeTBuf = FALSE;
7799 }
7800 tBuf = tColl->writableBuffer;
7801 if (tBuf != tColl->stackWritableBuffer) {
7802 tColl->flags |= UCOL_ITER_ALLOCATED;
7803 }
7804 }
7805 }
7806
7807 if (sLen == -1 && tLen == -1) {
7808 comparison = u_strcmpCodePointOrder(sBuf, tBuf);
7809 } else {
7810 if (sLen == -1) {
7811 sLen = u_strlen(sBuf);
7812 }
7813 if (tLen == -1) {
7814 tLen = u_strlen(tBuf);
7815 }
7816 comparison = u_memcmpCodePointOrder(sBuf, tBuf, uprv_min(sLen, tLen));
7817 if (comparison == 0) {
7818 comparison = sLen - tLen;
7819 }
7820 }
7821 }
7822
7823 if (comparison < 0) {
7824 return UCOL_LESS;
7825 } else if (comparison == 0) {
7826 return UCOL_EQUAL;
7827 } else /* comparison > 0 */ {
7828 return UCOL_GREATER;
7829 }
7830}
7831
/* CEBuf - A struct and some inline functions to handle the saving  */
/*         of CEs in a buffer within ucol_strcoll                   */

/* number of CEs that fit into the array embedded in ucol_CEBuf */
#define UCOL_CEBUF_SIZE 512

struct ucol_CEBuf {
    uint32_t *buf;                          /* first element of the current storage */
    uint32_t *endp;                         /* one past the last usable element */
    uint32_t *pos;                          /* next element to be written */
    uint32_t  localArray[UCOL_CEBUF_SIZE];  /* initial storage inside the struct */
};
typedef struct ucol_CEBuf ucol_CEBuf;
7842
7843
7844static
7845inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
7846 (b)->buf = (b)->pos = (b)->localArray;
7847 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
7848};
7849
7850static
7851void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci) {
7852 uint32_t oldSize;
7853 uint32_t newSize;
7854 uint32_t *newBuf;
7855
7856 ci->flags |= UCOL_ITER_ALLOCATED;
7857 oldSize = b->pos - b->buf;
7858 newSize = oldSize * 2;
7859 newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
7860 if(newBuf != NULL) {
7861 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7862 if (b->buf != b->localArray) {
7863 uprv_free(b->buf);
7864 }
7865 b->buf = newBuf;
7866 b->endp = b->buf + newSize;
7867 b->pos = b->buf + oldSize;
7868 }
7869}
7870
7871static
7872inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci) {
7873 if (b->pos == b->endp) {
7874 ucol_CEBuf_Expand(b, ci);
7875}
7876 *(b)->pos++ = ce;
7877};
7878
7879/* This is a trick string compare function that goes in and uses sortkeys to compare */
7880/* It is used when compare gets in trouble and needs to bail out */
7881static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7882 collIterate *tColl)
7883{
7884 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7885 uint8_t *sourceKeyP = sourceKey;
7886 uint8_t *targetKeyP = targetKey;
7887 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7888 const UCollator *coll = sColl->coll;
7889 UChar *source = NULL;
7890 UChar *target = NULL;
7891 UChar sStackBuf[256], tStackBuf[256];
7892 int32_t sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7893 int32_t targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7894
7895 // TODO: Handle long strings. Do the same in ucol_checkIdent.
7896 if(sColl->flags & UCOL_USE_ITERATOR) {
7897 sColl->iterator->move(sColl->iterator, 0, UITER_START);
7898 tColl->iterator->move(tColl->iterator, 0, UITER_START);
7899 source = sStackBuf;
7900 UChar *sBufp = source;
7901 target = tStackBuf;
7902 UChar *tBufp = target;
7903 while(sColl->iterator->hasNext(sColl->iterator)) {
7904 *sBufp++ = (UChar)sColl->iterator->next(sColl->iterator);
7905 }
7906 while(tColl->iterator->hasNext(tColl->iterator)) {
7907 *tBufp++ = (UChar)tColl->iterator->next(tColl->iterator);
7908 }
7909 sourceLength = sBufp - source;
7910 targetLength = tBufp - target;
7911 } else { // no iterators
7912 sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7913 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7914 source = sColl->string;
7915 target = tColl->string;
7916 }
7917
7918
7919
7920 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7921 if(sourceKeyLen > UCOL_MAX_BUFFER) {
7922 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7923 if(sourceKeyP != NULL) {
7924 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7925 }
7926 }
7927
7928 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7929 if(targetKeyLen > UCOL_MAX_BUFFER) {
7930 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7931 if(targetKeyP != NULL) {
7932 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7933 }
7934 }
7935
7936 int32_t result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7937
7938 if(sourceKeyP != sourceKey) {
7939 uprv_free(sourceKeyP);
7940 }
7941
7942 if(targetKeyP != targetKey) {
7943 uprv_free(targetKeyP);
7944 }
7945
7946 if(result<0) {
7947 return UCOL_LESS;
7948 } else if(result>0) {
7949 return UCOL_GREATER;
7950 } else {
7951 return UCOL_EQUAL;
7952 }
7953}
7954
7955
374ca955 7956static inline UCollationResult
b75a7d8f
A
7957ucol_strcollRegular( collIterate *sColl, collIterate *tColl,
7958// const UCollator *coll,
7959// const UChar *source,
7960// int32_t sourceLength,
7961// const UChar *target,
7962// int32_t targetLength,
7963 UErrorCode *status)
7964{
7965 U_ALIGN_CODE(16);
7966
7967 const UCollator *coll = sColl->coll;
7968
7969
7970 // setting up the collator parameters
7971 UColAttributeValue strength = coll->strength;
7972 UBool initialCheckSecTer = (strength >= UCOL_SECONDARY);
7973
7974 UBool checkSecTer = initialCheckSecTer;
7975 UBool checkTertiary = (strength >= UCOL_TERTIARY);
7976 UBool checkQuad = (strength >= UCOL_QUATERNARY);
7977 UBool checkIdent = (strength == UCOL_IDENTICAL);
7978 UBool checkCase = (coll->caseLevel == UCOL_ON);
7979 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7980 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7981 UBool qShifted = shifted && checkQuad;
7982 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7983
7984 if(doHiragana && shifted) {
7985 return (ucol_compareUsingSortKeys(sColl, tColl));
7986 }
7987 uint8_t caseSwitch = coll->caseSwitch;
7988 uint8_t tertiaryMask = coll->tertiaryMask;
7989
7990 // This is the lowest primary value that will not be ignored if shifted
7991 uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7992
7993 UCollationResult result = UCOL_EQUAL;
7994 UCollationResult hirResult = UCOL_EQUAL;
7995
7996 // Preparing the CE buffers. They will be filled during the primary phase
7997 ucol_CEBuf sCEs;
7998 ucol_CEBuf tCEs;
7999 UCOL_INIT_CEBUF(&sCEs);
8000 UCOL_INIT_CEBUF(&tCEs);
8001
8002 uint32_t secS = 0, secT = 0;
8003 uint32_t sOrder=0, tOrder=0;
8004
8005 // Non shifted primary processing is quite simple
8006 if(!shifted) {
8007 for(;;) {
8008
8009 // We fetch CEs until we hit a non ignorable primary or end.
8010 do {
8011 // We get the next CE
8012 sOrder = ucol_IGetNextCE(coll, sColl, status);
8013 // Stuff it in the buffer
8014 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
8015 // And keep just the primary part.
8016 sOrder &= UCOL_PRIMARYMASK;
8017 } while(sOrder == 0);
8018
8019 // see the comments on the above block
8020 do {
8021 tOrder = ucol_IGetNextCE(coll, tColl, status);
8022 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
8023 tOrder &= UCOL_PRIMARYMASK;
8024 } while(tOrder == 0);
8025
8026 // if both primaries are the same
8027 if(sOrder == tOrder) {
8028 // and there are no more CEs, we advance to the next level
8029 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
8030 break;
8031 }
8032 if(doHiragana && hirResult == UCOL_EQUAL) {
8033 if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
374ca955 8034 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
b75a7d8f
A
8035 ? UCOL_LESS:UCOL_GREATER;
8036 }
8037 }
8038 } else {
8039 // if two primaries are different, we are done
8040 result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER;
8041 goto commonReturn;
8042 }
8043 } // no primary difference... do the rest from the buffers
8044 } else { // shifted - do a slightly more complicated processing :)
8045 for(;;) {
8046 UBool sInShifted = FALSE;
8047 UBool tInShifted = FALSE;
8048 // This version of code can be refactored. However, it seems easier to understand this way.
8049 // Source loop. Sam as the target loop.
8050 for(;;) {
8051 sOrder = ucol_IGetNextCE(coll, sColl, status);
8052 if(sOrder == UCOL_NO_MORE_CES) {
8053 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
8054 break;
374ca955
A
8055 } else if(sOrder == 0
8056 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
b75a7d8f
A
8057 /* UCA amendment - ignore ignorables that follow shifted code points */
8058 continue;
8059 } else if(isContinuation(sOrder)) {
8060 if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
8061 if(sInShifted) {
8062 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
8063 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
8064 continue;
8065 } else {
8066 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
8067 break;
8068 }
8069 } else { /* Just lower level values */
8070 if(sInShifted) {
8071 continue;
8072 } else {
8073 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
8074 continue;
8075 }
8076 }
8077 } else { /* regular */
8078 if((sOrder & UCOL_PRIMARYMASK) > LVT) {
8079 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
8080 break;
8081 } else {
8082 if((sOrder & UCOL_PRIMARYMASK) > 0) {
8083 sInShifted = TRUE;
8084 sOrder &= UCOL_PRIMARYMASK;
8085 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
8086 continue;
8087 } else {
8088 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
8089 sInShifted = FALSE;
8090 continue;
8091 }
8092 }
8093 }
8094 }
8095 sOrder &= UCOL_PRIMARYMASK;
8096 sInShifted = FALSE;
8097
8098 for(;;) {
8099 tOrder = ucol_IGetNextCE(coll, tColl, status);
8100 if(tOrder == UCOL_NO_MORE_CES) {
8101 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
8102 break;
8103 } else if(tOrder == 0
374ca955 8104 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
b75a7d8f
A
8105 /* UCA amendment - ignore ignorables that follow shifted code points */
8106 continue;
8107 } else if(isContinuation(tOrder)) {
8108 if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
8109 if(tInShifted) {
8110 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
8111 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
8112 continue;
8113 } else {
8114 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
8115 break;
8116 }
8117 } else { /* Just lower level values */
8118 if(tInShifted) {
8119 continue;
8120 } else {
8121 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
8122 continue;
8123 }
8124 }
8125 } else { /* regular */
8126 if((tOrder & UCOL_PRIMARYMASK) > LVT) {
8127 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
8128 break;
8129 } else {
8130 if((tOrder & UCOL_PRIMARYMASK) > 0) {
8131 tInShifted = TRUE;
8132 tOrder &= UCOL_PRIMARYMASK;
8133 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
8134 continue;
8135 } else {
8136 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
8137 tInShifted = FALSE;
8138 continue;
8139 }
8140 }
8141 }
8142 }
8143 tOrder &= UCOL_PRIMARYMASK;
8144 tInShifted = FALSE;
8145
8146 if(sOrder == tOrder) {
8147 /*
8148 if(doHiragana && hirResult == UCOL_EQUAL) {
8149 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
374ca955 8150 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
b75a7d8f
A
8151 ? UCOL_LESS:UCOL_GREATER;
8152 }
8153 }
8154 */
8155 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
8156 break;
8157 } else {
8158 sOrder = 0; tOrder = 0;
8159 continue;
8160 }
8161 } else {
8162 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
8163 goto commonReturn;
8164 }
8165 } /* no primary difference... do the rest from the buffers */
8166 }
8167
8168 /* now, we're gonna reexamine collected CEs */
8169 uint32_t *sCE;
8170 uint32_t *tCE;
8171
8172 /* This is the secondary level of comparison */
8173 if(checkSecTer) {
8174 if(!isFrenchSec) { /* normal */
8175 sCE = sCEs.buf;
8176 tCE = tCEs.buf;
8177 for(;;) {
8178 while (secS == 0) {
8179 secS = *(sCE++) & UCOL_SECONDARYMASK;
8180 }
8181
8182 while(secT == 0) {
8183 secT = *(tCE++) & UCOL_SECONDARYMASK;
8184 }
8185
8186 if(secS == secT) {
8187 if(secS == UCOL_NO_MORE_CES_SECONDARY) {
8188 break;
8189 } else {
8190 secS = 0; secT = 0;
8191 continue;
8192 }
8193 } else {
8194 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
8195 goto commonReturn;
8196 }
8197 }
8198 } else { /* do the French */
8199 uint32_t *sCESave = NULL;
8200 uint32_t *tCESave = NULL;
8201 sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
8202 tCE = tCEs.pos-2;
8203 for(;;) {
8204 while (secS == 0 && sCE >= sCEs.buf) {
8205 if(sCESave == 0) {
8206 secS = *(sCE--);
8207 if(isContinuation(secS)) {
8208 while(isContinuation(secS = *(sCE--)));
8209 /* after this, secS has the start of continuation, and sCEs points before that */
8210 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
8211 sCE+=2; /* need to point to the first continuation CP */
8212 /* However, now you can just continue doing stuff */
8213 }
8214 } else {
8215 secS = *(sCE++);
8216 if(!isContinuation(secS)) { /* This means we have finished with this cont */
8217 sCE = sCESave; /* reset the pointer to before continuation */
8218 sCESave = 0;
8219 continue;
8220 }
8221 }
8222 secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
8223 }
8224
8225 while(secT == 0 && tCE >= tCEs.buf) {
8226 if(tCESave == 0) {
8227 secT = *(tCE--);
8228 if(isContinuation(secT)) {
8229 while(isContinuation(secT = *(tCE--)));
8230 /* after this, secS has the start of continuation, and sCEs points before that */
8231 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
8232 tCE+=2; /* need to point to the first continuation CP */
8233 /* However, now you can just continue doing stuff */
8234 }
8235 } else {
8236 secT = *(tCE++);
8237 if(!isContinuation(secT)) { /* This means we have finished with this cont */
8238 tCE = tCESave; /* reset the pointer to before continuation */
8239 tCESave = 0;
8240 continue;
8241 }
8242 }
8243 secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
8244 }
8245
8246 if(secS == secT) {
8247 if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
8248 break;
8249 } else {
8250 secS = 0; secT = 0;
8251 continue;
8252 }
8253 } else {
8254 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
8255 goto commonReturn;
8256 }
8257 }
8258 }
8259 }
8260
8261 /* doing the case bit */
8262 if(checkCase) {
8263 sCE = sCEs.buf;
8264 tCE = tCEs.buf;
8265 for(;;) {
8266 while((secS & UCOL_REMOVE_CASE) == 0) {
8267 if(!isContinuation(*sCE++)) {
8268 secS =*(sCE-1) & UCOL_TERT_CASE_MASK;
8269 secS ^= caseSwitch;
8270 } else {
8271 secS = 0;
8272 }
8273 }
8274
8275 while((secT & UCOL_REMOVE_CASE) == 0) {
8276 if(!isContinuation(*tCE++)) {
8277 secT = *(tCE-1) & UCOL_TERT_CASE_MASK;
8278 secT ^= caseSwitch;
8279 } else {
8280 secT = 0;
8281 }
8282 }
8283
8284 if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
8285 result = UCOL_LESS;
8286 goto commonReturn;
8287 } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
8288 result = UCOL_GREATER;
8289 goto commonReturn;
8290 }
8291
8292 if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
8293 break;
8294 } else {
8295 secS = 0;
8296 secT = 0;
8297 }
8298 }
8299 }
8300
8301 /* Tertiary level */
8302 if(checkTertiary) {
8303 secS = 0;
8304 secT = 0;
8305 sCE = sCEs.buf;
8306 tCE = tCEs.buf;
8307 for(;;) {
8308 while((secS & UCOL_REMOVE_CASE) == 0) {
8309 secS = *(sCE++) & tertiaryMask;
8310 if(!isContinuation(secS)) {
8311 secS ^= caseSwitch;
8312 } else {
8313 secS &= UCOL_REMOVE_CASE;
8314 }
8315 }
8316
8317 while((secT & UCOL_REMOVE_CASE) == 0) {
8318 secT = *(tCE++) & tertiaryMask;
8319 if(!isContinuation(secT)) {
8320 secT ^= caseSwitch;
8321 } else {
8322 secT &= UCOL_REMOVE_CASE;
8323 }
8324 }
8325
8326 if(secS == secT) {
8327 if((secS & UCOL_REMOVE_CASE) == 1) {
8328 break;
8329 } else {
8330 secS = 0; secT = 0;
8331 continue;
8332 }
8333 } else {
8334 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
8335 goto commonReturn;
8336 }
8337 }
8338 }
8339
8340
8341 if(qShifted /*checkQuad*/) {
8342 UBool sInShifted = TRUE;
8343 UBool tInShifted = TRUE;
8344 secS = 0;
8345 secT = 0;
8346 sCE = sCEs.buf;
8347 tCE = tCEs.buf;
8348 for(;;) {
8349 while(secS == 0 && secS != UCOL_NO_MORE_CES || (isContinuation(secS) && !sInShifted)) {
8350 secS = *(sCE++);
8351 if(isContinuation(secS)) {
8352 if(!sInShifted) {
8353 continue;
8354 }
8355 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
8356 secS = UCOL_PRIMARYMASK;
8357 sInShifted = FALSE;
8358 } else {
8359 sInShifted = TRUE;
8360 }
8361 }
8362 secS &= UCOL_PRIMARYMASK;
8363
8364
8365 while(secT == 0 && secT != UCOL_NO_MORE_CES || (isContinuation(secT) && !tInShifted)) {
8366 secT = *(tCE++);
8367 if(isContinuation(secT)) {
8368 if(!tInShifted) {
8369 continue;
8370 }
8371 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
8372 secT = UCOL_PRIMARYMASK;
8373 tInShifted = FALSE;
8374 } else {
8375 tInShifted = TRUE;
8376 }
8377 }
8378 secT &= UCOL_PRIMARYMASK;
8379
8380 if(secS == secT) {
8381 if(secS == UCOL_NO_MORE_CES_PRIMARY) {
8382 break;
8383 } else {
8384 secS = 0; secT = 0;
8385 continue;
8386 }
8387 } else {
8388 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
8389 goto commonReturn;
8390 }
8391 }
8392 } else if(doHiragana && hirResult != UCOL_EQUAL) {
8393 // If we're fine on quaternaries, we might be different
8394 // on Hiragana. This, however, might fail us in shifted.
8395 result = hirResult;
8396 goto commonReturn;
8397 }
8398
8399 /* For IDENTICAL comparisons, we use a bitwise character comparison */
8400 /* as a tiebreaker if all else is equal. */
8401 /* Getting here should be quite rare - strings are not identical - */
8402 /* that is checked first, but compared == through all other checks. */
8403 if(checkIdent)
8404 {
8405 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
8406 result = ucol_checkIdent(sColl, tColl, TRUE, status);
8407 }
8408
8409commonReturn:
8410 if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
8411 freeHeapWritableBuffer(sColl);
8412 freeHeapWritableBuffer(tColl);
8413
8414 if (sCEs.buf != sCEs.localArray ) {
8415 uprv_free(sCEs.buf);
8416 }
8417 if (tCEs.buf != tCEs.localArray ) {
8418 uprv_free(tCEs.buf);
8419 }
8420 }
8421
8422 return result;
8423}
8424
8425
374ca955
A
8426static inline uint32_t
8427ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
b75a7d8f
A
8428 uint32_t CE, const UChar *s, int32_t *index, int32_t len) {
8429 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
8430 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
8431 int32_t offset = 1;
8432 UChar schar = 0, tchar = 0;
8433
8434 for(;;) {
8435 if(len == -1) {
8436 if(s[*index] == 0) { // end of string
8437 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8438 } else {
8439 schar = s[*index];
8440 }
8441 } else {
8442 if(*index == len) {
8443 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8444 } else {
8445 schar = s[*index];
8446 }
8447 }
8448
8449 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
8450 offset++;
8451 }
8452
8453 if (schar == tchar) {
8454 (*index)++;
8455 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
8456 }
8457 else
8458 {
8459 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
8460 return UCOL_BAIL_OUT_CE;
8461 }
8462 // skip completely ignorables
8463 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
8464 if(isZeroCE == 0) { // we have to ignore completely ignorables
8465 (*index)++;
8466 continue;
8467 }
8468
8469 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8470 }
8471 }
8472}
8473
8474
374ca955
A
8475/**
8476 * This is a fast strcoll, geared towards text in Latin-1.
b75a7d8f
A
8477 * It supports contractions of size two, French secondaries
8478 * and case switching. You can use it with strengths primary
8479 * to tertiary. It does not support shifted and case level.
8480 * It relies on the table build by setupLatin1Table. If it
8481 * doesn't understand something, it will go to the regular
374ca955 8482 * strcoll.
b75a7d8f 8483 */
374ca955 8484static inline UCollationResult
b75a7d8f
A
8485ucol_strcollUseLatin1( const UCollator *coll,
8486 const UChar *source,
8487 int32_t sLen,
8488 const UChar *target,
8489 int32_t tLen,
374ca955 8490 UErrorCode *status)
b75a7d8f
A
8491{
8492 U_ALIGN_CODE(16);
8493 int32_t strength = coll->strength;
8494
8495 int32_t sIndex = 0, tIndex = 0;
8496 UChar sChar = 0, tChar = 0;
8497 uint32_t sOrder=0, tOrder=0;
8498
8499 UBool endOfSource = FALSE, endOfTarget = FALSE;
8500
8501 uint32_t *elements = coll->latinOneCEs;
8502
8503 UBool haveContractions = FALSE; // if we have contractions in our string
8504 // we cannot do French secondary
8505
8506 // Do the primary level
8507 for(;;) {
8508 while(sOrder==0) { // this loop skips primary ignorables
8509 // sOrder=getNextlatinOneCE(source);
8510 if(sLen==-1) { // handling zero terminated strings
8511 sChar=source[sIndex++];
8512 if(sChar==0) {
8513 endOfSource = TRUE;
8514 break;
8515 }
8516 } else { // handling strings with known length
8517 if(sIndex==sLen) {
8518 endOfSource = TRUE;
8519 break;
8520 }
8521 sChar=source[sIndex++];
8522 }
8523 if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8524 //fprintf(stderr, "R");
8525 goto returnRegular;
8526 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8527 }
8528 sOrder = elements[sChar];
8529 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
8530 // specials can basically be either contractions or bail-out signs. If we get anything
8531 // else, we'll bail out anywasy
8532 if(getCETag(sOrder) == CONTRACTION_TAG) {
8533 sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
8534 haveContractions = TRUE; // if there are contractions, we cannot do French secondary
8535 // However, if there are contractions in the table, but we always use just one char,
8536 // we might be able to do French. This should be checked out.
8537 }
8538 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8539 //fprintf(stderr, "S");
8540 goto returnRegular;
8541 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8542 }
8543 }
8544 }
8545
8546 while(tOrder==0) { // this loop skips primary ignorables
8547 // tOrder=getNextlatinOneCE(target);
8548 if(tLen==-1) { // handling zero terminated strings
8549 tChar=target[tIndex++];
8550 if(tChar==0) {
374ca955 8551 if(endOfSource) { // this is different than source loop,
b75a7d8f
A
8552 // as we already know that source loop is done here,
8553 // so we can either finish the primary loop if both
374ca955 8554 // strings are done or anounce the result if only
b75a7d8f
A
8555 // target is done. Same below.
8556 goto endOfPrimLoop;
8557 } else {
8558 return UCOL_GREATER;
8559 }
8560 }
8561 } else { // handling strings with known length
8562 if(tIndex==tLen) {
8563 if(endOfSource) {
8564 goto endOfPrimLoop;
8565 } else {
8566 return UCOL_GREATER;
8567 }
8568 }
8569 tChar=target[tIndex++];
8570 }
8571 if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8572 //fprintf(stderr, "R");
8573 goto returnRegular;
8574 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8575 }
8576 tOrder = elements[tChar];
8577 if(tOrder >= UCOL_NOT_FOUND) {
8578 // Handling specials, see the comments for source
8579 if(getCETag(tOrder) == CONTRACTION_TAG) {
8580 tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
8581 haveContractions = TRUE;
8582 }
8583 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8584 //fprintf(stderr, "S");
8585 goto returnRegular;
8586 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8587 }
8588 }
8589 }
8590 if(endOfSource) { // source is finished, but target is not, say the result.
8591 return UCOL_LESS;
8592 }
8593
8594 if(sOrder == tOrder) { // if we have same CEs, we continue the loop
8595 sOrder = 0; tOrder = 0;
8596 continue;
8597 } else {
8598 // compare current top bytes
8599 if(((sOrder^tOrder)&0xFF000000)!=0) {
8600 // top bytes differ, return difference
8601 if(sOrder < tOrder) {
8602 return UCOL_LESS;
8603 } else if(sOrder > tOrder) {
8604 return UCOL_GREATER;
8605 }
8606 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
8607 // since we must return enum value
8608 }
8609
8610 // top bytes match, continue with following bytes
8611 sOrder<<=8;
8612 tOrder<<=8;
374ca955 8613 }
b75a7d8f
A
8614 }
8615
8616endOfPrimLoop:
374ca955 8617 // after primary loop, we definitely know the sizes of strings,
b75a7d8f
A
8618 // so we set it and use simpler loop for secondaries and tertiaries
8619 sLen = sIndex; tLen = tIndex;
8620 if(strength >= UCOL_SECONDARY) {
8621 // adjust the table beggining
8622 elements += coll->latinOneTableLen;
8623 endOfSource = FALSE; endOfTarget = FALSE;
8624
8625 if(coll->frenchCollation == UCOL_OFF) { // non French
8626 // This loop is a simplified copy of primary loop
374ca955
A
8627 // at this point we know that whole strings are latin-1, so we don't
8628 // check for that. We also know that we only have contractions as
b75a7d8f
A
8629 // specials.
8630 sIndex = 0; tIndex = 0;
8631 for(;;) {
8632 while(sOrder==0) {
8633 if(sIndex==sLen) {
8634 endOfSource = TRUE;
8635 break;
8636 }
8637 sChar=source[sIndex++];
8638 sOrder = elements[sChar];
8639 if(sOrder > UCOL_NOT_FOUND) {
8640 sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
8641 }
8642 }
8643
8644 while(tOrder==0) {
8645 if(tIndex==tLen) {
8646 if(endOfSource) {
8647 goto endOfSecLoop;
8648 } else {
8649 return UCOL_GREATER;
8650 }
8651 }
8652 tChar=target[tIndex++];
8653 tOrder = elements[tChar];
8654 if(tOrder > UCOL_NOT_FOUND) {
8655 tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
8656 }
8657 }
8658 if(endOfSource) {
8659 return UCOL_LESS;
8660 }
8661
8662 if(sOrder == tOrder) {
8663 sOrder = 0; tOrder = 0;
8664 continue;
8665 } else {
8666 // see primary loop for comments on this
8667 if(((sOrder^tOrder)&0xFF000000)!=0) {
8668 if(sOrder < tOrder) {
8669 return UCOL_LESS;
8670 } else if(sOrder > tOrder) {
8671 return UCOL_GREATER;
8672 }
8673 }
8674 sOrder<<=8;
8675 tOrder<<=8;
374ca955 8676 }
b75a7d8f
A
8677 }
8678 } else { // French
8679 if(haveContractions) { // if we have contractions, we have to bail out
8680 // since we don't really know how to handle them here
8681 goto returnRegular;
8682 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8683 }
8684 // For French, we go backwards
8685 sIndex = sLen; tIndex = tLen;
8686 for(;;) {
8687 while(sOrder==0) {
8688 if(sIndex==0) {
8689 endOfSource = TRUE;
8690 break;
8691 }
8692 sChar=source[--sIndex];
8693 sOrder = elements[sChar];
8694 // don't even look for contractions
8695 }
8696
8697 while(tOrder==0) {
8698 if(tIndex==0) {
8699 if(endOfSource) {
8700 goto endOfSecLoop;
8701 } else {
8702 return UCOL_GREATER;
8703 }
8704 }
8705 tChar=target[--tIndex];
8706 tOrder = elements[tChar];
8707 // don't even look for contractions
8708 }
8709 if(endOfSource) {
8710 return UCOL_LESS;
8711 }
8712
8713 if(sOrder == tOrder) {
8714 sOrder = 0; tOrder = 0;
8715 continue;
8716 } else {
8717 // see the primary loop for comments
8718 if(((sOrder^tOrder)&0xFF000000)!=0) {
8719 if(sOrder < tOrder) {
8720 return UCOL_LESS;
8721 } else if(sOrder > tOrder) {
8722 return UCOL_GREATER;
8723 }
8724 }
8725 sOrder<<=8;
8726 tOrder<<=8;
374ca955 8727 }
b75a7d8f
A
8728 }
8729 }
374ca955 8730 }
b75a7d8f
A
8731
8732endOfSecLoop:
8733 if(strength >= UCOL_TERTIARY) {
8734 // tertiary loop is the same as secondary (except no French)
8735 elements += coll->latinOneTableLen;
8736 sIndex = 0; tIndex = 0;
8737 endOfSource = FALSE; endOfTarget = FALSE;
8738 for(;;) {
8739 while(sOrder==0) {
8740 if(sIndex==sLen) {
8741 endOfSource = TRUE;
8742 break;
8743 }
8744 sChar=source[sIndex++];
8745 sOrder = elements[sChar];
8746 if(sOrder > UCOL_NOT_FOUND) {
8747 sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
8748 }
8749 }
8750 while(tOrder==0) {
8751 if(tIndex==tLen) {
8752 if(endOfSource) {
8753 return UCOL_EQUAL; // if both strings are at the end, they are equal
8754 } else {
8755 return UCOL_GREATER;
8756 }
8757 }
8758 tChar=target[tIndex++];
8759 tOrder = elements[tChar];
8760 if(tOrder > UCOL_NOT_FOUND) {
8761 tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
8762 }
8763 }
8764 if(endOfSource) {
8765 return UCOL_LESS;
8766 }
8767 if(sOrder == tOrder) {
8768 sOrder = 0; tOrder = 0;
8769 continue;
8770 } else {
8771 if(((sOrder^tOrder)&0xff000000)!=0) {
8772 if(sOrder < tOrder) {
8773 return UCOL_LESS;
8774 } else if(sOrder > tOrder) {
8775 return UCOL_GREATER;
8776 }
8777 }
8778 sOrder<<=8;
8779 tOrder<<=8;
374ca955 8780 }
b75a7d8f 8781 }
374ca955 8782 }
b75a7d8f
A
8783 return UCOL_EQUAL;
8784
8785returnRegular:
8786 // Preparing the context objects for iterating over strings
8787 collIterate sColl, tColl;
8788
8789 IInit_collIterate(coll, source, sLen, &sColl);
8790 IInit_collIterate(coll, target, tLen, &tColl);
374ca955 8791 return ucol_strcollRegular(&sColl, &tColl, status);
b75a7d8f
A
8792}
8793
8794
8795U_CAPI UCollationResult U_EXPORT2
8796ucol_strcollIter( const UCollator *coll,
8797 UCharIterator *sIter,
8798 UCharIterator *tIter,
8799 UErrorCode *status) {
374ca955
A
8800 if(!status || U_FAILURE(*status)) {
8801 return UCOL_EQUAL;
8802 }
8803
8804 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
8805 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
8806
8807 if (sIter == tIter) {
8808 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
b75a7d8f
A
8809 return UCOL_EQUAL;
8810 }
8811 if(sIter == NULL || tIter == NULL || coll == NULL) {
8812 *status = U_ILLEGAL_ARGUMENT_ERROR;
374ca955 8813 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
b75a7d8f
A
8814 return UCOL_EQUAL;
8815 }
8816
8817 UCollationResult result = UCOL_EQUAL;
8818
8819 // Preparing the context objects for iterating over strings
8820 collIterate sColl, tColl;
8821 // The division for the array length may truncate the array size to
8822 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8823 // for all platforms anyway.
8824 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8825 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8826 UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8827
8828 IInit_collIterate(coll, NULL, -1, &sColl);
8829 sColl.iterator = sIter;
8830 sColl.flags |= UCOL_USE_ITERATOR;
8831 IInit_collIterate(coll, NULL, -1, &tColl);
8832 tColl.flags |= UCOL_USE_ITERATOR;
8833 tColl.iterator = tIter;
8834
8835 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8836 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8837 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8838 sColl.flags &= ~UCOL_ITER_NORM;
8839
8840 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8841 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8842 tColl.flags &= ~UCOL_ITER_NORM;
8843 }
8844
8845 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
374ca955
A
8846
8847 while((sChar = sColl.iterator->next(sColl.iterator)) ==
b75a7d8f 8848 (tChar = tColl.iterator->next(tColl.iterator))) {
374ca955
A
8849 if(UCOL_ISTHAIPREVOWEL(sChar)) {
8850 break;
8851 }
b75a7d8f
A
8852 if(sChar == U_SENTINEL) {
8853 result = UCOL_EQUAL;
8854 goto end_compare;
8855 }
8856 }
8857
8858 if(sChar == U_SENTINEL) {
8859 tChar = tColl.iterator->previous(tColl.iterator);
8860 }
8861
8862 if(tChar == U_SENTINEL) {
8863 sChar = sColl.iterator->previous(sColl.iterator);
8864 }
8865
8866 sChar = sColl.iterator->previous(sColl.iterator);
8867 tChar = tColl.iterator->previous(tColl.iterator);
8868
8869 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8870 {
8871 // We are stopped in the middle of a contraction.
8872 // Scan backwards through the == part of the string looking for the start of the contraction.
8873 // It doesn't matter which string we scan, since they are the same in this region.
8874 do
8875 {
8876 sChar = sColl.iterator->previous(sColl.iterator);
8877 tChar = tColl.iterator->previous(tColl.iterator);
8878 }
8879 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8880 }
8881
8882
8883 if(U_SUCCESS(*status)) {
8884 result = ucol_strcollRegular(&sColl, &tColl, status);
8885 }
8886
8887end_compare:
8888 if(sNormIter || tNormIter) {
8889 unorm_closeIter(sNormIter);
8890 unorm_closeIter(tNormIter);
8891 }
8892
374ca955 8893 UTRACE_EXIT_VALUE_STATUS(result, *status)
b75a7d8f
A
8894 return result;
8895}
8896
8897
8898
8899/* */
8900/* ucol_strcoll Main public API string comparison function */
8901/* */
8902U_CAPI UCollationResult U_EXPORT2
8903ucol_strcoll( const UCollator *coll,
8904 const UChar *source,
8905 int32_t sourceLength,
8906 const UChar *target,
8907 int32_t targetLength) {
8908 U_ALIGN_CODE(16);
374ca955
A
8909
8910 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8911 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8912 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8913 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
8914 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
8915 }
8916
b75a7d8f
A
8917 UErrorCode status = U_ZERO_ERROR;
8918 if(source == NULL || target == NULL) {
374ca955 8919 // do not crash, but return. Should have
b75a7d8f 8920 // status argument to return error.
374ca955 8921 UTRACE_EXIT_VALUE(UTRACE_UCOL_STRCOLL);
b75a7d8f
A
8922 return UCOL_EQUAL;
8923 }
8924 collIterate sColl, tColl;
8925
8926 /* Scan the strings. Find: */
8927 /* The length of any leading portion that is equal */
8928 /* Whether they are exactly equal. (in which case we just return) */
8929 const UChar *pSrc = source;
8930 const UChar *pTarg = target;
8931 int32_t equalLength;
8932
8933 if (sourceLength == -1 && targetLength == -1) {
8934 // Both strings are null terminated.
8935 // Check for them being the same string, and scan through
8936 // any leading equal portion.
8937 if (source==target) {
374ca955 8938 UTRACE_EXIT_VALUE(UCOL_EQUAL);
b75a7d8f
A
8939 return UCOL_EQUAL;
8940 }
8941
8942 for (;;) {
8943 if ( *pSrc != *pTarg || *pSrc == 0) {
8944 break;
8945 }
374ca955
A
8946 if(UCOL_ISTHAIPREVOWEL(*pSrc)) {
8947 break;
8948 }
b75a7d8f
A
8949 pSrc++;
8950 pTarg++;
8951 }
8952 if (*pSrc == 0 && *pTarg == 0) {
374ca955 8953 UTRACE_EXIT_VALUE(UCOL_EQUAL);
b75a7d8f
A
8954 return UCOL_EQUAL;
8955 }
8956 equalLength = pSrc - source;
8957 }
8958 else
8959 {
8960 // One or both strings has an explicit length.
8961 /* check if source and target are same strings */
8962
8963 if (source==target && sourceLength==targetLength) {
374ca955 8964 UTRACE_EXIT_VALUE(UCOL_EQUAL);
b75a7d8f
A
8965 return UCOL_EQUAL;
8966 }
8967 const UChar *pSrcEnd = source + sourceLength;
8968 const UChar *pTargEnd = target + targetLength;
8969
8970
8971 // Scan while the strings are bitwise ==, or until one is exhausted.
8972 for (;;) {
8973 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8974 break;
8975 }
8976 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8977 break;
8978 }
8979 if (*pSrc != *pTarg) {
8980 break;
8981 }
374ca955
A
8982 if(UCOL_ISTHAIPREVOWEL(*pSrc)) { // they are the same here, so any will do
8983 break;
8984 }
b75a7d8f
A
8985 pSrc++;
8986 pTarg++;
8987 }
8988 equalLength = pSrc - source;
8989
8990 // If we made it all the way through both strings, we are done. They are ==
8991 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. */
8992 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) { /* and also at end of dest string */
374ca955 8993 UTRACE_EXIT_VALUE(UCOL_EQUAL);
b75a7d8f
A
8994 return UCOL_EQUAL;
8995 }
8996 }
8997 if (equalLength > 0) {
8998 /* There is an identical portion at the beginning of the two strings. */
8999 /* If the identical portion ends within a contraction or a comibining */
9000 /* character sequence, back up to the start of that sequence. */
9001 pSrc = source + equalLength; /* point to the first differing chars */
9002 pTarg = target + equalLength;
9003 if (pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll) ||
9004 pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))
9005 {
9006 // We are stopped in the middle of a contraction.
9007 // Scan backwards through the == part of the string looking for the start of the contraction.
9008 // It doesn't matter which string we scan, since they are the same in this region.
9009 do
9010 {
9011 equalLength--;
9012 pSrc--;
9013 }
9014 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
9015 }
9016
9017 source += equalLength;
9018 target += equalLength;
9019 if (sourceLength > 0) {
9020 sourceLength -= equalLength;
9021 }
9022 if (targetLength > 0) {
9023 targetLength -= equalLength;
9024 }
9025 }
9026
374ca955 9027 UCollationResult returnVal;
b75a7d8f
A
9028 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
9029 // Preparing the context objects for iterating over strings
9030 IInit_collIterate(coll, source, sourceLength, &sColl);
9031 IInit_collIterate(coll, target, targetLength, &tColl);
374ca955 9032 returnVal = ucol_strcollRegular(&sColl, &tColl, &status);
b75a7d8f 9033 } else {
374ca955 9034 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
b75a7d8f 9035 }
374ca955
A
9036 UTRACE_EXIT_VALUE(returnVal);
9037 return returnVal;
b75a7d8f
A
9038}
9039
9040/* convenience function for comparing strings */
9041U_CAPI UBool U_EXPORT2
9042ucol_greater( const UCollator *coll,
9043 const UChar *source,
9044 int32_t sourceLength,
9045 const UChar *target,
9046 int32_t targetLength)
9047{
9048 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
9049 == UCOL_GREATER);
9050}
9051
9052/* convenience function for comparing strings */
9053U_CAPI UBool U_EXPORT2
9054ucol_greaterOrEqual( const UCollator *coll,
9055 const UChar *source,
9056 int32_t sourceLength,
9057 const UChar *target,
9058 int32_t targetLength)
9059{
9060 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
9061 != UCOL_LESS);
9062}
9063
9064/* convenience function for comparing strings */
9065U_CAPI UBool U_EXPORT2
9066ucol_equal( const UCollator *coll,
9067 const UChar *source,
9068 int32_t sourceLength,
9069 const UChar *target,
9070 int32_t targetLength)
9071{
9072 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
9073 == UCOL_EQUAL);
9074}
9075
9076/* returns the locale name the collation data comes from */
9077U_CAPI const char * U_EXPORT2
9078ucol_getLocale(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) {
374ca955
A
9079 return ucol_getLocaleByType(coll, type, status);
9080}
9081
9082U_CAPI const char * U_EXPORT2
9083ucol_getLocaleByType(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) {
b75a7d8f
A
9084 const char *result = NULL;
9085 if(status == NULL || U_FAILURE(*status)) {
9086 return NULL;
9087 }
374ca955
A
9088 UTRACE_ENTRY(UTRACE_UCOL_GETLOCALE);
9089 UTRACE_DATA1(UTRACE_INFO, "coll=%p", coll);
9090
b75a7d8f
A
9091 switch(type) {
9092 case ULOC_ACTUAL_LOCALE:
9093 // validLocale is set only if service registration has explicitly set the
9094 // requested and valid locales. if this is the case, the actual locale
9095 // is considered to be the valid locale.
9096 if (coll->validLocale != NULL) {
9097 result = coll->validLocale;
9098 } else if(coll->elements != NULL) {
9099 result = ures_getLocale(coll->elements, status);
9100 }
9101 break;
9102 case ULOC_VALID_LOCALE:
9103 if (coll->validLocale != NULL) {
9104 result = coll->validLocale;
9105 } else if(coll->rb != NULL) {
9106 result = ures_getLocale(coll->rb, status);
374ca955 9107 }
b75a7d8f
A
9108 break;
9109 case ULOC_REQUESTED_LOCALE:
9110 result = coll->requestedLocale;
9111 break;
9112 default:
9113 *status = U_ILLEGAL_ARGUMENT_ERROR;
9114 }
374ca955
A
9115 UTRACE_DATA1(UTRACE_INFO, "result = %s", result);
9116 UTRACE_EXIT_STATUS(*status);
b75a7d8f
A
9117 return result;
9118}
9119
9120U_CAPI USet * U_EXPORT2
374ca955 9121ucol_getTailoredSet(const UCollator *coll, UErrorCode *status)
b75a7d8f
A
9122{
9123 if(status == NULL || U_FAILURE(*status)) {
9124 return NULL;
9125 }
374ca955 9126 if(coll == NULL || coll->UCA == NULL) {
b75a7d8f
A
9127 *status = U_ILLEGAL_ARGUMENT_ERROR;
9128 }
9129 UParseError parseError;
9130 UColTokenParser src;
9131 int32_t rulesLen = 0;
9132 const UChar *rules = ucol_getRules(coll, &rulesLen);
9133 const UChar *current = NULL;
9134 UBool startOfRules = TRUE;
9135 // we internally use the C++ class, for the following reasons:
9136 // 1. we need to utilize canonical iterator, which is a C++ only class
9137 // 2. canonical iterator returns UnicodeStrings - USet cannot take them
9138 // 3. USet is internally really UnicodeSet, C is just a wrapper
9139 UnicodeSet *tailored = new UnicodeSet();
9140 UnicodeString pattern;
374ca955
A
9141 UnicodeString empty;
9142 CanonicalIterator it(empty, *status);
b75a7d8f
A
9143
9144
9145 // The idea is to tokenize the rule set. For each non-reset token,
374ca955
A
9146 // we add all the canonicaly equivalent FCD sequences
9147 ucol_tok_initTokenList(&src, rules, rulesLen, coll->UCA, status);
b75a7d8f
A
9148 while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError, status)) != NULL) {
9149 startOfRules = FALSE;
9150 if(src.parsedToken.strength != UCOL_TOK_RESET) {
9151 const UChar *stuff = src.source+(src.parsedToken.charsOffset);
9152 it.setSource(UnicodeString(stuff, src.parsedToken.charsLen), *status);
9153 pattern = it.next();
9154 while(!pattern.isBogus()) {
9155 if(Normalizer::quickCheck(pattern, UNORM_FCD, *status) != UNORM_NO) {
9156 tailored->add(pattern);
9157 }
9158 pattern = it.next();
9159 }
9160 }
9161 }
9162 ucol_tok_closeTokenList(&src);
9163 return (USet *)tailored;
9164}
9165
9166U_CAPI UBool U_EXPORT2
9167ucol_equals(const UCollator *source, const UCollator *target) {
9168 UErrorCode status = U_ZERO_ERROR;
9169 // if pointers are equal, collators are equal
9170 if(source == target) {
9171 return TRUE;
9172 }
9173 int32_t i = 0, j = 0;
9174 // if any of attributes are different, collators are not equal
9175 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
9176 if(ucol_getAttribute(source, (UColAttribute)i, &status) != ucol_getAttribute(target, (UColAttribute)i, &status) || U_FAILURE(status)) {
9177 return FALSE;
9178 }
9179 }
9180
9181 int32_t sourceRulesLen = 0, targetRulesLen = 0;
9182 const UChar *sourceRules = ucol_getRules(source, &sourceRulesLen);
9183 const UChar *targetRules = ucol_getRules(target, &targetRulesLen);
9184
9185 if(sourceRulesLen == targetRulesLen && u_strncmp(sourceRules, targetRules, sourceRulesLen) == 0) {
9186 // all the attributes are equal and the rules are equal - collators are equal
9187 return(TRUE);
374ca955 9188 }
b75a7d8f
A
9189 // hard part, need to construct tree from rules and see if they yield the same tailoring
9190 UBool result = TRUE;
9191 UParseError parseError;
9192 UColTokenParser sourceParser, targetParser;
9193 int32_t sourceListLen = 0, targetListLen = 0;
374ca955
A
9194 ucol_tok_initTokenList(&sourceParser, sourceRules, sourceRulesLen, source->UCA, &status);
9195 ucol_tok_initTokenList(&targetParser, targetRules, targetRulesLen, target->UCA, &status);
b75a7d8f
A
9196 sourceListLen = ucol_tok_assembleTokenList(&sourceParser, &parseError, &status);
9197 targetListLen = ucol_tok_assembleTokenList(&targetParser, &parseError, &status);
9198
9199 if(sourceListLen != targetListLen) {
9200 // different number of resets
9201 result = FALSE;
9202 } else {
9203 UColToken *sourceReset = NULL, *targetReset = NULL;
9204 UChar *sourceResetString = NULL, *targetResetString = NULL;
9205 int32_t sourceStringLen = 0, targetStringLen = 0;
9206 for(i = 0; i < sourceListLen; i++) {
9207 sourceReset = sourceParser.lh[i].reset;
9208 sourceResetString = sourceParser.source+(sourceReset->source & 0xFFFFFF);
9209 sourceStringLen = sourceReset->source >> 24;
9210 for(j = 0; j < sourceListLen; j++) {
9211 targetReset = targetParser.lh[j].reset;
9212 targetResetString = targetParser.source+(targetReset->source & 0xFFFFFF);
9213 targetStringLen = targetReset->source >> 24;
9214 if(sourceStringLen == targetStringLen && (u_strncmp(sourceResetString, targetResetString, sourceStringLen) == 0)) {
9215 sourceReset = sourceParser.lh[i].first;
9216 targetReset = targetParser.lh[j].first;
9217 while(sourceReset != NULL && targetReset != NULL) {
9218 sourceResetString = sourceParser.source+(sourceReset->source & 0xFFFFFF);
9219 sourceStringLen = sourceReset->source >> 24;
9220 targetResetString = targetParser.source+(targetReset->source & 0xFFFFFF);
9221 targetStringLen = targetReset->source >> 24;
9222 if(sourceStringLen != targetStringLen || (u_strncmp(sourceResetString, targetResetString, sourceStringLen) != 0)) {
9223 result = FALSE;
9224 goto returnResult;
9225 }
9226 // probably also need to check the expansions
9227 if(sourceReset->expansion) {
9228 if(!targetReset->expansion) {
9229 result = FALSE;
9230 goto returnResult;
9231 } else {
9232 // compare expansions
9233 sourceResetString = sourceParser.source+(sourceReset->expansion& 0xFFFFFF);
9234 sourceStringLen = sourceReset->expansion >> 24;
9235 targetResetString = targetParser.source+(targetReset->expansion & 0xFFFFFF);
9236 targetStringLen = targetReset->expansion >> 24;
9237 if(sourceStringLen != targetStringLen || (u_strncmp(sourceResetString, targetResetString, sourceStringLen) != 0)) {
9238 result = FALSE;
9239 goto returnResult;
9240 }
9241 }
9242 } else {
9243 if(targetReset->expansion) {
9244 result = FALSE;
9245 goto returnResult;
9246 }
9247 }
9248 sourceReset = sourceReset->next;
9249 targetReset = targetReset->next;
9250 }
9251 if(sourceReset != targetReset) { // at least one is not NULL
9252 // there are more tailored elements in one list
9253 result = FALSE;
9254 goto returnResult;
9255 }
9256
9257
9258 break;
9259 }
9260 }
9261 // couldn't find the reset anchor, so the collators are not equal
9262 if(j == sourceListLen) {
9263 result = FALSE;
9264 goto returnResult;
9265 }
9266 }
9267 }
9268
9269returnResult:
9270 ucol_tok_closeTokenList(&sourceParser);
9271 ucol_tok_closeTokenList(&targetParser);
9272 return result;
9273
9274}
374ca955
A
9275
9276U_CAPI void U_EXPORT2
9277ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
9278 if(coll && coll->UCA) {
9279 uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
9280 }
9281}
9282
9283U_CAPI int32_t U_EXPORT2
9284ucol_cloneBinary(const UCollator *coll,
9285 uint8_t *buffer, int32_t capacity,
9286 UErrorCode *status)
9287{
9288 int32_t length = 0;
9289 if(U_FAILURE(*status)) {
9290 return length;
9291 }
9292 if(coll->hasRealData == TRUE) {
9293 length = coll->image->size;
9294 if(length <= capacity) {
9295 uprv_memcpy(buffer, coll->image, length);
9296 }
9297 } else {
9298 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
9299 if(length <= capacity) {
9300 /* build the UCATableHeader with minimal entries */
9301 /* do not copy the header from the UCA file because its values are wrong! */
9302 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
9303
9304 /* reset everything */
9305 uprv_memset(buffer, 0, length);
9306
9307 /* set the tailoring-specific values */
9308 UCATableHeader *myData = (UCATableHeader *)buffer;
9309 myData->size = length;
9310
9311 /* offset for the options, the only part of the data that is present after the header */
9312 myData->options = sizeof(UCATableHeader);
9313
9314 /* need to always set the expansion value for an upper bound of the options */
9315 myData->expansion = myData->options + sizeof(UColOptionSet);
9316
9317 myData->magic = UCOL_HEADER_MAGIC;
9318 myData->isBigEndian = U_IS_BIG_ENDIAN;
9319 myData->charSetFamily = U_CHARSET_FAMILY;
9320
9321 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
9322 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
9323
9324 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
9325 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
9326 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
9327 myData->jamoSpecial = coll->image->jamoSpecial;
9328
9329 /* copy the collator options */
9330 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
9331 }
9332 }
9333 return length;
9334}
9335
9336U_CAPI UCollator* U_EXPORT2
9337ucol_openBinary(const uint8_t *bin, int32_t length,
9338 const UCollator *base,
9339 UErrorCode *status)
9340{
9341 UCollator *result = NULL;
9342 if(U_FAILURE(*status)){
9343 return NULL;
9344 }
9345 if(base == NULL) {
9346 // we don't support null base yet
9347 *status = U_ILLEGAL_ARGUMENT_ERROR;
9348 return NULL;
9349 }
9350 UCATableHeader *colData = (UCATableHeader *)bin;
9351 // do we want version check here? We're trying to figure out whether collators are compatible
9352 if(uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
9353 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0 ||
9354 colData->version[0] != UCOL_BUILDER_VERSION) {
9355 *status = U_COLLATOR_VERSION_MISMATCH;
9356 return NULL;
9357 } else {
9358 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
9359 result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
9360 if(U_FAILURE(*status)){
9361 return NULL;
9362 }
9363 result->hasRealData = TRUE;
9364 } else {
9365 if(base) {
9366 result = ucol_initCollator(base->image, result, base, status);
9367 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
9368 if(U_FAILURE(*status)){
9369 return NULL;
9370 }
9371 result->hasRealData = FALSE;
9372 } else {
9373 *status = U_USELESS_COLLATOR_ERROR;
9374 return NULL;
9375 }
9376 }
9377 result->freeImageOnClose = FALSE;
9378 }
9379 result->validLocale = NULL;
9380 result->requestedLocale = NULL;
9381 result->rules = NULL;
9382 result->rulesLength = 0;
9383 result->freeRulesOnClose = FALSE;
9384 result->rb = NULL;
9385 result->elements = NULL;
9386 return result;
9387}
9388
b75a7d8f 9389#endif /* #if !UCONFIG_NO_COLLATION */
374ca955 9390