]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/i18n/ucol.cpp
ICU-491.11.1.tar.gz
[apple/icu.git] / icuSources / i18n / ucol.cpp
... / ...
CommitLineData
1/*
2*******************************************************************************
3* Copyright (C) 1996-2012, International Business Machines
4* Corporation and others. All Rights Reserved.
5*******************************************************************************
6* file name: ucol.cpp
7* encoding: US-ASCII
8* tab size: 8 (not used)
9* indentation:4
10*
11* Modification history
12* Date Name Comments
13* 1996-1999 various members of ICU team maintained C API for collation framework
14* 02/16/2001 synwee Added internal method getPrevSpecialCE
15* 03/01/2001 synwee Added maxexpansion functionality.
16* 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant
17*/
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_COLLATION
22
23#include "unicode/bytestream.h"
24#include "unicode/coleitr.h"
25#include "unicode/unorm.h"
26#include "unicode/udata.h"
27#include "unicode/ustring.h"
28
29#include "ucol_imp.h"
30#include "bocsu.h"
31
32#include "normalizer2impl.h"
33#include "unorm_it.h"
34#include "umutex.h"
35#include "cmemory.h"
36#include "ucln_in.h"
37#include "cstring.h"
38#include "utracimp.h"
39#include "putilimp.h"
40#include "uassert.h"
41#include "unicode/coll.h"
42
43#ifdef UCOL_DEBUG
44#include <stdio.h>
45#endif
46
47U_NAMESPACE_USE
48
/*
 * Number of elements in a true array (must not be used on a pointer).
 * The whole expansion is parenthesized so that the macro behaves as a single
 * primary expression wherever it is substituted.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))

/* Mask that keeps only the lowest 8 bits of a value. */
#define LAST_BYTE_MASK_ 0xFF
/* Bit count used to shift the second-lowest byte into the lowest position. */
#define SECOND_LAST_BYTE_SHIFT_ 8

/* NOTE(review): presumably the limit below which code units have a zero
 * combining class; its use is not visible in this part of the file. */
#define ZERO_CC_LIMIT_ 0xC0
55
// This is a static pointer to the NFC implementation instance.
// It is always the same between calls to u_cleanup,
// and therefore writing to it is not synchronized.
// It is cleaned in ucol_cleanup.
60static const Normalizer2Impl *g_nfcImpl = NULL;
61
// These are values from UCA required for
// implicit generation and suppressing sort key compression.
// They should regularly be in the UCA, but if one
// is running without UCA, it could be a problem.
66static const int32_t maxRegularPrimary = 0x7A;
67static const int32_t minImplicitPrimary = 0xE0;
68static const int32_t maxImplicitPrimary = 0xE4;
69
70U_CDECL_BEGIN
71static UBool U_CALLCONV
72ucol_cleanup(void)
73{
74 g_nfcImpl = NULL;
75 return TRUE;
76}
77
/**
 * Folding-offset callback installed on the collator's trie
 * (see ucol_initCollator): the offset is the low 24 bits of the data word.
 * @param data trie data word
 * @return data with everything above bit 23 cleared
 */
static int32_t U_CALLCONV
_getFoldingOffset(uint32_t data) {
    const uint32_t low24Bits = data & 0xFFFFFF;
    return (int32_t)low24Bits;
}
82
83U_CDECL_END
84
85// init FCD data
86static inline
87UBool initializeFCD(UErrorCode *status) {
88 if (g_nfcImpl != NULL) {
89 return TRUE;
90 } else {
91 // The result is constant, until the library is reloaded.
92 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
93 // Note: Alternatively, we could also store this pointer in each collIterate struct,
94 // same as Normalizer2Factory::getImpl(collIterate->nfd).
95 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
96 return U_SUCCESS(*status);
97 }
98}
99
100static
101inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
102 int32_t sourceLen, collIterate *s,
103 UErrorCode *status)
104{
105 (s)->string = (s)->pos = sourceString;
106 (s)->origFlags = 0;
107 (s)->flags = 0;
108 if (sourceLen >= 0) {
109 s->flags |= UCOL_ITER_HASLEN;
110 (s)->endp = (UChar *)sourceString+sourceLen;
111 }
112 else {
113 /* change to enable easier checking for end of string for fcdpositon */
114 (s)->endp = NULL;
115 }
116 (s)->extendCEs = NULL;
117 (s)->extendCEsSize = 0;
118 (s)->CEpos = (s)->toReturn = (s)->CEs;
119 (s)->offsetBuffer = NULL;
120 (s)->offsetBufferSize = 0;
121 (s)->offsetReturn = (s)->offsetStore = NULL;
122 (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
123 (s)->coll = (collator);
124 (s)->nfd = Normalizer2Factory::getNFDInstance(*status);
125 (s)->fcdPosition = 0;
126 if(collator->normalizationMode == UCOL_ON) {
127 (s)->flags |= UCOL_ITER_NORM;
128 }
129 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
130 (s)->flags |= UCOL_HIRAGANA_Q;
131 }
132 (s)->iterator = NULL;
133 //(s)->iteratorIndex = 0;
134}
135
136U_CAPI void U_EXPORT2
137uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
138 int32_t sourceLen, collIterate *s,
139 UErrorCode *status) {
140 /* Out-of-line version for use from other files. */
141 IInit_collIterate(collator, sourceString, sourceLen, s, status);
142}
143
144U_CAPI collIterate * U_EXPORT2
145uprv_new_collIterate(UErrorCode *status) {
146 if(U_FAILURE(*status)) {
147 return NULL;
148 }
149 collIterate *s = new collIterate;
150 if(s == NULL) {
151 *status = U_MEMORY_ALLOCATION_ERROR;
152 return NULL;
153 }
154 return s;
155}
156
157U_CAPI void U_EXPORT2
158uprv_delete_collIterate(collIterate *s) {
159 delete s;
160}
161
162U_CAPI UBool U_EXPORT2
163uprv_collIterateAtEnd(collIterate *s) {
164 return s == NULL || s->pos == s->endp;
165}
166
167/**
168* Backup the state of the collIterate struct data
169* @param data collIterate to backup
170* @param backup storage
171*/
172static
173inline void backupState(const collIterate *data, collIterateState *backup)
174{
175 backup->fcdPosition = data->fcdPosition;
176 backup->flags = data->flags;
177 backup->origFlags = data->origFlags;
178 backup->pos = data->pos;
179 backup->bufferaddress = data->writableBuffer.getBuffer();
180 backup->buffersize = data->writableBuffer.length();
181 backup->iteratorMove = 0;
182 backup->iteratorIndex = 0;
183 if(data->iterator != NULL) {
184 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
185 backup->iteratorIndex = data->iterator->getState(data->iterator);
186 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
187 if(backup->iteratorIndex == UITER_NO_STATE) {
188 while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
189 backup->iteratorMove++;
190 data->iterator->move(data->iterator, -1, UITER_CURRENT);
191 }
192 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
193 }
194 }
195}
196
197/**
198* Loads the state into the collIterate struct data
199* @param data collIterate to backup
200* @param backup storage
201* @param forwards boolean to indicate if forwards iteration is used,
202* false indicates backwards iteration
203*/
204static
205inline void loadState(collIterate *data, const collIterateState *backup,
206 UBool forwards)
207{
208 UErrorCode status = U_ZERO_ERROR;
209 data->flags = backup->flags;
210 data->origFlags = backup->origFlags;
211 if(data->iterator != NULL) {
212 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
213 data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
214 if(backup->iteratorMove != 0) {
215 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
216 }
217 }
218 data->pos = backup->pos;
219
220 if ((data->flags & UCOL_ITER_INNORMBUF) &&
221 data->writableBuffer.getBuffer() != backup->bufferaddress) {
222 /*
223 this is when a new buffer has been reallocated and we'll have to
224 calculate the new position.
225 note the new buffer has to contain the contents of the old buffer.
226 */
227 if (forwards) {
228 data->pos = data->writableBuffer.getTerminatedBuffer() +
229 (data->pos - backup->bufferaddress);
230 }
231 else {
232 /* backwards direction */
233 int32_t temp = backup->buffersize -
234 (int32_t)(data->pos - backup->bufferaddress);
235 data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp);
236 }
237 }
238 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
239 /*
240 this is alittle tricky.
241 if we are initially not in the normalization buffer, even if we
242 normalize in the later stage, the data in the buffer will be
243 ignored, since we skip back up to the data string.
244 however if we are already in the normalization buffer, any
245 further normalization will pull data into the normalization
246 buffer and modify the fcdPosition.
247 since we are keeping the data in the buffer for use, the
248 fcdPosition can not be reverted back.
249 arrgghh....
250 */
251 data->fcdPosition = backup->fcdPosition;
252 }
253}
254
255static UBool
256reallocCEs(collIterate *data, int32_t newCapacity) {
257 uint32_t *oldCEs = data->extendCEs;
258 if(oldCEs == NULL) {
259 oldCEs = data->CEs;
260 }
261 int32_t length = data->CEpos - oldCEs;
262 uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4);
263 if(newCEs == NULL) {
264 return FALSE;
265 }
266 uprv_memcpy(newCEs, oldCEs, length * 4);
267 uprv_free(data->extendCEs);
268 data->extendCEs = newCEs;
269 data->extendCEsSize = newCapacity;
270 data->CEpos = newCEs + length;
271 return TRUE;
272}
273
274static UBool
275increaseCEsCapacity(collIterate *data) {
276 int32_t oldCapacity;
277 if(data->extendCEs != NULL) {
278 oldCapacity = data->extendCEsSize;
279 } else {
280 oldCapacity = LENGTHOF(data->CEs);
281 }
282 return reallocCEs(data, 2 * oldCapacity);
283}
284
285static UBool
286ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
287 int32_t oldCapacity;
288 if(data->extendCEs != NULL) {
289 oldCapacity = data->extendCEsSize;
290 } else {
291 oldCapacity = LENGTHOF(data->CEs);
292 }
293 if(minCapacity <= oldCapacity) {
294 return TRUE;
295 }
296 oldCapacity *= 2;
297 return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity);
298}
299
300void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) {
301 if(U_FAILURE(errorCode)) {
302 return;
303 }
304 int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuffer);
305 U_ASSERT(length >= offsetBufferSize || offsetStore != NULL);
306 if(length >= offsetBufferSize) {
307 int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE;
308 int32_t *newBuffer = reinterpret_cast<int32_t *>(uprv_malloc(newCapacity * 4));
309 if(newBuffer == NULL) {
310 errorCode = U_MEMORY_ALLOCATION_ERROR;
311 return;
312 }
313 if(length > 0) {
314 uprv_memcpy(newBuffer, offsetBuffer, length * 4);
315 }
316 uprv_free(offsetBuffer);
317 offsetBuffer = newBuffer;
318 offsetStore = offsetBuffer + length;
319 offsetBufferSize = newCapacity;
320 }
321 *offsetStore++ = offset;
322}
323
324/*
325* collIter_eos()
326* Checks for a collIterate being positioned at the end of
327* its source string.
328*
329*/
330static
331inline UBool collIter_eos(collIterate *s) {
332 if(s->flags & UCOL_USE_ITERATOR) {
333 return !(s->iterator->hasNext(s->iterator));
334 }
335 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
336 // Null terminated string, but not at null, so not at end.
337 // Whether in main or normalization buffer doesn't matter.
338 return FALSE;
339 }
340
341 // String with length. Can't be in normalization buffer, which is always
342 // null termintated.
343 if (s->flags & UCOL_ITER_HASLEN) {
344 return (s->pos == s->endp);
345 }
346
347 // We are at a null termination, could be either normalization buffer or main string.
348 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
349 // At null at end of main string.
350 return TRUE;
351 }
352
353 // At null at end of normalization buffer. Need to check whether there there are
354 // any characters left in the main buffer.
355 if(s->origFlags & UCOL_USE_ITERATOR) {
356 return !(s->iterator->hasNext(s->iterator));
357 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
358 // Null terminated main string. fcdPosition is the 'return' position into main buf.
359 return (*s->fcdPosition == 0);
360 }
361 else {
362 // Main string with an end pointer.
363 return s->fcdPosition == s->endp;
364 }
365}
366
367/*
368* collIter_bos()
369* Checks for a collIterate being positioned at the start of
370* its source string.
371*
372*/
373static
374inline UBool collIter_bos(collIterate *source) {
375 // if we're going backwards, we need to know whether there is more in the
376 // iterator, even if we are in the side buffer
377 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
378 return !source->iterator->hasPrevious(source->iterator);
379 }
380 if (source->pos <= source->string ||
381 ((source->flags & UCOL_ITER_INNORMBUF) &&
382 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
383 return TRUE;
384 }
385 return FALSE;
386}
387
388/*static
389inline UBool collIter_SimpleBos(collIterate *source) {
390 // if we're going backwards, we need to know whether there is more in the
391 // iterator, even if we are in the side buffer
392 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
393 return !source->iterator->hasPrevious(source->iterator);
394 }
395 if (source->pos == source->string) {
396 return TRUE;
397 }
398 return FALSE;
399}*/
400 //return (data->pos == data->string) ||
401
402
403/****************************************************************************/
404/* Following are the open/close functions */
405/* */
406/****************************************************************************/
407
408static UCollator*
409ucol_initFromBinary(const uint8_t *bin, int32_t length,
410 const UCollator *base,
411 UCollator *fillIn,
412 UErrorCode *status)
413{
414 UCollator *result = fillIn;
415 if(U_FAILURE(*status)) {
416 return NULL;
417 }
418 /*
419 if(base == NULL) {
420 // we don't support null base yet
421 *status = U_ILLEGAL_ARGUMENT_ERROR;
422 return NULL;
423 }
424 */
425 // We need these and we could be running without UCA
426 uprv_uca_initImplicitConstants(status);
427 UCATableHeader *colData = (UCATableHeader *)bin;
428 // do we want version check here? We're trying to figure out whether collators are compatible
429 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
430 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
431 colData->version[0] != UCOL_BUILDER_VERSION)
432 {
433 *status = U_COLLATOR_VERSION_MISMATCH;
434 return NULL;
435 }
436 else {
437 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
438 result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
439 if(U_FAILURE(*status)){
440 return NULL;
441 }
442 result->hasRealData = TRUE;
443 }
444 else {
445 if(base) {
446 result = ucol_initCollator(base->image, result, base, status);
447 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
448 if(U_FAILURE(*status)){
449 return NULL;
450 }
451 result->hasRealData = FALSE;
452 }
453 else {
454 *status = U_USELESS_COLLATOR_ERROR;
455 return NULL;
456 }
457 }
458 result->freeImageOnClose = FALSE;
459 }
460 result->actualLocale = NULL;
461 result->validLocale = NULL;
462 result->requestedLocale = NULL;
463 result->rules = NULL;
464 result->rulesLength = 0;
465 result->freeRulesOnClose = FALSE;
466 result->ucaRules = NULL;
467 return result;
468}
469
470U_CAPI UCollator* U_EXPORT2
471ucol_openBinary(const uint8_t *bin, int32_t length,
472 const UCollator *base,
473 UErrorCode *status)
474{
475 return ucol_initFromBinary(bin, length, base, NULL, status);
476}
477
478U_CAPI int32_t U_EXPORT2
479ucol_cloneBinary(const UCollator *coll,
480 uint8_t *buffer, int32_t capacity,
481 UErrorCode *status)
482{
483 int32_t length = 0;
484 if(U_FAILURE(*status)) {
485 return length;
486 }
487 if(capacity < 0) {
488 *status = U_ILLEGAL_ARGUMENT_ERROR;
489 return length;
490 }
491 if(coll->hasRealData == TRUE) {
492 length = coll->image->size;
493 if(length <= capacity) {
494 uprv_memcpy(buffer, coll->image, length);
495 } else {
496 *status = U_BUFFER_OVERFLOW_ERROR;
497 }
498 } else {
499 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
500 if(length <= capacity) {
501 /* build the UCATableHeader with minimal entries */
502 /* do not copy the header from the UCA file because its values are wrong! */
503 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
504
505 /* reset everything */
506 uprv_memset(buffer, 0, length);
507
508 /* set the tailoring-specific values */
509 UCATableHeader *myData = (UCATableHeader *)buffer;
510 myData->size = length;
511
512 /* offset for the options, the only part of the data that is present after the header */
513 myData->options = sizeof(UCATableHeader);
514
515 /* need to always set the expansion value for an upper bound of the options */
516 myData->expansion = myData->options + sizeof(UColOptionSet);
517
518 myData->magic = UCOL_HEADER_MAGIC;
519 myData->isBigEndian = U_IS_BIG_ENDIAN;
520 myData->charSetFamily = U_CHARSET_FAMILY;
521
522 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
523 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
524
525 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
526 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
527 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
528 myData->jamoSpecial = coll->image->jamoSpecial;
529
530 /* copy the collator options */
531 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
532 } else {
533 *status = U_BUFFER_OVERFLOW_ERROR;
534 }
535 }
536 return length;
537}
538
539U_CAPI UCollator* U_EXPORT2
540ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
541{
542 UCollator * localCollator;
543 int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
544 char *stackBufferChars = (char *)stackBuffer;
545 int32_t imageSize = 0;
546 int32_t rulesSize = 0;
547 int32_t rulesPadding = 0;
548 int32_t defaultReorderCodesSize = 0;
549 int32_t reorderCodesSize = 0;
550 uint8_t *image;
551 UChar *rules;
552 int32_t* defaultReorderCodes;
553 int32_t* reorderCodes;
554 uint8_t* leadBytePermutationTable;
555 UBool colAllocated = FALSE;
556 UBool imageAllocated = FALSE;
557
558 if (status == NULL || U_FAILURE(*status)){
559 return 0;
560 }
561 if ((stackBuffer && !pBufferSize) || !coll){
562 *status = U_ILLEGAL_ARGUMENT_ERROR;
563 return 0;
564 }
565
566 if (coll->rules && coll->freeRulesOnClose) {
567 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
568 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
569 bufferSizeNeeded += rulesSize + rulesPadding;
570 }
571 // no padding for alignment needed from here since the next two are 4 byte quantities
572 if (coll->defaultReorderCodes) {
573 defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32_t);
574 bufferSizeNeeded += defaultReorderCodesSize;
575 }
576 if (coll->reorderCodes) {
577 reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t);
578 bufferSizeNeeded += reorderCodesSize;
579 }
580 if (coll->leadBytePermutationTable) {
581 bufferSizeNeeded += 256 * sizeof(uint8_t);
582 }
583
584 if (stackBuffer && *pBufferSize <= 0) { /* 'preflighting' request - set needed size into *pBufferSize */
585 *pBufferSize = bufferSizeNeeded;
586 return 0;
587 }
588
589 /* Pointers on 64-bit platforms need to be aligned
590 * on a 64-bit boundry in memory.
591 */
592 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
593 int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
594 if (*pBufferSize > offsetUp) {
595 *pBufferSize -= offsetUp;
596 stackBufferChars += offsetUp;
597 }
598 else {
599 /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
600 *pBufferSize = 1;
601 }
602 }
603 stackBuffer = (void *)stackBufferChars;
604
605 if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
606 /* allocate one here...*/
607 stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
608 // Null pointer check.
609 if (stackBufferChars == NULL) {
610 *status = U_MEMORY_ALLOCATION_ERROR;
611 return NULL;
612 }
613 colAllocated = TRUE;
614 if (U_SUCCESS(*status)) {
615 *status = U_SAFECLONE_ALLOCATED_WARNING;
616 }
617 }
618 localCollator = (UCollator *)stackBufferChars;
619 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
620 defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize);
621 reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCodesSize);
622 leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize;
623
624 {
625 UErrorCode tempStatus = U_ZERO_ERROR;
626 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
627 }
628 if (coll->freeImageOnClose) {
629 image = (uint8_t *)uprv_malloc(imageSize);
630 // Null pointer check
631 if (image == NULL) {
632 *status = U_MEMORY_ALLOCATION_ERROR;
633 return NULL;
634 }
635 ucol_cloneBinary(coll, image, imageSize, status);
636 imageAllocated = TRUE;
637 }
638 else {
639 image = (uint8_t *)coll->image;
640 }
641 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
642 if (U_FAILURE(*status)) {
643 return NULL;
644 }
645
646 if (coll->rules) {
647 if (coll->freeRulesOnClose) {
648 localCollator->rules = u_strcpy(rules, coll->rules);
649 //bufferEnd += rulesSize;
650 }
651 else {
652 localCollator->rules = coll->rules;
653 }
654 localCollator->freeRulesOnClose = FALSE;
655 localCollator->rulesLength = coll->rulesLength;
656 }
657
658 // collator reordering
659 if (coll->defaultReorderCodes) {
660 localCollator->defaultReorderCodes =
661 (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCodes, coll->defaultReorderCodesLength * sizeof(int32_t));
662 localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLength;
663 localCollator->freeDefaultReorderCodesOnClose = FALSE;
664 }
665 if (coll->reorderCodes) {
666 localCollator->reorderCodes =
667 (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorderCodesLength * sizeof(int32_t));
668 localCollator->reorderCodesLength = coll->reorderCodesLength;
669 localCollator->freeReorderCodesOnClose = FALSE;
670 }
671 if (coll->leadBytePermutationTable) {
672 localCollator->leadBytePermutationTable =
673 (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermutationTable, 256);
674 localCollator->freeLeadBytePermutationTableOnClose = FALSE;
675 }
676
677 int32_t i;
678 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
679 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
680 }
681 // zero copies of pointers
682 localCollator->actualLocale = NULL;
683 localCollator->validLocale = NULL;
684 localCollator->requestedLocale = NULL;
685 localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
686 localCollator->freeOnClose = colAllocated;
687 localCollator->freeImageOnClose = imageAllocated;
688 return localCollator;
689}
690
691U_CAPI void U_EXPORT2
692ucol_close(UCollator *coll)
693{
694 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
695 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
696 if(coll != NULL) {
697 // these are always owned by each UCollator struct,
698 // so we always free them
699 if(coll->validLocale != NULL) {
700 uprv_free(coll->validLocale);
701 }
702 if(coll->actualLocale != NULL) {
703 uprv_free(coll->actualLocale);
704 }
705 if(coll->requestedLocale != NULL) {
706 uprv_free(coll->requestedLocale);
707 }
708 if(coll->latinOneCEs != NULL) {
709 uprv_free(coll->latinOneCEs);
710 }
711 if(coll->options != NULL && coll->freeOptionsOnClose) {
712 uprv_free(coll->options);
713 }
714 if(coll->rules != NULL && coll->freeRulesOnClose) {
715 uprv_free((UChar *)coll->rules);
716 }
717 if(coll->image != NULL && coll->freeImageOnClose) {
718 uprv_free((UCATableHeader *)coll->image);
719 }
720
721 if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
722 uprv_free(coll->leadBytePermutationTable);
723 }
724 if(coll->defaultReorderCodes != NULL && coll->freeDefaultReorderCodesOnClose == TRUE) {
725 uprv_free(coll->defaultReorderCodes);
726 }
727 if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
728 uprv_free(coll->reorderCodes);
729 }
730
731 if(coll->delegate != NULL) {
732 delete (Collator*)coll->delegate;
733 }
734
735 /* Here, it would be advisable to close: */
736 /* - UData for UCA (unless we stuff it in the root resb */
737 /* Again, do we need additional housekeeping... HMMM! */
738 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
739 if(coll->freeOnClose){
740 /* for safeClone, if freeOnClose is FALSE,
741 don't free the other instance data */
742 uprv_free(coll);
743 }
744 }
745 UTRACE_EXIT();
746}
747
748/* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
749/* you should be able to get the binary chunk to write out... Doesn't look very full now */
750U_CFUNC uint8_t* U_EXPORT2
751ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
752{
753 uint8_t *result = NULL;
754 if(U_FAILURE(*status)) {
755 return NULL;
756 }
757 if(coll->hasRealData == TRUE) {
758 *length = coll->image->size;
759 result = (uint8_t *)uprv_malloc(*length);
760 /* test for NULL */
761 if (result == NULL) {
762 *status = U_MEMORY_ALLOCATION_ERROR;
763 return NULL;
764 }
765 uprv_memcpy(result, coll->image, *length);
766 } else {
767 *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
768 result = (uint8_t *)uprv_malloc(*length);
769 /* test for NULL */
770 if (result == NULL) {
771 *status = U_MEMORY_ALLOCATION_ERROR;
772 return NULL;
773 }
774
775 /* build the UCATableHeader with minimal entries */
776 /* do not copy the header from the UCA file because its values are wrong! */
777 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
778
779 /* reset everything */
780 uprv_memset(result, 0, *length);
781
782 /* set the tailoring-specific values */
783 UCATableHeader *myData = (UCATableHeader *)result;
784 myData->size = *length;
785
786 /* offset for the options, the only part of the data that is present after the header */
787 myData->options = sizeof(UCATableHeader);
788
789 /* need to always set the expansion value for an upper bound of the options */
790 myData->expansion = myData->options + sizeof(UColOptionSet);
791
792 myData->magic = UCOL_HEADER_MAGIC;
793 myData->isBigEndian = U_IS_BIG_ENDIAN;
794 myData->charSetFamily = U_CHARSET_FAMILY;
795
796 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
797 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
798
799 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
800 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
801 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
802 myData->jamoSpecial = coll->image->jamoSpecial;
803
804 /* copy the collator options */
805 uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
806 }
807 return result;
808}
809
810void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
811 if(U_FAILURE(*status)) {
812 return;
813 }
814 result->caseFirst = (UColAttributeValue)opts->caseFirst;
815 result->caseLevel = (UColAttributeValue)opts->caseLevel;
816 result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
817 result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
818 if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) {
819 return;
820 }
821 result->strength = (UColAttributeValue)opts->strength;
822 result->variableTopValue = opts->variableTopValue;
823 result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
824 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
825 result->numericCollation = (UColAttributeValue)opts->numericCollation;
826 result->caseFirstisDefault = TRUE;
827 result->caseLevelisDefault = TRUE;
828 result->frenchCollationisDefault = TRUE;
829 result->normalizationModeisDefault = TRUE;
830 result->strengthisDefault = TRUE;
831 result->variableTopValueisDefault = TRUE;
832 result->alternateHandlingisDefault = TRUE;
833 result->hiraganaQisDefault = TRUE;
834 result->numericCollationisDefault = TRUE;
835
836 ucol_updateInternalState(result, status);
837
838 result->options = opts;
839}
840
841
842/**
843* Approximate determination if a character is at a contraction end.
844* Guaranteed to be TRUE if a character is at the end of a contraction,
845* otherwise it is not deterministic.
846* @param c character to be determined
847* @param coll collator
848*/
849static
850inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
851 if (c < coll->minContrEndCP) {
852 return FALSE;
853 }
854
855 int32_t hash = c;
856 uint8_t htbyte;
857 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
858 if (U16_IS_TRAIL(c)) {
859 return TRUE;
860 }
861 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
862 }
863 htbyte = coll->contrEndCP[hash>>3];
864 return (((htbyte >> (hash & 7)) & 1) == 1);
865}
866
867
868
869/*
870* i_getCombiningClass()
871* A fast, at least partly inline version of u_getCombiningClass()
872* This is a candidate for further optimization. Used heavily
873* in contraction processing.
874*/
875static
876inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
877 uint8_t sCC = 0;
878 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
879 sCC = u_getCombiningClass(c);
880 }
881 return sCC;
882}
883
884UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
885 UChar c;
886 UCollator *result = fillIn;
887 if(U_FAILURE(*status) || image == NULL) {
888 return NULL;
889 }
890
891 if(result == NULL) {
892 result = (UCollator *)uprv_malloc(sizeof(UCollator));
893 if(result == NULL) {
894 *status = U_MEMORY_ALLOCATION_ERROR;
895 return result;
896 }
897 result->freeOnClose = TRUE;
898 } else {
899 result->freeOnClose = FALSE;
900 }
901
902 result->delegate = NULL;
903
904 result->image = image;
905 result->mapping.getFoldingOffset = _getFoldingOffset;
906 const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
907 utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
908 if(U_FAILURE(*status)) {
909 if(result->freeOnClose == TRUE) {
910 uprv_free(result);
911 result = NULL;
912 }
913 return result;
914 }
915
916 result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
917 result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
918 result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
919 result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
920 result->rules = NULL;
921 result->rulesLength = 0;
922 result->freeRulesOnClose = FALSE;
923 result->defaultReorderCodes = NULL;
924 result->defaultReorderCodesLength = 0;
925 result->freeDefaultReorderCodesOnClose = FALSE;
926 result->reorderCodes = NULL;
927 result->reorderCodesLength = 0;
928 result->freeReorderCodesOnClose = FALSE;
929 result->leadBytePermutationTable = NULL;
930 result->freeLeadBytePermutationTableOnClose = FALSE;
931
932 /* get the version info from UCATableHeader and populate the Collator struct*/
933 result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
934 result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
935 result->dataVersion[2] = 0;
936 result->dataVersion[3] = 0;
937
938 result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
939 result->minUnsafeCP = 0;
940 for (c=0; c<0x300; c++) { // Find the smallest unsafe char.
941 if (ucol_unsafeCP(c, result)) break;
942 }
943 result->minUnsafeCP = c;
944
945 result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
946 result->minContrEndCP = 0;
947 for (c=0; c<0x300; c++) { // Find the Contraction-ending char.
948 if (ucol_contractionEndCP(c, result)) break;
949 }
950 result->minContrEndCP = c;
951
952 /* max expansion tables */
953 result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
954 result->image->endExpansionCE);
955 result->lastEndExpansionCE = result->endExpansionCE +
956 result->image->endExpansionCECount - 1;
957 result->expansionCESize = (uint8_t*)result->image +
958 result->image->expansionCESize;
959
960
961 //result->errorCode = *status;
962
963 result->latinOneCEs = NULL;
964
965 result->latinOneRegenTable = FALSE;
966 result->latinOneFailed = FALSE;
967 result->UCA = UCA;
968
969 /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
970 result->ucaRules = NULL;
971 result->actualLocale = NULL;
972 result->validLocale = NULL;
973 result->requestedLocale = NULL;
974 result->hasRealData = FALSE; // real data lives in .dat file...
975 result->freeImageOnClose = FALSE;
976
977 /* set attributes */
978 ucol_setOptionsFromHeader(
979 result,
980 (UColOptionSet*)((uint8_t*)result->image+result->image->options),
981 status);
982 result->freeOptionsOnClose = FALSE;
983
984 return result;
985}
986
987/* new Mark's code */
988
989/**
990 * For generation of Implicit CEs
991 * @author Davis
992 *
993 * Cleaned up so that changes can be made more easily.
994 * Old values:
995# First Implicit: E26A792D
996# Last Implicit: E3DC70C0
997# First CJK: E0030300
998# Last CJK: E0A9DD00
999# First CJK_A: E0A9DF00
1000# Last CJK_A: E0DE3100
1001 */
1002/* Following is a port of Mark's code for new treatment of implicits.
1003 * It is positioned here, since ucol_initUCA need to initialize the
1004 * variables below according to the data in the fractional UCA.
1005 */
1006
1007/**
1008 * Function used to:
1009 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
1010 * b) bump any non-CJK characters by 10FFFF.
1011 * The relevant blocks are:
1012 * A: 4E00..9FFF; CJK Unified Ideographs
1013 * F900..FAFF; CJK Compatibility Ideographs
1014 * B: 3400..4DBF; CJK Unified Ideographs Extension A
1015 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
1016 * As long as
1017 * no new B characters are allocated between 4E00 and FAFF, and
1018 * no new A characters are outside of this range,
1019 * (very high probability) this simple code will work.
1020 * The reordered blocks are:
1021 * Block1 is CJK
1022 * Block2 is CJK_COMPAT_USED
1023 * Block3 is CJK_A
1024 * (all contiguous)
1025 * Any other CJK gets its normal code point
1026 * Any non-CJK gets +10FFFF
1027 * When we reorder Block1, we make sure that it is at the very start,
1028 * so that it will use a 3-byte form.
 * Warning: we only pick up the compatibility characters that are
 * NOT decomposed, so that block is smaller!
1031 */
1032
1033// CONSTANTS
1034static const UChar32
1035 NON_CJK_OFFSET = 0x110000,
1036 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
1037
/**
 * Layout parameters for implicit primary weights.
 * All of them start out as 0 and are computed by initImplicitConstants().
 */
// Step between consecutive final bytes in the 3-byte form (gap3 + 1).
static int32_t final3Multiplier = 0;
// Step between consecutive final bytes in the 4-byte form (gap4 + 1).
static int32_t final4Multiplier = 0;
// Number of distinct final-byte values used by the 3-byte form.
static int32_t final3Count = 0;
// Number of distinct final-byte values used by the 4-byte form.
static int32_t final4Count = 0;
// Number of values a middle byte can take (maxTrail - minTrail + 1).
static int32_t medialCount = 0;
// First lead byte used for 3-byte implicit primaries.
static int32_t min3Primary = 0;
// First lead byte used for 4-byte implicit primaries.
static int32_t min4Primary = 0;
// Last lead byte used for implicit primaries.
static int32_t max4Primary = 0;
// Smallest and largest value allowed for a non-lead byte.
static int32_t minTrail = 0;
static int32_t maxTrail = 0;
// Largest final byte actually produced by the 3-byte and the 4-byte form.
static int32_t max3Trail = 0;
static int32_t max4Trail = 0;
// Raw values below this boundary get the 3-byte form, all others the 4-byte form.
static int32_t min4Boundary = 0;
1055
1056static const UChar32
1057 // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
1058 // 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; (Unicode 6.1)
1059 CJK_BASE = 0x4E00,
1060 CJK_LIMIT = 0x9FCC+1,
1061 // Unified CJK ideographs in the compatibility ideographs block.
1062 CJK_COMPAT_USED_BASE = 0xFA0E,
1063 CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
1064 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
1065 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
1066 CJK_A_BASE = 0x3400,
1067 CJK_A_LIMIT = 0x4DB5+1,
1068 // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
1069 // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
1070 CJK_B_BASE = 0x20000,
1071 CJK_B_LIMIT = 0x2A6D6+1,
1072 // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
1073 // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
1074 CJK_C_BASE = 0x2A700,
1075 CJK_C_LIMIT = 0x2B734+1,
1076 // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
1077 // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
1078 CJK_D_BASE = 0x2B740,
1079 CJK_D_LIMIT = 0x2B81D+1;
1080 // when adding to this list, look for all occurrences (in project)
1081 // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!
1082
1083static UChar32 swapCJK(UChar32 i) {
1084 if (i < CJK_A_BASE) {
1085 // non-CJK
1086 } else if (i < CJK_A_LIMIT) {
1087 // Extension A has lower code points than the original Unihan+compat
1088 // but sorts higher.
1089 return i - CJK_A_BASE
1090 + (CJK_LIMIT - CJK_BASE)
1091 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1092 } else if (i < CJK_BASE) {
1093 // non-CJK
1094 } else if (i < CJK_LIMIT) {
1095 return i - CJK_BASE;
1096 } else if (i < CJK_COMPAT_USED_BASE) {
1097 // non-CJK
1098 } else if (i < CJK_COMPAT_USED_LIMIT) {
1099 return i - CJK_COMPAT_USED_BASE
1100 + (CJK_LIMIT - CJK_BASE);
1101 } else if (i < CJK_B_BASE) {
1102 // non-CJK
1103 } else if (i < CJK_B_LIMIT) {
1104 return i; // non-BMP-CJK
1105 } else if (i < CJK_C_BASE) {
1106 // non-CJK
1107 } else if (i < CJK_C_LIMIT) {
1108 return i; // non-BMP-CJK
1109 } else if (i < CJK_D_BASE) {
1110 // non-CJK
1111 } else if (i < CJK_D_LIMIT) {
1112 return i; // non-BMP-CJK
1113 }
1114 return i + NON_CJK_OFFSET; // non-CJK
1115}
1116
1117U_CAPI UChar32 U_EXPORT2
1118uprv_uca_getRawFromCodePoint(UChar32 i) {
1119 return swapCJK(i)+1;
1120}
1121
1122U_CAPI UChar32 U_EXPORT2
1123uprv_uca_getCodePointFromRaw(UChar32 i) {
1124 i--;
1125 UChar32 result = 0;
1126 if(i >= NON_CJK_OFFSET) {
1127 result = i - NON_CJK_OFFSET;
1128 } else if(i >= CJK_B_BASE) {
1129 result = i;
1130 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
1131 if(i < CJK_LIMIT - CJK_BASE) {
1132 result = i + CJK_BASE;
1133 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
1134 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
1135 } else {
1136 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1137 }
1138 } else {
1139 result = -1;
1140 }
1141 return result;
1142}
1143
1144// GET IMPLICIT PRIMARY WEIGHTS
1145// Return value is left justified primary key
1146U_CAPI uint32_t U_EXPORT2
1147uprv_uca_getImplicitFromRaw(UChar32 cp) {
1148 /*
1149 if (cp < 0 || cp > UCOL_MAX_INPUT) {
1150 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
1151 }
1152 */
1153 int32_t last0 = cp - min4Boundary;
1154 if (last0 < 0) {
1155 int32_t last1 = cp / final3Count;
1156 last0 = cp % final3Count;
1157
1158 int32_t last2 = last1 / medialCount;
1159 last1 %= medialCount;
1160
1161 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
1162 last1 = minTrail + last1; // offset
1163 last2 = min3Primary + last2; // offset
1164 /*
1165 if (last2 >= min4Primary) {
1166 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
1167 }
1168 */
1169 return (last2 << 24) + (last1 << 16) + (last0 << 8);
1170 } else {
1171 int32_t last1 = last0 / final4Count;
1172 last0 %= final4Count;
1173
1174 int32_t last2 = last1 / medialCount;
1175 last1 %= medialCount;
1176
1177 int32_t last3 = last2 / medialCount;
1178 last2 %= medialCount;
1179
1180 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
1181 last1 = minTrail + last1; // offset
1182 last2 = minTrail + last2; // offset
1183 last3 = min4Primary + last3; // offset
1184 /*
1185 if (last3 > max4Primary) {
1186 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
1187 }
1188 */
1189 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
1190 }
1191}
1192
1193static uint32_t U_EXPORT2
1194uprv_uca_getImplicitPrimary(UChar32 cp) {
1195 //fprintf(stdout, "Incoming: %04x\n", cp);
1196 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1197
1198 cp = swapCJK(cp);
1199 cp++;
1200 // we now have a range of numbers from 0 to 21FFFF.
1201
1202 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1203 //fprintf(stdout, "CJK swapped: %04x\n", cp);
1204
1205 return uprv_uca_getImplicitFromRaw(cp);
1206}
1207
1208/**
1209 * Converts implicit CE into raw integer ("code point")
1210 * @param implicit
1211 * @return -1 if illegal format
1212 */
1213U_CAPI UChar32 U_EXPORT2
1214uprv_uca_getRawFromImplicit(uint32_t implicit) {
1215 UChar32 result;
1216 UChar32 b3 = implicit & 0xFF;
1217 UChar32 b2 = (implicit >> 8) & 0xFF;
1218 UChar32 b1 = (implicit >> 16) & 0xFF;
1219 UChar32 b0 = (implicit >> 24) & 0xFF;
1220
1221 // simple parameter checks
1222 if (b0 < min3Primary || b0 > max4Primary
1223 || b1 < minTrail || b1 > maxTrail)
1224 return -1;
1225 // normal offsets
1226 b1 -= minTrail;
1227
1228 // take care of the final values, and compose
1229 if (b0 < min4Primary) {
1230 if (b2 < minTrail || b2 > max3Trail || b3 != 0)
1231 return -1;
1232 b2 -= minTrail;
1233 UChar32 remainder = b2 % final3Multiplier;
1234 if (remainder != 0)
1235 return -1;
1236 b0 -= min3Primary;
1237 b2 /= final3Multiplier;
1238 result = ((b0 * medialCount) + b1) * final3Count + b2;
1239 } else {
1240 if (b2 < minTrail || b2 > maxTrail
1241 || b3 < minTrail || b3 > max4Trail)
1242 return -1;
1243 b2 -= minTrail;
1244 b3 -= minTrail;
1245 UChar32 remainder = b3 % final4Multiplier;
1246 if (remainder != 0)
1247 return -1;
1248 b3 /= final4Multiplier;
1249 b0 -= min4Primary;
1250 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1251 }
1252 // final check
1253 if (result < 0 || result > UCOL_MAX_INPUT)
1254 return -1;
1255 return result;
1256}
1257
1258
/**
 * Returns a / b rounded up, computed as 1 + (a-1)/b.
 * The result is only a true ceiling for a >= 1 and b >= 1.
 */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int wholeBelow = (a - 1) / b;
    return wholeBelow + 1;
}
1262
1263/* this function is either called from initUCA or from genUCA before
1264 * doing canonical closure for the UCA.
1265 */
1266
1267/**
1268 * Set up to generate implicits.
1269 * Maintenance Note: this function may end up being called more than once, due
1270 * to threading races during initialization. Make sure that
1271 * none of the Constants is ever transiently assigned an
1272 * incorrect value.
1273 * @param minPrimary
1274 * @param maxPrimary
1275 * @param minTrail final byte
1276 * @param maxTrail final byte
1277 * @param gap3 the gap we leave for tailoring for 3-byte forms
1278 * @param gap4 the gap we leave for tailoring for 4-byte forms
1279 */
1280static void initImplicitConstants(int minPrimary, int maxPrimary,
1281 int minTrailIn, int maxTrailIn,
1282 int gap3, int primaries3count,
1283 UErrorCode *status) {
1284 // some simple parameter checks
1285 if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
1286 || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
1287 || (primaries3count < 1))
1288 {
1289 *status = U_ILLEGAL_ARGUMENT_ERROR;
1290 return;
1291 };
1292
1293 minTrail = minTrailIn;
1294 maxTrail = maxTrailIn;
1295
1296 min3Primary = minPrimary;
1297 max4Primary = maxPrimary;
1298 // compute constants for use later.
1299 // number of values we can use in trailing bytes
1300 // leave room for empty values between AND above, e.g. if gap = 2
1301 // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1302 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1303 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1304 final3Multiplier = gap3 + 1;
1305 final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1306 max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1307
1308 // medials can use full range
1309 medialCount = (maxTrail - minTrail + 1);
1310 // find out how many values fit in each form
1311 int32_t threeByteCount = medialCount * final3Count;
1312 // now determine where the 3/4 boundary is.
1313 // we use 3 bytes below the boundary, and 4 above
1314 int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1315 int32_t primaries4count = primariesAvailable - primaries3count;
1316
1317
1318 int32_t min3ByteCoverage = primaries3count * threeByteCount;
1319 min4Primary = minPrimary + primaries3count;
1320 min4Boundary = min3ByteCoverage;
1321 // Now expand out the multiplier for the 4 bytes, and redo.
1322
1323 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1324 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
1325 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
1326 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1327 if (gap4 < 1) {
1328 *status = U_ILLEGAL_ARGUMENT_ERROR;
1329 return;
1330 }
1331 final4Multiplier = gap4 + 1;
1332 final4Count = neededPerFinalByte;
1333 max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1334}
1335
1336 /**
1337 * Supply parameters for generating implicit CEs
1338 */
1339U_CAPI void U_EXPORT2
1340uprv_uca_initImplicitConstants(UErrorCode *status) {
1341 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1342 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1343 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
1344}
1345
1346
1347/* collIterNormalize Incremental Normalization happens here. */
1348/* pick up the range of chars identifed by FCD, */
1349/* normalize it into the collIterate's writable buffer, */
1350/* switch the collIterate's state to use the writable buffer. */
1351/* */
1352static
1353void collIterNormalize(collIterate *collationSource)
1354{
1355 UErrorCode status = U_ZERO_ERROR;
1356 const UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */
1357 const UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */
1358
1359 collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)),
1360 collationSource->writableBuffer,
1361 status);
1362 if (U_FAILURE(status)) {
1363#ifdef UCOL_DEBUG
1364 fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status));
1365#endif
1366 return;
1367 }
1368
1369 collationSource->pos = collationSource->writableBuffer.getTerminatedBuffer();
1370 collationSource->origFlags = collationSource->flags;
1371 collationSource->flags |= UCOL_ITER_INNORMBUF;
1372 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1373}
1374
1375
1376// This function takes the iterator and extracts normalized stuff up to the next boundary
1377// It is similar in the end results to the collIterNormalize, but for the cases when we
1378// use an iterator
1379/*static
1380inline void normalizeIterator(collIterate *collationSource) {
1381 UErrorCode status = U_ZERO_ERROR;
1382 UBool wasNormalized = FALSE;
1383 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1384 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1385 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1386 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1387 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1388 // reallocate and terminate
1389 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1390 &collationSource->writableBuffer,
1391 (int32_t *)&collationSource->writableBufSize, normLen + 1,
1392 0)
1393 ) {
1394 #ifdef UCOL_DEBUG
1395 fprintf(stderr, "normalizeIterator(), out of memory\n");
1396 #endif
1397 return;
1398 }
1399 status = U_ZERO_ERROR;
1400 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1401 collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1402 normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1403 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1404 }
1405 // Terminate the buffer - we already checked that it is big enough
1406 collationSource->writableBuffer[normLen] = 0;
1407 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1408 collationSource->flags |= UCOL_ITER_ALLOCATED;
1409 }
1410 collationSource->pos = collationSource->writableBuffer;
1411 collationSource->origFlags = collationSource->flags;
1412 collationSource->flags |= UCOL_ITER_INNORMBUF;
1413 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1414}*/
1415
1416
1417/* Incremental FCD check and normalize */
1418/* Called from getNextCE when normalization state is suspect. */
1419/* When entering, the state is known to be this: */
1420/* o We are working in the main buffer of the collIterate, not the side */
1421/* writable buffer. When in the side buffer, normalization mode is always off, */
1422/* so we won't get here. */
1423/* o The leading combining class from the current character is 0 or */
1424/* the trailing combining class of the previous char was zero. */
1425/* True because the previous call to this function will have always exited */
1426/* that way, and we get called for every char where cc might be non-zero. */
1427static
1428inline UBool collIterFCD(collIterate *collationSource) {
1429 const UChar *srcP, *endP;
1430 uint8_t leadingCC;
1431 uint8_t prevTrailingCC = 0;
1432 uint16_t fcd;
1433 UBool needNormalize = FALSE;
1434
1435 srcP = collationSource->pos-1;
1436
1437 if (collationSource->flags & UCOL_ITER_HASLEN) {
1438 endP = collationSource->endp;
1439 } else {
1440 endP = NULL;
1441 }
1442
1443 // Get the trailing combining class of the current character. If it's zero, we are OK.
1444 fcd = g_nfcImpl->nextFCD16(srcP, endP);
1445 if (fcd != 0) {
1446 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1447
1448 if (prevTrailingCC != 0) {
1449 // The current char has a non-zero trailing CC. Scan forward until we find
1450 // a char with a leading cc of zero.
1451 while (endP == NULL || srcP != endP)
1452 {
1453 const UChar *savedSrcP = srcP;
1454
1455 fcd = g_nfcImpl->nextFCD16(srcP, endP);
1456 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1457 if (leadingCC == 0) {
1458 srcP = savedSrcP; // Hit char that is not part of combining sequence.
1459 // back up over it. (Could be surrogate pair!)
1460 break;
1461 }
1462
1463 if (leadingCC < prevTrailingCC) {
1464 needNormalize = TRUE;
1465 }
1466
1467 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1468 }
1469 }
1470 }
1471
1472 collationSource->fcdPosition = (UChar *)srcP;
1473
1474 return needNormalize;
1475}
1476
1477/****************************************************************************/
1478/* Following are the CE retrieval functions */
1479/* */
1480/****************************************************************************/
1481
1482static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
1483static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
1484
1485/* there should be a macro version of this function in the header file */
1486/* This is the first function that tries to fetch a collation element */
1487/* If it's not succesfull or it encounters a more difficult situation */
1488/* some more sofisticated and slower functions are invoked */
1489static
1490inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1491 uint32_t order = 0;
1492 if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */
1493 order = *(collationSource->toReturn++); /* if so, return them */
1494 if(collationSource->CEpos == collationSource->toReturn) {
1495 collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
1496 }
1497 return order;
1498 }
1499
1500 UChar ch = 0;
1501 collationSource->offsetReturn = NULL;
1502
1503 do {
1504 for (;;) /* Loop handles case when incremental normalize switches */
1505 { /* to or from the side buffer / original string, and we */
1506 /* need to start again to get the next character. */
1507
1508 if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
1509 {
1510 // The source string is null terminated and we're not working from the side buffer,
1511 // and we're not normalizing. This is the fast path.
1512 // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1513 ch = *collationSource->pos++;
1514 if (ch != 0) {
1515 break;
1516 }
1517 else {
1518 return UCOL_NO_MORE_CES;
1519 }
1520 }
1521
1522 if (collationSource->flags & UCOL_ITER_HASLEN) {
1523 // Normal path for strings when length is specified.
1524 // (We can't be in side buffer because it is always null terminated.)
1525 if (collationSource->pos >= collationSource->endp) {
1526 // Ran off of the end of the main source string. We're done.
1527 return UCOL_NO_MORE_CES;
1528 }
1529 ch = *collationSource->pos++;
1530 }
1531 else if(collationSource->flags & UCOL_USE_ITERATOR) {
1532 UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
1533 if(iterCh == U_SENTINEL) {
1534 return UCOL_NO_MORE_CES;
1535 }
1536 ch = (UChar)iterCh;
1537 }
1538 else
1539 {
1540 // Null terminated string.
1541 ch = *collationSource->pos++;
1542 if (ch == 0) {
1543 // Ran off end of buffer.
1544 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1545 // Ran off end of main string. backing up one character.
1546 collationSource->pos--;
1547 return UCOL_NO_MORE_CES;
1548 }
1549 else
1550 {
1551 // Hit null in the normalize side buffer.
1552 // Usually this means the end of the normalized data,
1553 // except for one odd case: a null followed by combining chars,
1554 // which is the case if we are at the start of the buffer.
1555 if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
1556 break;
1557 }
1558
1559 // Null marked end of side buffer.
1560 // Revert to the main string and
1561 // loop back to top to try again to get a character.
1562 collationSource->pos = collationSource->fcdPosition;
1563 collationSource->flags = collationSource->origFlags;
1564 continue;
1565 }
1566 }
1567 }
1568
1569 if(collationSource->flags&UCOL_HIRAGANA_Q) {
1570 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
1571 * based on whether the previous codepoint was Hiragana or Katakana.
1572 */
1573 if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
1574 ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
1575 collationSource->flags |= UCOL_WAS_HIRAGANA;
1576 } else {
1577 collationSource->flags &= ~UCOL_WAS_HIRAGANA;
1578 }
1579 }
1580
1581 // We've got a character. See if there's any fcd and/or normalization stuff to do.
1582 // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1583 if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
1584 break;
1585 }
1586
1587 if (collationSource->fcdPosition >= collationSource->pos) {
1588 // An earlier FCD check has already covered the current character.
1589 // We can go ahead and process this char.
1590 break;
1591 }
1592
1593 if (ch < ZERO_CC_LIMIT_ ) {
1594 // Fast fcd safe path. Trailing combining class == 0. This char is OK.
1595 break;
1596 }
1597
1598 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1599 // We need to peek at the next character in order to tell if we are FCD
1600 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
1601 // We are at the last char of source string.
1602 // It is always OK for FCD check.
1603 break;
1604 }
1605
1606 // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test
1607 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
1608 break;
1609 }
1610 }
1611
1612
1613 // Need a more complete FCD check and possible normalization.
1614 if (collIterFCD(collationSource)) {
1615 collIterNormalize(collationSource);
1616 }
1617 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1618 // No normalization was needed. Go ahead and process the char we already had.
1619 break;
1620 }
1621
1622 // Some normalization happened. Next loop iteration will pick up a char
1623 // from the normalization buffer.
1624
1625 } // end for (;;)
1626
1627
1628 if (ch <= 0xFF) {
1629 /* For latin-1 characters we never need to fall back to the UCA table */
1630 /* because all of the UCA data is replicated in the latinOneMapping array */
1631 order = coll->latinOneMapping[ch];
1632 if (order > UCOL_NOT_FOUND) {
1633 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
1634 }
1635 }
1636 else
1637 {
1638 // Always use UCA for Han, Hangul
1639 // (Han extension A is before main Han block)
1640 // **** Han compatibility chars ?? ****
1641 if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
1642 (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
1643 if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
1644 // between the two target ranges; do normal lookup
1645 // **** this range is YI, Modifier tone letters, ****
1646 // **** Latin-D, Syloti Nagari, Phagas-pa. ****
1647 // **** Latin-D might be tailored, so we need to ****
1648 // **** do the normal lookup for these guys. ****
1649 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1650 } else {
1651 // in one of the target ranges; use UCA
1652 order = UCOL_NOT_FOUND;
1653 }
1654 } else {
1655 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1656 }
1657
1658 if(order > UCOL_NOT_FOUND) { /* if a CE is special */
1659 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */
1660 }
1661
1662 if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */
1663 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1664 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1665
1666 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
1667 order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
1668 }
1669 }
1670 }
1671 } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
1672
1673 if(order == UCOL_NOT_FOUND) {
1674 order = getImplicit(ch, collationSource);
1675 }
1676 return order; /* return the CE */
1677}
1678
1679/* ucol_getNextCE, out-of-line version for use from other files. */
1680U_CAPI uint32_t U_EXPORT2
1681ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1682 return ucol_IGetNextCE(coll, collationSource, status);
1683}
1684
1685
1686/**
1687* Incremental previous normalization happens here. Pick up the range of chars
1688* identifed by FCD, normalize it into the collIterate's writable buffer,
1689* switch the collIterate's state to use the writable buffer.
1690* @param data collation iterator data
1691*/
1692static
1693void collPrevIterNormalize(collIterate *data)
1694{
1695 UErrorCode status = U_ZERO_ERROR;
1696 const UChar *pEnd = data->pos; /* End normalize + 1 */
1697 const UChar *pStart;
1698
1699 /* Start normalize */
1700 if (data->fcdPosition == NULL) {
1701 pStart = data->string;
1702 }
1703 else {
1704 pStart = data->fcdPosition + 1;
1705 }
1706
1707 int32_t normLen =
1708 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)),
1709 data->writableBuffer,
1710 status).
1711 length();
1712 if(U_FAILURE(status)) {
1713 return;
1714 }
1715 /*
1716 this puts the null termination infront of the normalized string instead
1717 of the end
1718 */
1719 data->writableBuffer.insert(0, (UChar)0);
1720
1721 /*
1722 * The usual case at this point is that we've got a base
1723 * character followed by marks that were normalized. If
1724 * fcdPosition is NULL, that means that we backed up to
1725 * the beginning of the string and there's no base character.
1726 *
1727 * Forward processing will usually normalize when it sees
1728 * the first mark, so that mark will get it's natural offset
1729 * and the rest will get the offset of the character following
1730 * the marks. The base character will also get its natural offset.
1731 *
1732 * We write the offset of the base character, if there is one,
1733 * followed by the offset of the first mark and then the offsets
1734 * of the rest of the marks.
1735 */
1736 int32_t firstMarkOffset = 0;
1737 int32_t trailOffset = (int32_t)(data->pos - data->string + 1);
1738 int32_t trailCount = normLen - 1;
1739
1740 if (data->fcdPosition != NULL) {
1741 int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
1742 UChar baseChar = *data->fcdPosition;
1743
1744 firstMarkOffset = baseOffset + 1;
1745
1746 /*
1747 * If the base character is the start of a contraction, forward processing
1748 * will normalize the marks while checking for the contraction, which means
1749 * that the offset of the first mark will the same as the other marks.
1750 *
1751 * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
1752 */
1753 if (baseChar >= 0x100) {
1754 uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
1755
1756 if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
1757 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
1758 }
1759
1760 if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
1761 firstMarkOffset = trailOffset;
1762 }
1763 }
1764
1765 data->appendOffset(baseOffset, status);
1766 }
1767
1768 data->appendOffset(firstMarkOffset, status);
1769
1770 for (int32_t i = 0; i < trailCount; i += 1) {
1771 data->appendOffset(trailOffset, status);
1772 }
1773
1774 data->offsetRepeatValue = trailOffset;
1775
1776 data->offsetReturn = data->offsetStore - 1;
1777 if (data->offsetReturn == data->offsetBuffer) {
1778 data->offsetStore = data->offsetBuffer;
1779 }
1780
1781 data->pos = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
1782 data->origFlags = data->flags;
1783 data->flags |= UCOL_ITER_INNORMBUF;
1784 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1785}
1786
1787
1788/**
1789* Incremental FCD check for previous iteration and normalize. Called from
1790* getPrevCE when normalization state is suspect.
1791* When entering, the state is known to be this:
1792* o We are working in the main buffer of the collIterate, not the side
1793* writable buffer. When in the side buffer, normalization mode is always
1794* off, so we won't get here.
1795* o The leading combining class from the current character is 0 or the
1796* trailing combining class of the previous char was zero.
1797* True because the previous call to this function will have always exited
1798* that way, and we get called for every char where cc might be non-zero.
1799* @param data collation iterate struct
1800* @return normalization status, TRUE for normalization to be done, FALSE
1801* otherwise
1802*/
1803static
1804inline UBool collPrevIterFCD(collIterate *data)
1805{
1806 const UChar *src, *start;
1807 uint8_t leadingCC;
1808 uint8_t trailingCC = 0;
1809 uint16_t fcd;
1810 UBool result = FALSE;
1811
1812 start = data->string;
1813 src = data->pos + 1;
1814
1815 /* Get the trailing combining class of the current character. */
1816 fcd = g_nfcImpl->previousFCD16(start, src);
1817
1818 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1819
1820 if (leadingCC != 0) {
1821 /*
1822 The current char has a non-zero leading combining class.
1823 Scan backward until we find a char with a trailing cc of zero.
1824 */
1825 for (;;)
1826 {
1827 if (start == src) {
1828 data->fcdPosition = NULL;
1829 return result;
1830 }
1831
1832 fcd = g_nfcImpl->previousFCD16(start, src);
1833
1834 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1835
1836 if (trailingCC == 0) {
1837 break;
1838 }
1839
1840 if (leadingCC < trailingCC) {
1841 result = TRUE;
1842 }
1843
1844 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1845 }
1846 }
1847
1848 data->fcdPosition = (UChar *)src;
1849
1850 return result;
1851}
1852
1853/** gets a code unit from the string at a given offset
1854 * Handles both normal and iterative cases.
1855 * No error checking - caller beware!
1856 */
1857static inline
1858UChar peekCodeUnit(collIterate *source, int32_t offset) {
1859 if(source->pos != NULL) {
1860 return *(source->pos + offset);
1861 } else if(source->iterator != NULL) {
1862 UChar32 c;
1863 if(offset != 0) {
1864 source->iterator->move(source->iterator, offset, UITER_CURRENT);
1865 c = source->iterator->next(source->iterator);
1866 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1867 } else {
1868 c = source->iterator->current(source->iterator);
1869 }
1870 return c >= 0 ? (UChar)c : 0xfffd; // If the caller works properly, we should never see c<0.
1871 } else {
1872 return 0xfffd;
1873 }
1874}
1875
1876// Code point version. Treats the offset as a _code point_ delta.
1877// We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16.
1878// We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer.
1879static inline
1880UChar32 peekCodePoint(collIterate *source, int32_t offset) {
1881 UChar32 c;
1882 if(source->pos != NULL) {
1883 const UChar *p = source->pos;
1884 if(offset >= 0) {
1885 // Skip forward over (offset-1) code points.
1886 while(--offset >= 0) {
1887 if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) {
1888 ++p;
1889 }
1890 }
1891 // Read the code point there.
1892 c = *p++;
1893 UChar trail;
1894 if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) {
1895 c = U16_GET_SUPPLEMENTARY(c, trail);
1896 }
1897 } else /* offset<0 */ {
1898 // Skip backward over (offset-1) code points.
1899 while(++offset < 0) {
1900 if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) {
1901 --p;
1902 }
1903 }
1904 // Read the code point before that.
1905 c = *--p;
1906 UChar lead;
1907 if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) {
1908 c = U16_GET_SUPPLEMENTARY(lead, c);
1909 }
1910 }
1911 } else if(source->iterator != NULL) {
1912 if(offset >= 0) {
1913 // Skip forward over (offset-1) code points.
1914 int32_t fwd = offset;
1915 while(fwd-- > 0) {
1916 uiter_next32(source->iterator);
1917 }
1918 // Read the code point there.
1919 c = uiter_current32(source->iterator);
1920 // Return to the starting point, skipping backward over (offset-1) code points.
1921 while(offset-- > 0) {
1922 uiter_previous32(source->iterator);
1923 }
1924 } else /* offset<0 */ {
1925 // Read backward, reading offset code points, remember only the last-read one.
1926 int32_t back = offset;
1927 do {
1928 c = uiter_previous32(source->iterator);
1929 } while(++back < 0);
1930 // Return to the starting position, skipping forward over offset code points.
1931 do {
1932 uiter_next32(source->iterator);
1933 } while(++offset < 0);
1934 }
1935 } else {
1936 c = U_SENTINEL;
1937 }
1938 return c;
1939}
1940
1941/**
1942* Determines if we are at the start of the data string in the backwards
1943* collation iterator
1944* @param data collation iterator
1945* @return TRUE if we are at the start
1946*/
1947static
1948inline UBool isAtStartPrevIterate(collIterate *data) {
1949 if(data->pos == NULL && data->iterator != NULL) {
1950 return !data->iterator->hasPrevious(data->iterator);
1951 }
1952 //return (collIter_bos(data)) ||
1953 return (data->pos == data->string) ||
1954 ((data->flags & UCOL_ITER_INNORMBUF) && (data->pos != NULL) &&
1955 *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1956}
1957
1958static
1959inline void goBackOne(collIterate *data) {
1960# if 0
1961 // somehow, it looks like we need to keep iterator synced up
1962 // at all times, as above.
1963 if(data->pos) {
1964 data->pos--;
1965 }
1966 if(data->iterator) {
1967 data->iterator->previous(data->iterator);
1968 }
1969#endif
1970 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1971 data->iterator->previous(data->iterator);
1972 }
1973 if(data->pos) {
1974 data->pos --;
1975 }
1976}
1977
1978/**
1979* Inline function that gets a simple CE.
1980* So what it does is that it will first check the expansion buffer. If the
1981* expansion buffer is not empty, ie the end pointer to the expansion buffer
1982* is different from the string pointer, we return the collation element at the
1983* return pointer and decrement it.
1984* For more complicated CEs it resorts to getComplicatedCE.
* (In this code the "complicated" path is ucol_prv_getSpecialPrevCE(); a value
* that is still UCOL_NOT_FOUND after the collator's own mapping and, when
* coll->UCA is set, the UCA mapping, is handed to getPrevImplicit().)
1985* @param coll collator data
1986* @param data collation iterator struct
1987* @param status error status
* @return the previous collation element, or UCOL_NO_MORE_CES when the start of
*         the text has been reached
1988*/
1989static
1990inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
1991 UErrorCode *status)
1992{
1993 uint32_t result = (uint32_t)UCOL_NULLORDER;
1994
// Step the offset bookkeeping back: use up a pending repeat first, otherwise
// move offsetReturn back one slot; once the first slot of offsetBuffer has
// been reached, offsetReturn is cleared and offsetStore is reset.
1995 if (data->offsetReturn != NULL) {
1996 if (data->offsetRepeatCount > 0) {
1997 data->offsetRepeatCount -= 1;
1998 } else {
1999 if (data->offsetReturn == data->offsetBuffer) {
2000 data->offsetReturn = NULL;
2001 data->offsetStore = data->offsetBuffer;
2002 } else {
2003 data->offsetReturn -= 1;
2004 }
2005 }
2006 }
2007
// CEs are still buffered (in extendCEs when it is set, otherwise in CEs):
// hand out the one just before toReturn.
2008 if ((data->extendCEs && data->toReturn > data->extendCEs) ||
2009 (!data->extendCEs && data->toReturn > data->CEs))
2010 {
2011 data->toReturn -= 1;
2012 result = *(data->toReturn);
// toReturn is back at the start of a CE buffer: restart CEpos there too.
2013 if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
2014 data->CEpos = data->toReturn;
2015 }
2016 }
2017 else {
2018 UChar ch = 0;
2019
2020 do {
2021 /*
2022 Loop handles case when incremental normalize switches to or from the
2023 side buffer / original string, and we need to start again to get the
2024 next character.
2025 */
2026 for (;;) {
2027 if (data->flags & UCOL_ITER_HASLEN) {
2028 /*
2029 Normal path for strings when length is specified.
2030 Not in side buffer because it is always null terminated.
2031 */
2032 if (data->pos <= data->string) {
2033 /* End of the main source string */
2034 return UCOL_NO_MORE_CES;
2035 }
2036 data->pos --;
2037 ch = *data->pos;
2038 }
2039 // we are using an iterator to go back. Pray for us!
2040 else if (data->flags & UCOL_USE_ITERATOR) {
2041 UChar32 iterCh = data->iterator->previous(data->iterator);
2042 if(iterCh == U_SENTINEL) {
2043 return UCOL_NO_MORE_CES;
2044 } else {
2045 ch = (UChar)iterCh;
2046 }
2047 }
2048 else {
2049 data->pos --;
2050 ch = *data->pos;
2051 /* we are in the side buffer. */
2052 if (ch == 0) {
2053 /*
2054 At the start of the normalize side buffer.
2055 Go back to string.
2056 Because pointer points to the last accessed character,
2057 hence we have to increment it by one here.
2058 */
2059 data->flags = data->origFlags;
2060 data->offsetRepeatValue = 0;
2061
2062 if (data->fcdPosition == NULL) {
2063 data->pos = data->string;
2064 return UCOL_NO_MORE_CES;
2065 }
2066 else {
2067 data->pos = data->fcdPosition + 1;
2068 }
2069
2070 continue;
2071 }
2072 }
2073
// When UCOL_HIRAGANA_Q is set, record in the flags whether this code unit
// lies in 0x3040..0x309f.
2074 if(data->flags&UCOL_HIRAGANA_Q) {
2075 if(ch>=0x3040 && ch<=0x309f) {
2076 data->flags |= UCOL_WAS_HIRAGANA;
2077 } else {
2078 data->flags &= ~UCOL_WAS_HIRAGANA;
2079 }
2080 }
2081
2082 /*
2083 * got a character to determine if there's fcd and/or normalization
2084 * stuff to do.
2085 * if the current character is not fcd.
2086 * if current character is at the start of the string
2087 * Trailing combining class == 0.
2088 * Note if pos is in the writablebuffer, norm is always 0
2089 */
2090 if (ch < ZERO_CC_LIMIT_ ||
2091 // this should propel us out of the loop in the iterator case
2092 (data->flags & UCOL_ITER_NORM) == 0 ||
2093 (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
2094 || data->string == data->pos) {
2095 break;
2096 }
2097
2098 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
2099 /* if next character is FCD */
2100 if (data->pos == data->string) {
2101 /* First char of string is always OK for FCD check */
2102 break;
2103 }
2104
2105 /* Not first char of string, do the FCD fast test */
2106 if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
2107 break;
2108 }
2109 }
2110
2111 /* Need a more complete FCD check and possible normalization. */
2112 if (collPrevIterFCD(data)) {
2113 collPrevIterNormalize(data);
2114 }
2115
2116 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2117 /* No normalization. Go ahead and process the char. */
2118 break;
2119 }
2120
2121 /*
2122 Some normalization happened.
2123 Next loop picks up a char from the normalization buffer.
2124 */
2125 }
2126
2127 /* attempt to handle contractions, after removal of the backwards
2128 contraction
2129 */
2130 if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
2131 result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
2132 } else {
// Code units up to 0xFF are looked up in the collator's latinOneMapping
// table; everything else goes through the collator's trie.
2133 if (ch <= 0xFF) {
2134 result = coll->latinOneMapping[ch];
2135 }
2136 else {
2137 // Always use UCA for [3400..9FFF], [AC00..D7AF]
2138 // **** [FA0E..FA2F] ?? ****
2139 if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
2140 (ch >= 0x3400 && ch <= 0xD7AF)) {
2141 if (ch > 0x9FFF && ch < 0xAC00) {
2142 // between the two target ranges; do normal lookup
2143 // **** this range is YI, Modifier tone letters, ****
2144 // **** Latin-D, Syloti Nagari, Phagas-pa. ****
2145 // **** Latin-D might be tailored, so we need to ****
2146 // **** do the normal lookup for these guys. ****
2147 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
2148 } else {
2149 result = UCOL_NOT_FOUND;
2150 }
2151 } else {
2152 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
2153 }
2154 }
// Values above UCOL_NOT_FOUND are special CEs and need further resolution.
2155 if (result > UCOL_NOT_FOUND) {
2156 result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
2157 }
2158 if (result == UCOL_NOT_FOUND) { // Not found in master list
2159 if (!isAtStartPrevIterate(data) &&
2160 ucol_contractionEndCP(ch, data->coll))
2161 {
2162 result = UCOL_CONTRACTION;
2163 } else {
2164 if(coll->UCA) {
2165 result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
2166 }
2167 }
2168
2169 if (result > UCOL_NOT_FOUND) {
2170 if(coll->UCA) {
2171 result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
2172 }
2173 }
2174 }
2175 }
// Fetch another character when a code unit in the
// UCOL_FIRST_HANGUL..UCOL_LAST_HANGUL range produced UCOL_IGNORABLE.
2176 } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
2177
// Still unmapped after all lookups: build the CE with getPrevImplicit().
2178 if(result == UCOL_NOT_FOUND) {
2179 result = getPrevImplicit(ch, data);
2180 }
2181 }
2182
2183 return result;
2184}
2185
2186
2187/* ucol_getPrevCE, out-of-line version for use from other files. */
2188U_CFUNC uint32_t U_EXPORT2
2189ucol_getPrevCE(const UCollator *coll, collIterate *data,
2190 UErrorCode *status) {
2191 return ucol_IGetPrevCE(coll, data, status);
2192}
2193
2194
2195/* this should be connected to special Jamo handling */
2196U_CFUNC uint32_t U_EXPORT2
2197ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
2198 collIterate colIt;
2199 IInit_collIterate(coll, &u, 1, &colIt, status);
2200 if(U_FAILURE(*status)) {
2201 return 0;
2202 }
2203 return ucol_IGetNextCE(coll, &colIt, status);
2204}
2205
2206/**
2207* Inserts the argument character into the end of the buffer pushing back the
2208* null terminator.
2209* @param data collIterate struct data
2210* @param ch character to be appended
2211* @return the position of the new addition
2212*/
2213static
2214inline const UChar * insertBufferEnd(collIterate *data, UChar ch)
2215{
2216 int32_t oldLength = data->writableBuffer.length();
2217 return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength;
2218}
2219
2220/**
2221* Inserts the argument string into the end of the buffer pushing back the
2222* null terminator.
2223* @param data collIterate struct data
2224* @param string to be appended
2225* @param length of the string to be appended
2226* @return the position of the new addition
2227*/
2228static
2229inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length)
2230{
2231 int32_t oldLength = data->writableBuffer.length();
2232 return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength;
2233}
2234
2235/**
2236* Special normalization function for contraction in the forwards iterator.
2237* This normalization sequence will place the current character at source->pos
2238* and its following normalized sequence into the buffer.
2239* The fcd position, pos will be changed.
2240* pos will now point to positions in the buffer.
2241* Flags will be changed accordingly.
2242* @param data collation iterator data
2243*/
2244static
2245inline void normalizeNextContraction(collIterate *data)
2246{
2247 int32_t strsize;
2248 UErrorCode status = U_ZERO_ERROR;
2249 /* because the pointer points to the next character */
2250 const UChar *pStart = data->pos - 1;
2251 const UChar *pEnd;
2252
2253 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2254 data->writableBuffer.setTo(*(pStart - 1));
2255 strsize = 1;
2256 }
2257 else {
2258 strsize = data->writableBuffer.length();
2259 }
2260
2261 pEnd = data->fcdPosition;
2262
2263 data->writableBuffer.append(
2264 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status));
2265 if(U_FAILURE(status)) {
2266 return;
2267 }
2268
2269 data->pos = data->writableBuffer.getTerminatedBuffer() + strsize;
2270 data->origFlags = data->flags;
2271 data->flags |= UCOL_ITER_INNORMBUF;
2272 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2273}
2274
2275/**
2276* Contraction character management function that returns the next character
2277* for the forwards iterator.
2278* Does nothing if the next character is in buffer and not the first character
2279* in it.
2280* Else it checks next character in data string to see if it is normalizable.
2281* If it is not, the character is simply copied into the buffer, else
2282* the whole normalized substring is copied into the buffer, including the
2283* current character.
2284* @param data collation element iterator data
2285* @return next character
2286*/
2287static
2288inline UChar getNextNormalizedChar(collIterate *data)
2289{
2290 UChar nextch;
2291 UChar ch;
2292 // Here we need to add the iterator code. One problem is the way
2293 // end of string is handled. If we just return next char, it could
2294 // be the sentinel. Most of the cases already check for this, but we
2295 // need to be sure.
2296 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2297 /* if no normalization and not in buffer. */
2298 if(data->flags & UCOL_USE_ITERATOR) {
2299 return (UChar)data->iterator->next(data->iterator);
2300 } else {
2301 return *(data->pos ++);
2302 }
2303 }
2304
2305 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2306 //normalizeIterator(data);
2307 //}
2308
2309 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2310 if ((innormbuf && *data->pos != 0) ||
2311 (data->fcdPosition != NULL && !innormbuf &&
2312 data->pos < data->fcdPosition)) {
2313 /*
2314 if next character is in normalized buffer, no further normalization
2315 is required
2316 */
2317 return *(data->pos ++);
2318 }
2319
2320 if (data->flags & UCOL_ITER_HASLEN) {
2321 /* in data string */
2322 if (data->pos + 1 == data->endp) {
2323 return *(data->pos ++);
2324 }
2325 }
2326 else {
2327 if (innormbuf) {
2328 // inside the normalization buffer, but at the end
2329 // (since we encountered zero). This means, in the
2330 // case we're using char iterator, that we need to
2331 // do another round of normalization.
2332 //if(data->origFlags & UCOL_USE_ITERATOR) {
2333 // we need to restore original flags,
2334 // otherwise, we'll lose them
2335 //data->flags = data->origFlags;
2336 //normalizeIterator(data);
2337 //return *(data->pos++);
2338 //} else {
2339 /*
2340 in writable buffer, at this point fcdPosition can not be
2341 pointing to the end of the data string. see contracting tag.
2342 */
2343 if(data->fcdPosition) {
2344 if (*(data->fcdPosition + 1) == 0 ||
2345 data->fcdPosition + 1 == data->endp) {
2346 /* at the end of the string, dump it into the normalizer */
2347 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1;
2348 // Check if data->pos received a null pointer
2349 if (data->pos == NULL) {
2350 return (UChar)-1; // Return to indicate error.
2351 }
2352 return *(data->fcdPosition ++);
2353 }
2354 data->pos = data->fcdPosition;
2355 } else if(data->origFlags & UCOL_USE_ITERATOR) {
2356 // if we are here, we're using a normalizing iterator.
2357 // we should just continue further.
2358 data->flags = data->origFlags;
2359 data->pos = NULL;
2360 return (UChar)data->iterator->next(data->iterator);
2361 }
2362 //}
2363 }
2364 else {
2365 if (*(data->pos + 1) == 0) {
2366 return *(data->pos ++);
2367 }
2368 }
2369 }
2370
2371 ch = *data->pos ++;
2372 nextch = *data->pos;
2373
2374 /*
2375 * if the current character is not fcd.
2376 * Trailing combining class == 0.
2377 */
2378 if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2379 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2380 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2381 /*
2382 Need a more complete FCD check and possible normalization.
2383 normalize substring will be appended to buffer
2384 */
2385 if (collIterFCD(data)) {
2386 normalizeNextContraction(data);
2387 return *(data->pos ++);
2388 }
2389 else if (innormbuf) {
2390 /* fcdposition shifted even when there's no normalization, if we
2391 don't input the rest into this, we'll get the wrong position when
2392 we reach the end of the writableBuffer */
2393 int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
2394 data->pos = insertBufferEnd(data, data->pos - 1, length);
2395 // Check if data->pos received a null pointer
2396 if (data->pos == NULL) {
2397 return (UChar)-1; // Return to indicate error.
2398 }
2399 return *(data->pos ++);
2400 }
2401 }
2402
2403 if (innormbuf) {
2404 /*
2405 no normalization is to be done hence only one character will be
2406 appended to the buffer.
2407 */
2408 data->pos = insertBufferEnd(data, ch) + 1;
2409 // Check if data->pos received a null pointer
2410 if (data->pos == NULL) {
2411 return (UChar)-1; // Return to indicate error.
2412 }
2413 }
2414
2415 /* points back to the pos in string */
2416 return ch;
2417}
2418
2419
2420
2421/**
2422* Function to copy the buffer into writableBuffer and sets the fcd position to
2423* the correct position
2424* @param source data string source
2425* @param buffer character buffer
2426*/
2427static
2428inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
2429{
2430 /* okay confusing part here. to ensure that the skipped characters are
2431 considered later, we need to place it in the appropriate position in the
2432 normalization buffer and reassign the pos pointer. simple case if pos
2433 reside in string, simply copy to normalization buffer and
2434 fcdposition = pos, pos = start of normalization buffer. if pos in
2435 normalization buffer, we'll insert the copy infront of pos and point pos
2436 to the start of the normalization buffer. why am i doing these copies?
2437 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2438 not require any changes, which be really painful. */
2439 if (source->flags & UCOL_ITER_INNORMBUF) {
2440 int32_t replaceLength = source->pos - source->writableBuffer.getBuffer();
2441 source->writableBuffer.replace(0, replaceLength, buffer);
2442 }
2443 else {
2444 source->fcdPosition = source->pos;
2445 source->origFlags = source->flags;
2446 source->flags |= UCOL_ITER_INNORMBUF;
2447 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2448 source->writableBuffer = buffer;
2449 }
2450
2451 source->pos = source->writableBuffer.getTerminatedBuffer();
2452}
2453
2454/**
2455* Function to get the discontiguos collation element within the source.
2456* Note this function will set the position to the appropriate places.
* Characters that do not take part in a match are collected in a local buffer
* and handed to setDiscontiguosAttribute() when a match is returned. If no
* match is found, the iterator state saved on entry is restored, the iterator
* is moved back by one and the CE stored at constart is returned.
2457* @param coll current collator used
2458* @param source data string source
2459* @param constart index to the start character in the contraction table
2460* @return discontiguos collation element offset
2461*/
2462static
2463uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
2464 const UChar *constart)
2465{
2466 /* source->pos currently points to the second combining character after
2467 the start character */
2468 const UChar *temppos = source->pos;
2469 UnicodeString buffer;
2470 const UChar *tempconstart = constart;
2471 uint8_t tempflags = source->flags;
2472 UBool multicontraction = FALSE;
2473 collIterateState discState;
2474
// Snapshot the iterator so it can be restored if no match is found.
2475 backupState(source, &discState);
2476
// Seed the skipped-character buffer with the code point just before the
// current position.
2477 buffer.setTo(peekCodePoint(source, -1));
2478 for (;;) {
2479 UChar *UCharOffset;
2480 UChar schar,
2481 tchar;
2482 uint32_t result;
2483
// Stop scanning at the end of the text, or when the next code point has
// combining class 0.
2484 if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
2485 || (peekCodeUnit(source, 0) == 0 &&
2486 //|| (*source->pos == 0 &&
2487 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
2488 source->fcdPosition == NULL ||
2489 source->fcdPosition == source->endp ||
2490 *(source->fcdPosition) == 0 ||
2491 u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
2492 /* end of string in null terminated string or stopped by a
2493 null character, note fcd does not always point to a base
2494 character after the discontiguos change */
2495 u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
2496 //u_getCombiningClass(*(source->pos)) == 0) {
2497 //constart = (UChar *)coll->image + getContractOffset(CE);
2498 if (multicontraction) {
2499 source->pos = temppos - 1;
2500 setDiscontiguosAttribute(source, buffer);
2501 return *(coll->contractionCEs +
2502 (tempconstart - coll->contractionIndex));
2503 }
2504 constart = tempconstart;
2505 break;
2506 }
2507
2508 UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
2509 schar = getNextNormalizedChar(source);
2510
// Advance through the table entries until one is not smaller than schar.
// NOTE(review): the loop has no explicit bound; presumably the table ends
// with an entry that no schar exceeds - confirm against the table builder.
2511 while (schar > (tchar = *UCharOffset)) {
2512 UCharOffset++;
2513 }
2514
2515 if (schar != tchar) {
2516 /* not the correct codepoint. we stuff the current codepoint into
2517 the discontiguos buffer and try the next character */
2518 buffer.append(schar);
2519 continue;
2520 }
2521 else {
// A table hit is not used when schar has the same combining class as the
// code point peeked at offset -2; schar is stashed and the scan goes on.
2522 if (u_getCombiningClass(schar) ==
2523 u_getCombiningClass(peekCodePoint(source, -2))) {
2524 buffer.append(schar);
2525 continue;
2526 }
2527 result = *(coll->contractionCEs +
2528 (UCharOffset - coll->contractionIndex));
2529 }
2530
2531 if (result == UCOL_NOT_FOUND) {
2532 break;
2533 } else if (isContraction(result)) {
2534 /* this is a multi-contraction*/
2535 tempconstart = (UChar *)coll->image + getContractOffset(result);
2536 if (*(coll->contractionCEs + (constart - coll->contractionIndex))
2537 != UCOL_NOT_FOUND) {
2538 multicontraction = TRUE;
2539 temppos = source->pos + 1;
2540 }
2541 } else {
2542 setDiscontiguosAttribute(source, buffer);
2543 return result;
2544 }
2545 }
2546
2547 /* no problems simply reverting just like that,
2548 if we are in string before getting into this function, points back to
2549 string hence no problem.
2550 if we are in normalization buffer before getting into this function,
2551 since we'll never use another normalization within this function, we
2552 know that fcdposition points to a base character. the normalization buffer
2553 never change, hence this revert works. */
2554 loadState(source, &discState, TRUE);
2555 goBackOne(source);
2556
2557 //source->pos = temppos - 1;
2558 source->flags = tempflags;
2559 return *(coll->contractionCEs + (constart - coll->contractionIndex));
2560}
2561
2562/* now uses Mark's getImplicitPrimary code */
2563static
2564inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2565 uint32_t r = uprv_uca_getImplicitPrimary(cp);
2566 *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2567 collationSource->offsetRepeatCount += 1;
2568 return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2569}
2570
2571/**
2572* Inserts the argument character into the front of the buffer replacing the
2573* front null terminator.
2574* @param data collation element iterator data
2575* @param ch character to be appended
2576*/
2577static
2578inline void insertBufferFront(collIterate *data, UChar ch)
2579{
2580 data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
2581}
2582
2583/**
2584* Special normalization function for contraction in the previous iterator.
2585* This normalization sequence will place the current character at source->pos
2586* and its following normalized sequence into the buffer.
2587* The fcd position, pos will be changed.
2588* pos will now point to positions in the buffer.
2589* Flags will be changed accordingly.
2590* @param data collation iterator data
2591*/
2592static
2593inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
2594{
2595 const UChar *pEnd = data->pos + 1; /* End normalize + 1 */
2596 const UChar *pStart;
2597
2598 UnicodeString endOfBuffer;
2599 if (data->flags & UCOL_ITER_HASLEN) {
2600 /*
2601 normalization buffer not used yet, we'll pull down the next
2602 character into the end of the buffer
2603 */
2604 endOfBuffer.setTo(*pEnd);
2605 }
2606 else {
2607 endOfBuffer.setTo(data->writableBuffer, 1); // after the leading NUL
2608 }
2609
2610 if (data->fcdPosition == NULL) {
2611 pStart = data->string;
2612 }
2613 else {
2614 pStart = data->fcdPosition + 1;
2615 }
2616 int32_t normLen =
2617 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)),
2618 data->writableBuffer,
2619 *status).
2620 length();
2621 if(U_FAILURE(*status)) {
2622 return;
2623 }
2624 /*
2625 this puts the null termination infront of the normalized string instead
2626 of the end
2627 */
2628 data->pos =
2629 data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() +
2630 1 + normLen;
2631 data->origFlags = data->flags;
2632 data->flags |= UCOL_ITER_INNORMBUF;
2633 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2634}
2635
2636/**
2637* Contraction character management function that returns the previous character
2638* for the backwards iterator.
2639* Does nothing if the previous character is in buffer and not the first
2640* character in it.
2641* Else it checks previous character in data string to see if it is
2642* normalizable.
2643* If it is not, the character is simply copied into the buffer, else
2644* the whole normalized substring is copied into the buffer, including the
2645* current character.
* On the paths that simply return *(data->pos - 1), data->pos is not moved
* by this function.
2646* @param data collation element iterator data
* @param status error code, passed on to normalizePrevContraction()
2647* @return previous character
2648*/
2649static
2650inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
2651{
2652 UChar prevch;
2653 UChar ch;
2654 const UChar *start;
2655 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2656 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
2657 (innormbuf && *(data->pos - 1) != 0)) {
2658 /*
2659 if no normalization.
2660 if previous character is in normalized buffer, no further normalization
2661 is required
2662 */
2663 if(data->flags & UCOL_USE_ITERATOR) {
// Iterator source: move back one code unit and read it with next().
2664 data->iterator->move(data->iterator, -1, UITER_CURRENT);
2665 return (UChar)data->iterator->next(data->iterator);
2666 } else {
2667 return *(data->pos - 1);
2668 }
2669 }
2670
2671 start = data->pos;
2672 if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
2673 /* in data string */
2674 if ((start - 1) == data->string) {
2675 return *(start - 1);
2676 }
2677 start --;
2678 ch = *start;
2679 prevch = *(start - 1);
2680 }
2681 else {
2682 /*
2683 in writable buffer, at this point fcdPosition can not be NULL.
2684 see contracting tag.
2685 */
2686 if (data->fcdPosition == data->string) {
2687 /* at the start of the string, just dump it into the normalizer */
2688 insertBufferFront(data, *(data->fcdPosition));
2689 data->fcdPosition = NULL;
2690 return *(data->pos - 1);
2691 }
2692 start = data->fcdPosition;
2693 ch = *start;
2694 prevch = *(start - 1);
2695 }
2696 /*
2697 * if the current character is not fcd.
2698 * Trailing combining class == 0.
2699 */
// NOTE(review): on the writable-buffer path above start was just set to
// data->fcdPosition, so the "fcdPosition > start" test below cannot hold
// there - confirm that this is intended.
2700 if (data->fcdPosition > start &&
2701 (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
2702 {
2703 /*
2704 Need a more complete FCD check and possible normalization.
2705 normalize substring will be appended to buffer
2706 */
// collPrevIterFCD() works from data->pos, so pos is pointed at start for the
// check and put back afterwards when no normalization is needed.
2707 const UChar *backuppos = data->pos;
2708 data->pos = start;
2709 if (collPrevIterFCD(data)) {
2710 normalizePrevContraction(data, status);
2711 return *(data->pos - 1);
2712 }
2713 data->pos = backuppos;
2714 data->fcdPosition ++;
2715 }
2716
2717 if (innormbuf) {
2718 /*
2719 no normalization is to be done hence only one character will be
2720 appended to the buffer.
2721 */
2722 insertBufferFront(data, ch);
2723 data->fcdPosition --;
2724 }
2725
2726 return ch;
2727}
2728
2729/* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2730/* It is called by getNextCE */
2731
2732/* The following should be even */
2733#define UCOL_MAX_DIGITS_FOR_NUMBER 254
2734
2735uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2736 collIterateState entryState;
2737 backupState(source, &entryState);
2738 UChar32 cp = ch;
2739
2740 for (;;) {
2741 // This loop will repeat only in the case of contractions, and only when a contraction
2742 // is found and the first CE resulting from that contraction is itself a special
2743 // (an expansion, for example.) All other special CE types are fully handled the
2744 // first time through, and the loop exits.
2745
2746 const uint32_t *CEOffset = NULL;
2747 switch(getCETag(CE)) {
2748 case NOT_FOUND_TAG:
2749 /* This one is not found, and we'll let somebody else bother about it... no more games */
2750 return CE;
2751 case SPEC_PROC_TAG:
2752 {
2753 // Special processing is getting a CE that is preceded by a certain prefix
2754 // Currently this is only needed for optimizing Japanese length and iteration marks.
2755 // When we encouter a special processing tag, we go backwards and try to see if
2756 // we have a match.
2757 // Contraction tables are used - so the whole process is not unlike contraction.
2758 // prefix data is stored backwards in the table.
2759 const UChar *UCharOffset;
2760 UChar schar, tchar;
2761 collIterateState prefixState;
2762 backupState(source, &prefixState);
2763 loadState(source, &entryState, TRUE);
2764 goBackOne(source); // We want to look at the point where we entered - actually one
2765 // before that...
2766
2767 for(;;) {
2768 // This loop will run once per source string character, for as long as we
2769 // are matching a potential contraction sequence
2770
2771 // First we position ourselves at the begining of contraction sequence
2772 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2773 if (collIter_bos(source)) {
2774 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2775 break;
2776 }
2777 schar = getPrevNormalizedChar(source, status);
2778 goBackOne(source);
2779
2780 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2781 UCharOffset++;
2782 }
2783
2784 if (schar == tchar) {
2785 // Found the source string char in the table.
2786 // Pick up the corresponding CE from the table.
2787 CE = *(coll->contractionCEs +
2788 (UCharOffset - coll->contractionIndex));
2789 }
2790 else
2791 {
2792 // Source string char was not in the table.
2793 // We have not found the prefix.
2794 CE = *(coll->contractionCEs +
2795 (ContractionStart - coll->contractionIndex));
2796 }
2797
2798 if(!isPrefix(CE)) {
2799 // The source string char was in the contraction table, and the corresponding
2800 // CE is not a prefix CE. We found the prefix, break
2801 // out of loop, this CE will end up being returned. This is the normal
2802 // way out of prefix handling when the source actually contained
2803 // the prefix.
2804 break;
2805 }
2806 }
2807 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
2808 loadState(source, &prefixState, TRUE);
2809 if(source->origFlags & UCOL_USE_ITERATOR) {
2810 source->flags = source->origFlags;
2811 }
2812 } else { // prefix search was a failure, we have to backup all the way to the start
2813 loadState(source, &entryState, TRUE);
2814 }
2815 break;
2816 }
2817 case CONTRACTION_TAG:
2818 {
2819 /* This should handle contractions */
2820 collIterateState state;
2821 backupState(source, &state);
2822 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
2823 const UChar *UCharOffset;
2824 UChar schar, tchar;
2825
2826 for (;;) {
2827 /* This loop will run once per source string character, for as long as we */
2828 /* are matching a potential contraction sequence */
2829
2830 /* First we position ourselves at the begining of contraction sequence */
2831 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2832
2833 if (collIter_eos(source)) {
2834 // Ran off the end of the source string.
2835 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2836 // So we'll pick whatever we have at the point...
2837 if (CE == UCOL_NOT_FOUND) {
2838 // back up the source over all the chars we scanned going into this contraction.
2839 CE = firstCE;
2840 loadState(source, &state, TRUE);
2841 if(source->origFlags & UCOL_USE_ITERATOR) {
2842 source->flags = source->origFlags;
2843 }
2844 }
2845 break;
2846 }
2847
2848 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2849 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2850
2851 schar = getNextNormalizedChar(source);
2852 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2853 UCharOffset++;
2854 }
2855
2856 if (schar == tchar) {
2857 // Found the source string char in the contraction table.
2858 // Pick up the corresponding CE from the table.
2859 CE = *(coll->contractionCEs +
2860 (UCharOffset - coll->contractionIndex));
2861 }
2862 else
2863 {
2864 // Source string char was not in contraction table.
2865 // Unless we have a discontiguous contraction, we have finished
2866 // with this contraction.
2867 // in order to do the proper detection, we
2868 // need to see if we're dealing with a supplementary
2869 /* We test whether the next two char are surrogate pairs.
2870 * This test is done if the iterator is not NULL.
2871 * If there is no surrogate pair, the iterator
2872 * goes back one if needed. */
2873 UChar32 miss = schar;
2874 if (source->iterator) {
2875 UChar32 surrNextChar; /* the next char in the iteration to test */
2876 int32_t prevPos; /* holds the previous position before move forward of the source iterator */
2877 if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
2878 prevPos = source->iterator->index;
2879 surrNextChar = getNextNormalizedChar(source);
2880 if (U16_IS_TRAIL(surrNextChar)) {
2881 miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
2882 } else if (prevPos < source->iterator->index){
2883 goBackOne(source);
2884 }
2885 }
2886 } else if (U16_IS_LEAD(schar)) {
2887 miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
2888 }
2889
2890 uint8_t sCC;
2891 if (miss < 0x300 ||
2892 maxCC == 0 ||
2893 (sCC = i_getCombiningClass(miss, coll)) == 0 ||
2894 sCC>maxCC ||
2895 (allSame != 0 && sCC == maxCC) ||
2896 collIter_eos(source))
2897 {
2898 // Contraction can not be discontiguous.
2899 goBackOne(source); // back up the source string by one,
2900 // because the character we just looked at was
2901 // not part of the contraction. */
2902 if(U_IS_SUPPLEMENTARY(miss)) {
2903 goBackOne(source);
2904 }
2905 CE = *(coll->contractionCEs +
2906 (ContractionStart - coll->contractionIndex));
2907 } else {
2908 //
2909 // Contraction is possibly discontiguous.
2910 // Scan more of source string looking for a match
2911 //
2912 UChar tempchar;
2913 /* find the next character if schar is not a base character
2914 and we are not yet at the end of the string */
2915 tempchar = getNextNormalizedChar(source);
2916 // probably need another supplementary thingie here
2917 goBackOne(source);
2918 if (i_getCombiningClass(tempchar, coll) == 0) {
2919 goBackOne(source);
2920 if(U_IS_SUPPLEMENTARY(miss)) {
2921 goBackOne(source);
2922 }
2923 /* Spit out the last char of the string, wasn't tasty enough */
2924 CE = *(coll->contractionCEs +
2925 (ContractionStart - coll->contractionIndex));
2926 } else {
2927 CE = getDiscontiguous(coll, source, ContractionStart);
2928 }
2929 }
2930 } // else after if(schar == tchar)
2931
2932 if(CE == UCOL_NOT_FOUND) {
2933 /* The Source string did not match the contraction that we were checking. */
2934 /* Back up the source position to undo the effects of having partially */
2935 /* scanned through what ultimately proved to not be a contraction. */
2936 loadState(source, &state, TRUE);
2937 CE = firstCE;
2938 break;
2939 }
2940
2941 if(!isContraction(CE)) {
2942 // The source string char was in the contraction table, and the corresponding
2943 // CE is not a contraction CE. We completed the contraction, break
2944 // out of loop, this CE will end up being returned. This is the normal
2945 // way out of contraction handling when the source actually contained
2946 // the contraction.
2947 break;
2948 }
2949
2950
2951 // The source string char was in the contraction table, and the corresponding
2952 // CE is IS a contraction CE. We will continue looping to check the source
2953 // string for the remaining chars in the contraction.
2954 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2955 if(tempCE != UCOL_NOT_FOUND) {
2956 // We have scanned a a section of source string for which there is a
2957 // CE from the contraction table. Remember the CE and scan position, so
2958 // that we can return to this point if further scanning fails to
2959 // match a longer contraction sequence.
2960 firstCE = tempCE;
2961
2962 goBackOne(source);
2963 backupState(source, &state);
2964 getNextNormalizedChar(source);
2965
2966 // Another way to do this is:
2967 //collIterateState tempState;
2968 //backupState(source, &tempState);
2969 //goBackOne(source);
2970 //backupState(source, &state);
2971 //loadState(source, &tempState, TRUE);
2972
2973 // The problem is that for incomplete contractions we have to remember the previous
2974 // position. Before, the only thing I needed to do was state.pos--;
2975 // After iterator introduction and especially after introduction of normalizing
2976 // iterators, it became much more difficult to decrease the saved state.
2977 // I'm not yet sure which of the two methods above is faster.
2978 }
2979 } // for(;;)
2980 break;
2981 } // case CONTRACTION_TAG:
2982 case LONG_PRIMARY_TAG:
2983 {
2984 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
2985 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
2986 source->offsetRepeatCount += 1;
2987 return CE;
2988 }
2989 case EXPANSION_TAG:
2990 {
2991 /* This should handle expansion. */
2992 /* NOTE: we can encounter both continuations and expansions in an expansion! */
2993 /* I have to decide where continuations are going to be dealt with */
2994 uint32_t size;
2995 uint32_t i; /* general counter */
2996
2997 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
2998 size = getExpansionCount(CE);
2999 CE = *CEOffset++;
3000 //source->offsetRepeatCount = -1;
3001
3002 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
3003 for(i = 1; i<size; i++) {
3004 *(source->CEpos++) = *CEOffset++;
3005 source->offsetRepeatCount += 1;
3006 }
3007 } else { /* else, we do */
3008 while(*CEOffset != 0) {
3009 *(source->CEpos++) = *CEOffset++;
3010 source->offsetRepeatCount += 1;
3011 }
3012 }
3013
3014 return CE;
3015 }
3016 case DIGIT_TAG:
3017 {
3018 /*
3019 We do a check to see if we want to collate digits as numbers; if so we generate
3020 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3021 */
3022 //uint32_t size;
3023 uint32_t i; /* general counter */
3024
3025 if (source->coll->numericCollation == UCOL_ON){
3026 collIterateState digitState = {0,0,0,0,0,0,0,0,0};
3027 UChar32 char32 = 0;
3028 int32_t digVal = 0;
3029
3030 uint32_t digIndx = 0;
3031 uint32_t endIndex = 0;
3032 uint32_t trailingZeroIndex = 0;
3033
3034 uint8_t collateVal = 0;
3035
3036 UBool nonZeroValReached = FALSE;
3037
3038 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
3039 /*
3040 We parse the source string until we hit a char that's NOT a digit.
3041 Use this u_charDigitValue. This might be slow because we have to
3042 handle surrogates...
3043 */
3044 /*
3045 if (U16_IS_LEAD(ch)){
3046 if (!collIter_eos(source)) {
3047 backupState(source, &digitState);
3048 UChar trail = getNextNormalizedChar(source);
3049 if(U16_IS_TRAIL(trail)) {
3050 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3051 } else {
3052 loadState(source, &digitState, TRUE);
3053 char32 = ch;
3054 }
3055 } else {
3056 char32 = ch;
3057 }
3058 } else {
3059 char32 = ch;
3060 }
3061 digVal = u_charDigitValue(char32);
3062 */
3063 digVal = u_charDigitValue(cp); // if we have arrived here, we have
3064 // already processed possible supplementaries that trigered the digit tag -
3065 // all supplementaries are marked in the UCA.
3066 /*
3067 We pad a zero in front of the first element anyways. This takes
3068 care of the (probably) most common case where people are sorting things followed
3069 by a single digit
3070 */
3071 digIndx++;
3072 for(;;){
3073 // Make sure we have enough space. No longer needed;
3074 // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
3075 // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
3076 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
3077
3078 // Skipping over leading zeroes.
3079 if (digVal != 0) {
3080 nonZeroValReached = TRUE;
3081 }
3082 if (nonZeroValReached) {
3083 /*
3084 We parse the digit string into base 100 numbers (this fits into a byte).
3085 We only add to the buffer in twos, thus if we are parsing an odd character,
3086 that serves as the 'tens' digit while the if we are parsing an even one, that
3087 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3088 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3089 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3090 than all the other bytes.
3091 */
3092
3093 if (digIndx % 2 == 1){
3094 collateVal += (uint8_t)digVal;
3095
3096 // We don't enter the low-order-digit case unless we've already seen
3097 // the high order, or for the first digit, which is always non-zero.
3098 if (collateVal != 0)
3099 trailingZeroIndex = 0;
3100
3101 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3102 collateVal = 0;
3103 }
3104 else{
3105 // We drop the collation value into the buffer so if we need to do
3106 // a "front patch" we don't have to check to see if we're hitting the
3107 // last element.
3108 collateVal = (uint8_t)(digVal * 10);
3109
3110 // Check for trailing zeroes.
3111 if (collateVal == 0)
3112 {
3113 if (!trailingZeroIndex)
3114 trailingZeroIndex = (digIndx/2) + 2;
3115 }
3116 else
3117 trailingZeroIndex = 0;
3118
3119 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3120 }
3121 digIndx++;
3122 }
3123
3124 // Get next character.
3125 if (!collIter_eos(source)){
3126 ch = getNextNormalizedChar(source);
3127 if (U16_IS_LEAD(ch)){
3128 if (!collIter_eos(source)) {
3129 backupState(source, &digitState);
3130 UChar trail = getNextNormalizedChar(source);
3131 if(U16_IS_TRAIL(trail)) {
3132 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3133 } else {
3134 loadState(source, &digitState, TRUE);
3135 char32 = ch;
3136 }
3137 }
3138 } else {
3139 char32 = ch;
3140 }
3141
3142 if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){
3143 // Resetting position to point to the next unprocessed char. We
3144 // overshot it when doing our test/set for numbers.
3145 if (char32 > 0xFFFF) { // For surrogates.
3146 loadState(source, &digitState, TRUE);
3147 //goBackOne(source);
3148 }
3149 goBackOne(source);
3150 break;
3151 }
3152 } else {
3153 break;
3154 }
3155 }
3156
3157 if (nonZeroValReached == FALSE){
3158 digIndx = 2;
3159 numTempBuf[2] = 6;
3160 }
3161
3162 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3163 if (digIndx % 2 != 0){
3164 /*
3165 We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3166 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3167 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3168 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3169 */
3170
3171 for(i = 2; i < endIndex; i++){
3172 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3173 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3174 }
3175 --digIndx;
3176 }
3177
3178 // Subtract one off of the last byte.
3179 numTempBuf[endIndex-1] -= 1;
3180
3181 /*
3182 We want to skip over the first two slots in the buffer. The first slot
3183 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3184 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3185 */
3186 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3187 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3188
3189 // Now transfer the collation key to our collIterate struct.
3190 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3191 //size = ((endIndex+1) & ~1)/2;
3192 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3193 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3194 UCOL_BYTE_COMMON; // Tertiary weight.
3195 i = 2; // Reset the index into the buffer.
3196 while(i < endIndex)
3197 {
3198 uint32_t primWeight = numTempBuf[i++] << 8;
3199 if ( i < endIndex)
3200 primWeight |= numTempBuf[i++];
3201 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3202 }
3203
3204 } else {
3205 // no numeric mode, we'll just switch to whatever we stashed and continue
3206 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3207 CE = *CEOffset++;
3208 break;
3209 }
3210 return CE;
3211 }
3212 /* various implicits optimization */
3213 case IMPLICIT_TAG: /* everything that is not defined otherwise */
3214 /* UCA is filled with these. Tailorings are NOT_FOUND */
3215 return getImplicit(cp, source);
3216 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3217 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3218 return getImplicit(cp, source);
3219 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3220 {
3221 static const uint32_t
3222 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3223 //const uint32_t LCount = 19;
3224 static const uint32_t VCount = 21;
3225 static const uint32_t TCount = 28;
3226 //const uint32_t NCount = VCount * TCount; // 588
3227 //const uint32_t SCount = LCount * NCount; // 11172
3228 uint32_t L = ch - SBase;
3229
3230 // divide into pieces
3231
3232 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3233 L /= TCount;
3234 uint32_t V = L % VCount;
3235 L /= VCount;
3236
3237 // offset them
3238
3239 L += LBase;
3240 V += VBase;
3241 T += TBase;
3242
3243 // return the first CE, but first put the rest into the expansion buffer
3244 if (!source->coll->image->jamoSpecial) { // FAST PATH
3245
3246 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3247 if (T != TBase) {
3248 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3249 }
3250
3251 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3252
3253 } else { // Jamo is Special
3254 // Since Hanguls pass the FCD check, it is
3255 // guaranteed that we won't be in
3256 // the normalization buffer if something like this happens
3257
3258 // However, if we are using a uchar iterator and normalization
3259 // is ON, the Hangul that lead us here is going to be in that
3260 // normalization buffer. Here we want to restore the uchar
3261 // iterator state and pull out of the normalization buffer
3262 if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3263 source->flags = source->origFlags; // restore the iterator
3264 source->pos = NULL;
3265 }
3266
3267 // Move Jamos into normalization buffer
3268 UChar *buffer = source->writableBuffer.getBuffer(4);
3269 int32_t bufferLength;
3270 buffer[0] = (UChar)L;
3271 buffer[1] = (UChar)V;
3272 if (T != TBase) {
3273 buffer[2] = (UChar)T;
3274 bufferLength = 3;
3275 } else {
3276 bufferLength = 2;
3277 }
3278 source->writableBuffer.releaseBuffer(bufferLength);
3279
3280 // Indicate where to continue in main input string after exhausting the writableBuffer
3281 source->fcdPosition = source->pos;
3282
3283 source->pos = source->writableBuffer.getTerminatedBuffer();
3284 source->origFlags = source->flags;
3285 source->flags |= UCOL_ITER_INNORMBUF;
3286 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3287
3288 return(UCOL_IGNORABLE);
3289 }
3290 }
3291 case SURROGATE_TAG:
3292 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
3293 /* two things can happen here: next code point can be a trailing surrogate - we will use it */
3294 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
3295 /* we treat it like an unassigned code point. */
3296 {
3297 UChar trail;
3298 collIterateState state;
3299 backupState(source, &state);
3300 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
3301 // we chould have stepped one char forward and it might have turned that it
3302 // was not a trail surrogate. In that case, we have to backup.
3303 loadState(source, &state, TRUE);
3304 return UCOL_NOT_FOUND;
3305 } else {
3306 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
3307 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
3308 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
3309 // We need to backup
3310 loadState(source, &state, TRUE);
3311 return CE;
3312 }
3313 // calculate the supplementary code point value, if surrogate was not tailored
3314 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3315 }
3316 }
3317 break;
3318 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
3319 UChar nextChar;
3320 if( source->flags & UCOL_USE_ITERATOR) {
3321 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3322 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3323 source->iterator->next(source->iterator);
3324 return getImplicit(cp, source);
3325 }
3326 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3327 U_IS_TRAIL((nextChar=*source->pos))) {
3328 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3329 source->pos++;
3330 return getImplicit(cp, source);
3331 }
3332 return UCOL_NOT_FOUND;
3333 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3334 return UCOL_NOT_FOUND; /* broken surrogate sequence */
3335 case CHARSET_TAG:
3336 /* not yet implemented */
3337 /* probably after 1.8 */
3338 return UCOL_NOT_FOUND;
3339 default:
3340 *status = U_INTERNAL_PROGRAM_ERROR;
3341 CE=0;
3342 break;
3343 }
3344 if (CE <= UCOL_NOT_FOUND) break;
3345 }
3346 return CE;
3347}
3348
3349
3350/* now uses Mark's getImplicitPrimary code */
3351static
3352inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3353 uint32_t r = uprv_uca_getImplicitPrimary(cp);
3354
3355 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3356 collationSource->toReturn = collationSource->CEpos;
3357
3358 // **** doesn't work if using iterator ****
3359 if (collationSource->flags & UCOL_ITER_INNORMBUF) {
3360 collationSource->offsetRepeatCount = 1;
3361 } else {
3362 int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
3363
3364 UErrorCode errorCode = U_ZERO_ERROR;
3365 collationSource->appendOffset(firstOffset, errorCode);
3366 collationSource->appendOffset(firstOffset + 1, errorCode);
3367
3368 collationSource->offsetReturn = collationSource->offsetStore - 1;
3369 *(collationSource->offsetBuffer) = firstOffset;
3370 if (collationSource->offsetReturn == collationSource->offsetBuffer) {
3371 collationSource->offsetStore = collationSource->offsetBuffer;
3372 }
3373 }
3374
3375 return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3376}
3377
3378/**
3379 * This function handles the special CEs like contractions, expansions,
3380 * surrogates, Thai.
3381 * It is called by getPrevCE.
3382 */
3383uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3384 collIterate *source,
3385 UErrorCode *status)
3386{
3387 const uint32_t *CEOffset = NULL;
3388 UChar *UCharOffset = NULL;
3389 UChar schar;
3390 const UChar *constart = NULL;
3391 uint32_t size;
3392 UChar buffer[UCOL_MAX_BUFFER];
3393 uint32_t *endCEBuffer;
3394 UChar *strbuffer;
3395 int32_t noChars = 0;
3396 int32_t CECount = 0;
3397
3398 for(;;)
3399 {
3400 /* the only ces that loops are thai and contractions */
3401 switch (getCETag(CE))
3402 {
3403 case NOT_FOUND_TAG: /* this tag always returns */
3404 return CE;
3405
3406 case SPEC_PROC_TAG:
3407 {
3408 // Special processing is getting a CE that is preceded by a certain prefix
3409 // Currently this is only needed for optimizing Japanese length and iteration marks.
3410 // When we encouter a special processing tag, we go backwards and try to see if
3411 // we have a match.
3412 // Contraction tables are used - so the whole process is not unlike contraction.
3413 // prefix data is stored backwards in the table.
3414 const UChar *UCharOffset;
3415 UChar schar, tchar;
3416 collIterateState prefixState;
3417 backupState(source, &prefixState);
3418 for(;;) {
3419 // This loop will run once per source string character, for as long as we
3420 // are matching a potential contraction sequence
3421
3422 // First we position ourselves at the begining of contraction sequence
3423 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3424
3425 if (collIter_bos(source)) {
3426 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3427 break;
3428 }
3429 schar = getPrevNormalizedChar(source, status);
3430 goBackOne(source);
3431
3432 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3433 UCharOffset++;
3434 }
3435
3436 if (schar == tchar) {
3437 // Found the source string char in the table.
3438 // Pick up the corresponding CE from the table.
3439 CE = *(coll->contractionCEs +
3440 (UCharOffset - coll->contractionIndex));
3441 }
3442 else
3443 {
3444 // if there is a completely ignorable code point in the middle of
3445 // a prefix, we need to act as if it's not there
3446 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3447 // lone surrogates cannot be set to zero as it would break other processing
3448 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
3449 // it's easy for BMP code points
3450 if(isZeroCE == 0) {
3451 continue;
3452 } else if(U16_IS_SURROGATE(schar)) {
3453 // for supplementary code points, we have to check the next one
3454 // situations where we are going to ignore
3455 // 1. beginning of the string: schar is a lone surrogate
3456 // 2. schar is a lone surrogate
3457 // 3. schar is a trail surrogate in a valid surrogate sequence
3458 // that is explicitly set to zero.
3459 if (!collIter_bos(source)) {
3460 UChar lead;
3461 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
3462 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
3463 if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) {
3464 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
3465 if(finalCE == 0) {
3466 // this is a real, assigned completely ignorable code point
3467 goBackOne(source);
3468 continue;
3469 }
3470 }
3471 } else {
3472 // lone surrogate, treat like unassigned
3473 return UCOL_NOT_FOUND;
3474 }
3475 } else {
3476 // lone surrogate at the beggining, treat like unassigned
3477 return UCOL_NOT_FOUND;
3478 }
3479 }
3480 // Source string char was not in the table.
3481 // We have not found the prefix.
3482 CE = *(coll->contractionCEs +
3483 (ContractionStart - coll->contractionIndex));
3484 }
3485
3486 if(!isPrefix(CE)) {
3487 // The source string char was in the contraction table, and the corresponding
3488 // CE is not a prefix CE. We found the prefix, break
3489 // out of loop, this CE will end up being returned. This is the normal
3490 // way out of prefix handling when the source actually contained
3491 // the prefix.
3492 break;
3493 }
3494 }
3495 loadState(source, &prefixState, TRUE);
3496 break;
3497 }
3498
3499 case CONTRACTION_TAG: {
3500 /* to ensure that the backwards and forwards iteration matches, we
3501 take the current region of most possible match and pass it through
3502 the forward iteration. this will ensure that the obstinate problem of
3503 overlapping contractions will not occur.
3504 */
3505 schar = peekCodeUnit(source, 0);
3506 constart = (UChar *)coll->image + getContractOffset(CE);
3507 if (isAtStartPrevIterate(source)
3508 /* commented away contraction end checks after adding the checks
3509 in getPrevCE */) {
3510 /* start of string or this is not the end of any contraction */
3511 CE = *(coll->contractionCEs +
3512 (constart - coll->contractionIndex));
3513 break;
3514 }
3515 strbuffer = buffer;
3516 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3517 *(UCharOffset --) = 0;
3518 noChars = 0;
3519 // have to swap thai characters
3520 while (ucol_unsafeCP(schar, coll)) {
3521 *(UCharOffset) = schar;
3522 noChars++;
3523 UCharOffset --;
3524 schar = getPrevNormalizedChar(source, status);
3525 goBackOne(source);
3526 // TODO: when we exhaust the contraction buffer,
3527 // it needs to get reallocated. The problem is
3528 // that the size depends on the string which is
3529 // not iterated over. However, since we're travelling
3530 // backwards, we already had to set the iterator at
3531 // the end - so we might as well know where we are?
3532 if (UCharOffset + 1 == buffer) {
3533 /* we have exhausted the buffer */
3534 int32_t newsize = 0;
3535 if(source->pos) { // actually dealing with a position
3536 newsize = (int32_t)(source->pos - source->string + 1);
3537 } else { // iterator
3538 newsize = 4 * UCOL_MAX_BUFFER;
3539 }
3540 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3541 (newsize + UCOL_MAX_BUFFER));
3542 /* test for NULL */
3543 if (strbuffer == NULL) {
3544 *status = U_MEMORY_ALLOCATION_ERROR;
3545 return UCOL_NO_MORE_CES;
3546 }
3547 UCharOffset = strbuffer + newsize;
3548 uprv_memcpy(UCharOffset, buffer,
3549 UCOL_MAX_BUFFER * sizeof(UChar));
3550 UCharOffset --;
3551 }
3552 if ((source->pos && (source->pos == source->string ||
3553 ((source->flags & UCOL_ITER_INNORMBUF) &&
3554 *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3555 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3556 break;
3557 }
3558 }
3559 /* adds the initial base character to the string */
3560 *(UCharOffset) = schar;
3561 noChars++;
3562
3563 int32_t offsetBias;
3564
3565 // **** doesn't work if using iterator ****
3566 if (source->flags & UCOL_ITER_INNORMBUF) {
3567 offsetBias = -1;
3568 } else {
3569 offsetBias = (int32_t)(source->pos - source->string);
3570 }
3571
3572 /* a new collIterate is used to simplify things, since using the current
3573 collIterate will mean that the forward and backwards iteration will
3574 share and change the same buffers. we don't want to get into that. */
3575 collIterate temp;
3576 int32_t rawOffset;
3577
3578 IInit_collIterate(coll, UCharOffset, noChars, &temp, status);
3579 if(U_FAILURE(*status)) {
3580 return UCOL_NULLORDER;
3581 }
3582 temp.flags &= ~UCOL_ITER_NORM;
3583 temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
3584
3585 rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero?
3586 CE = ucol_IGetNextCE(coll, &temp, status);
3587
3588 if (source->extendCEs) {
3589 endCEBuffer = source->extendCEs + source->extendCEsSize;
3590 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t));
3591 } else {
3592 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3593 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t));
3594 }
3595
3596 while (CE != UCOL_NO_MORE_CES) {
3597 *(source->CEpos ++) = CE;
3598
3599 if (offsetBias >= 0) {
3600 source->appendOffset(rawOffset + offsetBias, *status);
3601 }
3602
3603 CECount++;
3604 if (source->CEpos == endCEBuffer) {
3605 /* ran out of CE space, reallocate to new buffer.
3606 If reallocation fails, reset pointers and bail out,
3607 there's no guarantee of the right character position after
3608 this bail*/
3609 if (!increaseCEsCapacity(source)) {
3610 *status = U_MEMORY_ALLOCATION_ERROR;
3611 break;
3612 }
3613
3614 endCEBuffer = source->extendCEs + source->extendCEsSize;
3615 }
3616
3617 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
3618 rawOffset = (int32_t)(temp.fcdPosition - temp.string);
3619 } else {
3620 rawOffset = (int32_t)(temp.pos - temp.string);
3621 }
3622
3623 CE = ucol_IGetNextCE(coll, &temp, status);
3624 }
3625
3626 if (strbuffer != buffer) {
3627 uprv_free(strbuffer);
3628 }
3629 if (U_FAILURE(*status)) {
3630 return (uint32_t)UCOL_NULLORDER;
3631 }
3632
3633 if (source->offsetRepeatValue != 0) {
3634 if (CECount > noChars) {
3635 source->offsetRepeatCount += temp.offsetRepeatCount;
3636 } else {
3637 // **** does this really skip the right offsets? ****
3638 source->offsetReturn -= (noChars - CECount);
3639 }
3640 }
3641
3642 if (offsetBias >= 0) {
3643 source->offsetReturn = source->offsetStore - 1;
3644 if (source->offsetReturn == source->offsetBuffer) {
3645 source->offsetStore = source->offsetBuffer;
3646 }
3647 }
3648
3649 source->toReturn = source->CEpos - 1;
3650 if (source->toReturn == source->CEs) {
3651 source->CEpos = source->CEs;
3652 }
3653
3654 return *(source->toReturn);
3655 }
3656 case LONG_PRIMARY_TAG:
3657 {
3658 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3659 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3660 source->toReturn = source->CEpos - 1;
3661
3662 if (source->flags & UCOL_ITER_INNORMBUF) {
3663 source->offsetRepeatCount = 1;
3664 } else {
3665 int32_t firstOffset = (int32_t)(source->pos - source->string);
3666
3667 source->appendOffset(firstOffset, *status);
3668 source->appendOffset(firstOffset + 1, *status);
3669
3670 source->offsetReturn = source->offsetStore - 1;
3671 *(source->offsetBuffer) = firstOffset;
3672 if (source->offsetReturn == source->offsetBuffer) {
3673 source->offsetStore = source->offsetBuffer;
3674 }
3675 }
3676
3677
3678 return *(source->toReturn);
3679 }
3680
3681 case EXPANSION_TAG: /* this tag always returns */
3682 {
3683 /*
3684 This should handle expansion.
3685 NOTE: we can encounter both continuations and expansions in an expansion!
3686 I have to decide where continuations are going to be dealt with
3687 */
3688 int32_t firstOffset = (int32_t)(source->pos - source->string);
3689
3690 // **** doesn't work if using iterator ****
3691 if (source->offsetReturn != NULL) {
3692 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) {
3693 source->offsetStore = source->offsetBuffer;
3694 }else {
3695 firstOffset = -1;
3696 }
3697 }
3698
3699 /* find the offset to expansion table */
3700 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3701 size = getExpansionCount(CE);
3702 if (size != 0) {
3703 /*
3704 if there are less than 16 elements in expansion, we don't terminate
3705 */
3706 uint32_t count;
3707
3708 for (count = 0; count < size; count++) {
3709 *(source->CEpos ++) = *CEOffset++;
3710
3711 if (firstOffset >= 0) {
3712 source->appendOffset(firstOffset + 1, *status);
3713 }
3714 }
3715 } else {
3716 /* else, we do */
3717 while (*CEOffset != 0) {
3718 *(source->CEpos ++) = *CEOffset ++;
3719
3720 if (firstOffset >= 0) {
3721 source->appendOffset(firstOffset + 1, *status);
3722 }
3723 }
3724 }
3725
3726 if (firstOffset >= 0) {
3727 source->offsetReturn = source->offsetStore - 1;
3728 *(source->offsetBuffer) = firstOffset;
3729 if (source->offsetReturn == source->offsetBuffer) {
3730 source->offsetStore = source->offsetBuffer;
3731 }
3732 } else {
3733 source->offsetRepeatCount += size - 1;
3734 }
3735
3736 source->toReturn = source->CEpos - 1;
3737 // in case of one element expansion, we
3738 // want to immediately return CEpos
3739 if(source->toReturn == source->CEs) {
3740 source->CEpos = source->CEs;
3741 }
3742
3743 return *(source->toReturn);
3744 }
3745
3746 case DIGIT_TAG:
3747 {
3748 /*
3749 We do a check to see if we want to collate digits as numbers; if so we generate
3750 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3751 */
3752 uint32_t i; /* general counter */
3753
3754 if (source->coll->numericCollation == UCOL_ON){
3755 uint32_t digIndx = 0;
3756 uint32_t endIndex = 0;
3757 uint32_t leadingZeroIndex = 0;
3758 uint32_t trailingZeroCount = 0;
3759
3760 uint8_t collateVal = 0;
3761
3762 UBool nonZeroValReached = FALSE;
3763
3764 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs.
3765 /*
3766 We parse the source string until we hit a char that's NOT a digit.
3767 Use this u_charDigitValue. This might be slow because we have to
3768 handle surrogates...
3769 */
3770 /*
3771 We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
3772 with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
3773 element we process when going backward. To determine how long that chunk might be, we may need to make
3774 two passes through the loop that collects digits - one to see how long the string is (and how much is
3775 leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
3776 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
3777 element chunk after resetting the state to the initialState at the right side of the digit string.
3778 */
3779 uint32_t ceLimit = 0;
3780 UChar initial_ch = ch;
3781 collIterateState initialState = {0,0,0,0,0,0,0,0,0};
3782 backupState(source, &initialState);
3783
3784 for(;;) {
3785 collIterateState state = {0,0,0,0,0,0,0,0,0};
3786 UChar32 char32 = 0;
3787 int32_t digVal = 0;
3788
3789 if (U16_IS_TRAIL (ch)) {
3790 if (!collIter_bos(source)){
3791 UChar lead = getPrevNormalizedChar(source, status);
3792 if(U16_IS_LEAD(lead)) {
3793 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3794 goBackOne(source);
3795 } else {
3796 char32 = ch;
3797 }
3798 } else {
3799 char32 = ch;
3800 }
3801 } else {
3802 char32 = ch;
3803 }
3804 digVal = u_charDigitValue(char32);
3805
3806 for(;;) {
3807 // Make sure we have enough space. No longer needed;
3808 // at this point the largest value of digIndx when we need to save data in numTempBuf
3809 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
3810 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
3811
3812 // Skip over trailing zeroes, and keep a count of them.
3813 if (digVal != 0)
3814 nonZeroValReached = TRUE;
3815
3816 if (nonZeroValReached) {
3817 /*
3818 We parse the digit string into base 100 numbers (this fits into a byte).
3819 We only add to the buffer in twos, thus if we are parsing an odd character,
3820 that serves as the 'tens' digit while the if we are parsing an even one, that
3821 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3822 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3823 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3824 than all the other bytes.
3825
3826 Since we're doing in this reverse we want to put the first digit encountered into the
3827 ones place and the second digit encountered into the tens place.
3828 */
3829
3830 if ((digIndx + trailingZeroCount) % 2 == 1) {
3831 // High-order digit case (tens place)
3832 collateVal += (uint8_t)(digVal * 10);
3833
3834 // We cannot set leadingZeroIndex unless it has been set for the
3835 // low-order digit. Therefore, all we can do for the high-order
3836 // digit is turn it off, never on.
3837 // The only time we will have a high digit without a low is for
3838 // the very first non-zero digit, so no zero check is necessary.
3839 if (collateVal != 0)
3840 leadingZeroIndex = 0;
3841
3842 // The first pass through, digIndx may exceed the limit, but in that case
3843 // we no longer care about numTempBuf contents since they will be discarded
3844 if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
3845 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3846 }
3847 collateVal = 0;
3848 } else {
3849 // Low-order digit case (ones place)
3850 collateVal = (uint8_t)digVal;
3851
3852 // Check for leading zeroes.
3853 if (collateVal == 0) {
3854 if (!leadingZeroIndex)
3855 leadingZeroIndex = (digIndx/2) + 2;
3856 } else
3857 leadingZeroIndex = 0;
3858
3859 // No need to write to buffer; the case of a last odd digit
3860 // is handled below.
3861 }
3862 ++digIndx;
3863 } else
3864 ++trailingZeroCount;
3865
3866 if (!collIter_bos(source)) {
3867 ch = getPrevNormalizedChar(source, status);
3868 //goBackOne(source);
3869 if (U16_IS_TRAIL(ch)) {
3870 backupState(source, &state);
3871 if (!collIter_bos(source)) {
3872 goBackOne(source);
3873 UChar lead = getPrevNormalizedChar(source, status);
3874
3875 if(U16_IS_LEAD(lead)) {
3876 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3877 } else {
3878 loadState(source, &state, FALSE);
3879 char32 = ch;
3880 }
3881 }
3882 } else
3883 char32 = ch;
3884
3885 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
3886 if (char32 > 0xFFFF) {// For surrogates.
3887 loadState(source, &state, FALSE);
3888 }
3889 // Don't need to "reverse" the goBackOne call,
3890 // as this points to the next position to process..
3891 //if (char32 > 0xFFFF) // For surrogates.
3892 //getNextNormalizedChar(source);
3893 break;
3894 }
3895
3896 goBackOne(source);
3897 }else
3898 break;
3899 }
3900
3901 if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) {
3902 // our collation element is not too big, go ahead and finish with it
3903 break;
3904 }
3905 // our digit string is too long for a collation element;
3906 // set the limit for it, reset the state and begin again
3907 ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER;
3908 if ( ceLimit == 0 ) {
3909 ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
3910 }
3911 ch = initial_ch;
3912 loadState(source, &initialState, FALSE);
3913 digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0;
3914 collateVal = 0;
3915 nonZeroValReached = FALSE;
3916 }
3917
3918 if (! nonZeroValReached) {
3919 digIndx = 2;
3920 trailingZeroCount = 0;
3921 numTempBuf[2] = 6;
3922 }
3923
3924 if ((digIndx + trailingZeroCount) % 2 != 0) {
3925 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
3926 digIndx += 1; // The implicit leading zero
3927 }
3928 if (trailingZeroCount % 2 != 0) {
3929 // We had to consume one trailing zero for the low digit
3930 // of the least significant byte
3931 digIndx += 1; // The trailing zero not in the exponent
3932 trailingZeroCount -= 1;
3933 }
3934
3935 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
3936
3937 // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3938 numTempBuf[2] -= 1;
3939
3940 /*
3941 We want to skip over the first two slots in the buffer. The first slot
3942 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3943 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3944 The exponent must be adjusted by the number of leading zeroes, and the number of
3945 trailing zeroes.
3946 */
3947 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3948 uint32_t exponent = (digIndx+trailingZeroCount)/2;
3949 if (leadingZeroIndex)
3950 exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
3951 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
3952
3953 // Now transfer the collation key to our collIterate struct.
3954 // The total size for our collation key is half of endIndex, rounded up.
3955 int32_t size = (endIndex+1)/2;
3956 if(!ensureCEsCapacity(source, size)) {
3957 return UCOL_NULLORDER;
3958 }
3959 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3960 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3961 UCOL_BYTE_COMMON; // Tertiary weight.
3962 i = endIndex - 1; // Reset the index into the buffer.
3963 while(i >= 2) {
3964 uint32_t primWeight = numTempBuf[i--] << 8;
3965 if ( i >= 2)
3966 primWeight |= numTempBuf[i--];
3967 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3968 }
3969
3970 source->toReturn = source->CEpos -1;
3971 return *(source->toReturn);
3972 } else {
3973 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3974 CE = *(CEOffset++);
3975 break;
3976 }
3977 }
3978
3979 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3980 {
3981 static const uint32_t
3982 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3983 //const uint32_t LCount = 19;
3984 static const uint32_t VCount = 21;
3985 static const uint32_t TCount = 28;
3986 //const uint32_t NCount = VCount * TCount; /* 588 */
3987 //const uint32_t SCount = LCount * NCount; /* 11172 */
3988
3989 uint32_t L = ch - SBase;
3990 /*
3991 divide into pieces.
3992 we do it in this order since some compilers can do % and / in one
3993 operation
3994 */
3995 uint32_t T = L % TCount;
3996 L /= TCount;
3997 uint32_t V = L % VCount;
3998 L /= VCount;
3999
4000 /* offset them */
4001 L += LBase;
4002 V += VBase;
4003 T += TBase;
4004
4005 int32_t firstOffset = (int32_t)(source->pos - source->string);
4006 source->appendOffset(firstOffset, *status);
4007
4008 /*
4009 * return the first CE, but first put the rest into the expansion buffer
4010 */
4011 if (!source->coll->image->jamoSpecial) {
4012 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
4013 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
4014 source->appendOffset(firstOffset + 1, *status);
4015
4016 if (T != TBase) {
4017 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
4018 source->appendOffset(firstOffset + 1, *status);
4019 }
4020
4021 source->toReturn = source->CEpos - 1;
4022
4023 source->offsetReturn = source->offsetStore - 1;
4024 if (source->offsetReturn == source->offsetBuffer) {
4025 source->offsetStore = source->offsetBuffer;
4026 }
4027
4028 return *(source->toReturn);
4029 } else {
4030 // Since Hanguls pass the FCD check, it is
4031 // guaranteed that we won't be in
4032 // the normalization buffer if something like this happens
4033
4034 // Move Jamos into normalization buffer
4035 UChar *tempbuffer = source->writableBuffer.getBuffer(5);
4036 int32_t tempbufferLength, jamoOffset;
4037 tempbuffer[0] = 0;
4038 tempbuffer[1] = (UChar)L;
4039 tempbuffer[2] = (UChar)V;
4040 if (T != TBase) {
4041 tempbuffer[3] = (UChar)T;
4042 tempbufferLength = 4;
4043 } else {
4044 tempbufferLength = 3;
4045 }
4046 source->writableBuffer.releaseBuffer(tempbufferLength);
4047
4048 // Indicate where to continue in main input string after exhausting the writableBuffer
4049 if (source->pos == source->string) {
4050 jamoOffset = 0;
4051 source->fcdPosition = NULL;
4052 } else {
4053 jamoOffset = source->pos - source->string;
4054 source->fcdPosition = source->pos-1;
4055 }
4056
4057 // Append offsets for the additional chars
4058 // (not the 0, and not the L whose offsets match the original Hangul)
4059 int32_t jamoRemaining = tempbufferLength - 2;
4060 jamoOffset++; // appended offsets should match end of original Hangul
4061 while (jamoRemaining-- > 0) {
4062 source->appendOffset(jamoOffset, *status);
4063 }
4064
4065 source->offsetRepeatValue = jamoOffset;
4066
4067 source->offsetReturn = source->offsetStore - 1;
4068 if (source->offsetReturn == source->offsetBuffer) {
4069 source->offsetStore = source->offsetBuffer;
4070 }
4071
4072 source->pos = source->writableBuffer.getTerminatedBuffer() + tempbufferLength;
4073 source->origFlags = source->flags;
4074 source->flags |= UCOL_ITER_INNORMBUF;
4075 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
4076
4077 return(UCOL_IGNORABLE);
4078 }
4079 }
4080
4081 case IMPLICIT_TAG: /* everything that is not defined otherwise */
4082 return getPrevImplicit(ch, source);
4083
4084 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
4085 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
4086 return getPrevImplicit(ch, source);
4087
4088 case SURROGATE_TAG: /* This is a surrogate pair */
4089 /* essentially an engaged lead surrogate. */
4090 /* if you have encountered it here, it means that a */
4091 /* broken sequence was encountered and this is an error */
4092 return UCOL_NOT_FOUND;
4093
4094 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
4095 return UCOL_NOT_FOUND; /* broken surrogate sequence */
4096
4097 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
4098 {
4099 UChar32 cp = 0;
4100 UChar prevChar;
4101 const UChar *prev;
4102 if (isAtStartPrevIterate(source)) {
4103 /* we are at the start of the string, wrong place to be at */
4104 return UCOL_NOT_FOUND;
4105 }
4106 if (source->pos != source->writableBuffer.getBuffer()) {
4107 prev = source->pos - 1;
4108 } else {
4109 prev = source->fcdPosition;
4110 }
4111 prevChar = *prev;
4112
4113 /* Handles Han and Supplementary characters here.*/
4114 if (U16_IS_LEAD(prevChar)) {
4115 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
4116 source->pos = prev;
4117 } else {
4118 return UCOL_NOT_FOUND; /* like unassigned */
4119 }
4120
4121 return getPrevImplicit(cp, source);
4122 }
4123
4124 /* UCA is filled with these. Tailorings are NOT_FOUND */
4125 /* not yet implemented */
4126 case CHARSET_TAG: /* this tag always returns */
4127 /* probably after 1.8 */
4128 return UCOL_NOT_FOUND;
4129
4130 default: /* this tag always returns */
4131 *status = U_INTERNAL_PROGRAM_ERROR;
4132 CE=0;
4133 break;
4134 }
4135
4136 if (CE <= UCOL_NOT_FOUND) {
4137 break;
4138 }
4139 }
4140
4141 return CE;
4142}
4143
4144/* This should really be a macro */
4145/* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
4146/* secondaries in French */
4147/*
4148void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
4149 uint8_t temp;
4150 while(start<end) {
4151 temp = *start;
4152 *start++ = *end;
4153 *end-- = temp;
4154 }
4155}
4156*/
4157
/*
 * Reverses, in place, the elements from *start through *end (both inclusive).
 * TYPE is the element type. start and end must be modifiable pointer lvalues:
 * the macro walks them towards each other and leaves them modified.
 * The expansion is a braced block that declares a temporary named tempA.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
    TYPE tempA; \
    for(; (start)<(end); ) { \
        tempA = *(start); \
        *(start)++ = *(end); \
        *(end)-- = tempA; \
    } \
}
4166
4167/****************************************************************************/
4168/* Following are the sortkey generation functions */
4169/* */
4170/****************************************************************************/
4171
4172/**
4173 * Merge two sort keys.
4174 * This is useful, for example, to combine sort keys from first and last names
4175 * to sort such pairs.
4176 * Merged sort keys consider on each collation level the first part first entirely,
4177 * then the second one.
4178 * It is possible to merge multiple sort keys by consecutively merging
4179 * another one with the intermediate result.
4180 *
4181 * The length of the merge result is the sum of the lengths of the input sort keys
4182 * minus 1.
4183 *
4184 * @param src1 the first sort key
4185 * @param src1Length the length of the first sort key, including the zero byte at the end;
4186 * can be -1 if the function is to find the length
4187 * @param src2 the second sort key
4188 * @param src2Length the length of the second sort key, including the zero byte at the end;
4189 * can be -1 if the function is to find the length
4190 * @param dest the buffer where the merged sort key is written,
4191 * can be NULL if destCapacity==0
4192 * @param destCapacity the number of bytes in the dest buffer
4193 * @return the length of the merged sort key, src1Length+src2Length-1;
4194 * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
4195 * in which cases the contents of dest is undefined
4196 *
4197 * @draft
4198 */
4199U_CAPI int32_t U_EXPORT2
4200ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
4201 const uint8_t *src2, int32_t src2Length,
4202 uint8_t *dest, int32_t destCapacity) {
4203 int32_t destLength;
4204 uint8_t b;
4205
4206 /* check arguments */
4207 if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
4208 src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
4209 destCapacity<0 || (destCapacity>0 && dest==NULL)
4210 ) {
4211 /* error, attempt to write a zero byte and return 0 */
4212 if(dest!=NULL && destCapacity>0) {
4213 *dest=0;
4214 }
4215 return 0;
4216 }
4217
4218 /* check lengths and capacity */
4219 if(src1Length<0) {
4220 src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
4221 }
4222 if(src2Length<0) {
4223 src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
4224 }
4225
4226 destLength=src1Length+src2Length-1;
4227 if(destLength>destCapacity) {
4228 /* the merged sort key does not fit into the destination */
4229 return destLength;
4230 }
4231
4232 /* merge the sort keys with the same number of levels */
4233 while(*src1!=0 && *src2!=0) { /* while both have another level */
4234 /* copy level from src1 not including 00 or 01 */
4235 while((b=*src1)>=2) {
4236 ++src1;
4237 *dest++=b;
4238 }
4239
4240 /* add a 02 merge separator */
4241 *dest++=2;
4242
4243 /* copy level from src2 not including 00 or 01 */
4244 while((b=*src2)>=2) {
4245 ++src2;
4246 *dest++=b;
4247 }
4248
4249 /* if both sort keys have another level, then add a 01 level separator and continue */
4250 if(*src1==1 && *src2==1) {
4251 ++src1;
4252 ++src2;
4253 *dest++=1;
4254 }
4255 }
4256
4257 /*
4258 * here, at least one sort key is finished now, but the other one
4259 * might have some contents left from containing more levels;
4260 * that contents is just appended to the result
4261 */
4262 if(*src1!=0) {
4263 /* src1 is not finished, therefore *src2==0, and src1 is appended */
4264 src2=src1;
4265 }
4266 /* append src2, "the other, unfinished sort key" */
4267 uprv_strcpy((char *)dest, (const char *)src2);
4268
4269 /* trust that neither sort key contained illegally embedded zero bytes */
4270 return destLength;
4271}
4272
4273U_NAMESPACE_BEGIN
4274
4275class SortKeyByteSink : public ByteSink {
4276public:
4277 static const uint32_t FILL_ORIGINAL_BUFFER = 1;
4278 static const uint32_t DONT_GROW = 2;
4279 SortKeyByteSink(char *dest, int32_t destCapacity, uint32_t flags=0)
4280 : ownedBuffer_(NULL), buffer_(dest), capacity_(destCapacity),
4281 appended_(0),
4282 fill_(flags & FILL_ORIGINAL_BUFFER),
4283 grow_((flags & DONT_GROW) == 0) {
4284 if (buffer_ == NULL || capacity_ < 0) {
4285 buffer_ = reinterpret_cast<char *>(&lastResortByte_);
4286 capacity_ = 0;
4287 }
4288 }
4289 virtual ~SortKeyByteSink();
4290
4291 virtual void Append(const char *bytes, int32_t n);
4292 void Append(const uint8_t *bytes, int32_t n) { Append(reinterpret_cast<const char *>(bytes), n); }
4293 void Append(uint8_t b) {
4294 if (appended_ < capacity_) {
4295 buffer_[appended_++] = (char)b;
4296 } else {
4297 Append(&b, 1);
4298 }
4299 }
4300 void Append(uint8_t b1, uint8_t b2) {
4301 int32_t a2 = appended_ + 2;
4302 if (a2 <= capacity_) {
4303 buffer_[appended_] = (char)b1;
4304 buffer_[appended_ + 1] = (char)b2;
4305 appended_ = a2;
4306 } else {
4307 char bytes[2] = { (char)b1, (char)b2 };
4308 Append(bytes, 2);
4309 }
4310 }
4311 void Append(const SortKeyByteSink &other) { Append(other.buffer_, other.appended_); }
4312 virtual char *GetAppendBuffer(int32_t min_capacity,
4313 int32_t desired_capacity_hint,
4314 char *scratch, int32_t scratch_capacity,
4315 int32_t *result_capacity);
4316 int32_t NumberOfBytesAppended() const { return appended_; }
4317 uint8_t &LastByte() {
4318 if (buffer_ != NULL && appended_ > 0) {
4319 return reinterpret_cast<uint8_t *>(buffer_)[appended_ - 1];
4320 } else {
4321 return lastResortByte_;
4322 }
4323 }
4324 uint8_t *GetLastFewBytes(int32_t n) {
4325 if (buffer_ != NULL && appended_ >= n) {
4326 return reinterpret_cast<uint8_t *>(buffer_) + appended_ - n;
4327 } else {
4328 return NULL;
4329 }
4330 }
4331 char *GetBuffer() { return buffer_; }
4332 uint8_t *GetUnsignedBuffer() { return reinterpret_cast<uint8_t *>(buffer_); }
4333 uint8_t *OrphanUnsignedBuffer(int32_t &orphanedCapacity);
4334 UBool IsOk() const { return buffer_ != NULL; } // otherwise out-of-memory
4335
4336private:
4337 SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemented
4338 SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator not implemented
4339
4340 UBool Resize(int32_t appendCapacity, int32_t length);
4341 void SetNotOk() {
4342 buffer_ = NULL;
4343 capacity_ = 0;
4344 }
4345
4346 static uint8_t lastResortByte_; // last-resort return value from LastByte()
4347
4348 char *ownedBuffer_;
4349 char *buffer_;
4350 int32_t capacity_;
4351 int32_t appended_;
4352 UBool fill_;
4353 UBool grow_;
4354};
4355
4356uint8_t SortKeyByteSink::lastResortByte_ = 0;
4357
4358SortKeyByteSink::~SortKeyByteSink() {
4359 uprv_free(ownedBuffer_);
4360}
4361
4362void
4363SortKeyByteSink::Append(const char *bytes, int32_t n) {
4364 if (n <= 0) {
4365 return;
4366 }
4367 int32_t length = appended_;
4368 appended_ += n;
4369 if ((buffer_ + length) == bytes) {
4370 return; // the caller used GetAppendBuffer() and wrote the bytes already
4371 }
4372 if (buffer_ == NULL) {
4373 return; // allocation failed before already
4374 }
4375 int32_t available = capacity_ - length;
4376 if (bytes == NULL) {
4377 // assume that the caller failed to allocate memory
4378 if (fill_) {
4379 if (n > available) {
4380 n = available;
4381 }
4382 uprv_memset(buffer_, 0, n);
4383 }
4384 SetNotOk(); // propagate the out-of-memory error
4385 return;
4386 }
4387 if (n > available) {
4388 if (fill_ && available > 0) {
4389 // Fill the original buffer completely.
4390 uprv_memcpy(buffer_ + length, bytes, available);
4391 bytes += available;
4392 length += available;
4393 n -= available;
4394 available = 0;
4395 }
4396 fill_ = FALSE;
4397 if (!Resize(n, length)) {
4398 SetNotOk();
4399 return;
4400 }
4401 }
4402 uprv_memcpy(buffer_ + length, bytes, n);
4403}
4404
4405char *
4406SortKeyByteSink::GetAppendBuffer(int32_t min_capacity,
4407 int32_t desired_capacity_hint,
4408 char *scratch,
4409 int32_t scratch_capacity,
4410 int32_t *result_capacity) {
4411 if (min_capacity < 1 || scratch_capacity < min_capacity) {
4412 *result_capacity = 0;
4413 return NULL;
4414 }
4415 int32_t available = capacity_ - appended_;
4416 if (available >= min_capacity) {
4417 *result_capacity = available;
4418 return buffer_ + appended_;
4419 } else if (Resize(desired_capacity_hint, appended_)) {
4420 *result_capacity = capacity_ - appended_;
4421 return buffer_ + appended_;
4422 } else {
4423 *result_capacity = scratch_capacity;
4424 return scratch;
4425 }
4426}
4427
4428UBool
4429SortKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
4430 if (!grow_) {
4431 return FALSE;
4432 }
4433 int32_t newCapacity = 2 * capacity_;
4434 int32_t altCapacity = length + 2 * appendCapacity;
4435 if (newCapacity < altCapacity) {
4436 newCapacity = altCapacity;
4437 }
4438 if (newCapacity < 1024) {
4439 newCapacity = 1024;
4440 }
4441 char *newBuffer = (char *)uprv_malloc(newCapacity);
4442 if (newBuffer == NULL) {
4443 return FALSE;
4444 }
4445 uprv_memcpy(newBuffer, buffer_, length);
4446 uprv_free(ownedBuffer_);
4447 ownedBuffer_ = buffer_ = newBuffer;
4448 capacity_ = newCapacity;
4449 return TRUE;
4450}
4451
4452uint8_t *
4453SortKeyByteSink::OrphanUnsignedBuffer(int32_t &orphanedCapacity) {
4454 if (buffer_ == NULL || appended_ == 0) {
4455 orphanedCapacity = 0;
4456 return NULL;
4457 }
4458 if (ownedBuffer_ != NULL) {
4459 // orphan & forget the ownedBuffer_
4460 uint8_t *returnBuffer = reinterpret_cast<uint8_t *>(ownedBuffer_);
4461 ownedBuffer_ = buffer_ = NULL;
4462 orphanedCapacity = capacity_;
4463 capacity_ = appended_ = 0;
4464 return returnBuffer;
4465 }
4466 // clone the buffer_
4467 uint8_t *newBuffer = (uint8_t *)uprv_malloc(appended_);
4468 if (newBuffer == NULL) {
4469 orphanedCapacity = 0;
4470 return NULL;
4471 }
4472 uprv_memcpy(newBuffer, buffer_, appended_);
4473 orphanedCapacity = appended_;
4474 return newBuffer;
4475}
4476
4477U_NAMESPACE_END
4478
4479/* sortkey API */
4480U_CAPI int32_t U_EXPORT2
4481ucol_getSortKey(const UCollator *coll,
4482 const UChar *source,
4483 int32_t sourceLength,
4484 uint8_t *result,
4485 int32_t resultLength)
4486{
4487 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4488 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4489 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
4490 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
4491 }
4492
4493 if(coll->delegate != NULL) {
4494 return ((const Collator*)coll->delegate)->getSortKey(source, sourceLength, result, resultLength);
4495 }
4496
4497 UErrorCode status = U_ZERO_ERROR;
4498 int32_t keySize = 0;
4499
4500 if(source != NULL) {
4501 // source == NULL is actually an error situation, but we would need to
4502 // have an error code to return it. Until we introduce a new
4503 // API, it stays like this
4504
4505 /* this uses the function pointer that is set in updateinternalstate */
4506 /* currently, there are two funcs: */
4507 /*ucol_calcSortKey(...);*/
4508 /*ucol_calcSortKeySimpleTertiary(...);*/
4509
4510 SortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength,
4511 SortKeyByteSink::FILL_ORIGINAL_BUFFER | SortKeyByteSink::DONT_GROW);
4512 coll->sortKeyGen(coll, source, sourceLength, sink, &status);
4513 keySize = sink.NumberOfBytesAppended();
4514 }
4515 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4516 UTRACE_EXIT_STATUS(status);
4517 return keySize;
4518}
4519
4520/* this function is called by the C++ API for sortkey generation */
4521U_CFUNC int32_t
4522ucol_getSortKeyWithAllocation(const UCollator *coll,
4523 const UChar *source, int32_t sourceLength,
4524 uint8_t *&result, int32_t &resultCapacity,
4525 UErrorCode *pErrorCode) {
4526 SortKeyByteSink sink(reinterpret_cast<char *>(result), resultCapacity);
4527 coll->sortKeyGen(coll, source, sourceLength, sink, pErrorCode);
4528 int32_t resultLen = sink.NumberOfBytesAppended();
4529 if (U_SUCCESS(*pErrorCode)) {
4530 if (!sink.IsOk()) {
4531 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
4532 } else if (result != sink.GetUnsignedBuffer()) {
4533 result = sink.OrphanUnsignedBuffer(resultCapacity);
4534 }
4535 }
4536 return resultLen;
4537}
4538
4539// Is this primary weight compressible?
4540// Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
4541// TODO: This should use per-lead-byte flags from FractionalUCA.txt.
4542static inline UBool
4543isCompressible(const UCollator * /*coll*/, uint8_t primary1) {
4544 return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary;
4545}
4546
4547static
4548inline void doCaseShift(SortKeyByteSink &cases, uint32_t &caseShift) {
4549 if (caseShift == 0) {
4550 cases.Append(UCOL_CASE_BYTE_START);
4551 caseShift = UCOL_CASE_SHIFT_START;
4552 }
4553}
4554
4555// Packs the secondary buffer when processing French locale.
4556static void
4557packFrench(uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result) {
4558 secondaries += secsize; // We read the secondary-level bytes back to front.
4559 uint8_t secondary;
4560 int32_t count2 = 0;
4561 int32_t i = 0;
4562 // we use i here since the key size already accounts for terminators, so we'll discard the increment
4563 for(i = 0; i<secsize; i++) {
4564 secondary = *(secondaries-i-1);
4565 /* This is compression code. */
4566 if (secondary == UCOL_COMMON2) {
4567 ++count2;
4568 } else {
4569 if (count2 > 0) {
4570 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4571 while (count2 > UCOL_TOP_COUNT2) {
4572 result.Append((uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
4573 count2 -= (uint32_t)UCOL_TOP_COUNT2;
4574 }
4575 result.Append((uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
4576 } else {
4577 while (count2 > UCOL_BOT_COUNT2) {
4578 result.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4579 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4580 }
4581 result.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4582 }
4583 count2 = 0;
4584 }
4585 result.Append(secondary);
4586 }
4587 }
4588 if (count2 > 0) {
4589 while (count2 > UCOL_BOT_COUNT2) {
4590 result.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4591 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4592 }
4593 result.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4594 }
4595}
4596
4597#define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
4598
/**
 * Computes a complete sort key for a UTF-16 string and appends it to `result`.
 *
 * Primary bytes are appended to `result` while the collation elements (CEs) are
 * being read. Secondary, case, tertiary and quaternary bytes are staged in
 * separate sinks backed by stack buffers and are appended to `result` after the
 * CE loop, each level preceded by UCOL_LEVELTERMINATOR. On success a final 0
 * byte is appended.
 *
 * Which levels are produced depends on coll->strength, coll->caseLevel,
 * coll->frenchCollation, coll->alternateHandling and coll->hiraganaQ, all of
 * which are read below.
 *
 * @param coll          collator supplying the attributes and tables read below
 * @param source        UTF-16 text; locally replaced by a normalized copy when
 *                      the normalization step changes it
 * @param sourceLength  number of code units, or -1 to have u_strlen() measure it
 * @param result        sink receiving the sort key bytes
 * @param status        in/out error code; the function returns at once if it
 *                      already indicates failure, and nothing is appended after
 *                      the CE loop if it indicates failure at that point
 */
4599/* This is the sortkey work horse function */
4600U_CFUNC void U_CALLCONV
4601ucol_calcSortKey(const UCollator *coll,
4602 const UChar *source,
4603 int32_t sourceLength,
4604 SortKeyByteSink &result,
4605 UErrorCode *status)
4606{
4607 if(U_FAILURE(*status)) {
4608 return;
4609 }
4610
4611 /* Stack allocated buffers for buffers we use */
4612 char second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
4613 char caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER];
4614
 /* Primaries are written straight into the caller's sink; the other levels are */
 /* collected in local sinks and appended to `result` once all CEs are processed. */
4615 SortKeyByteSink &primaries = result;
4616 SortKeyByteSink secondaries(second, LENGTHOF(second));
4617 SortKeyByteSink tertiaries(tert, LENGTHOF(tert));
4618 SortKeyByteSink cases(caseB, LENGTHOF(caseB));
4619 SortKeyByteSink quads(quad, LENGTHOF(quad));
4620
4621 UnicodeString normSource;
4622
4623 int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
4624
4625 UColAttributeValue strength = coll->strength;
4626
 /* compareSec/compareTer/compareQuad are 0 when the level is part of the key and */
 /* 0xFF otherwise. The secondary and tertiary weights are uint8_t, so a test such */
 /* as `secondary > compareSec` can never succeed against 0xFF and the level is skipped. */
4627 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4628 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4629 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4630 UBool compareIdent = (strength == UCOL_IDENTICAL);
4631 UBool doCase = (coll->caseLevel == UCOL_ON);
4632 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4633 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
4634 //UBool qShifted = shifted && (compareQuad == 0);
4635 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4636
4637 uint32_t variableTopValue = coll->variableTopValue;
4638 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4639 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
 /* Despite the macro-like spelling, UCOL_COMMON_BOT4, UCOL_HIRAGANA_QUAD and */
 /* UCOL_BOT_COUNT4 are local variables computed from the collator's variable top. */
4640 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4641 uint8_t UCOL_HIRAGANA_QUAD = 0;
4642 if(doHiragana) {
4643 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
4644 /* allocate one more space for hiragana, value for hiragana */
4645 }
4646 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4647
4648 /* support for special features like caselevel and funky secondaries */
4649 int32_t lastSecondaryLength = 0;
4650 uint32_t caseShift = 0;
4651
4652 /* If we need to normalize, we'll do it all at once at the beginning! */
 /* The NFD instance is requested for identical strength, the FCD instance when */
 /* normalization is on, and no normalizer otherwise. */
4653 const Normalizer2 *norm2;
4654 if(compareIdent) {
4655 norm2 = Normalizer2Factory::getNFDInstance(*status);
4656 } else if(coll->normalizationMode != UCOL_OFF) {
4657 norm2 = Normalizer2Factory::getFCDInstance(*status);
4658 } else {
4659 norm2 = NULL;
4660 }
4661 if(norm2 != NULL) {
 /* Only the tail that fails the quick check is normalized: the prefix of length */
 /* qcYesLength is kept and the normalized remainder is appended to it. `source` */
 /* and `len` are redirected to normSource only when something had to change. */
4662 normSource.setTo(FALSE, source, len);
4663 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
4664 if(qcYesLength != len) {
4665 UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
4666 normSource.truncate(qcYesLength);
4667 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
4668 source = normSource.getBuffer();
4669 len = normSource.length();
4670 }
4671 }
4672 collIterate s;
4673 IInit_collIterate(coll, source, len, &s, status);
4674 if(U_FAILURE(*status)) {
4675 return;
4676 }
4677 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized.
4678
4679 uint32_t order = 0;
4680
4681 uint8_t primary1 = 0;
4682 uint8_t primary2 = 0;
4683 uint8_t secondary = 0;
4684 uint8_t tertiary = 0;
4685 uint8_t caseSwitch = coll->caseSwitch;
4686 uint8_t tertiaryMask = coll->tertiaryMask;
4687 int8_t tertiaryAddition = coll->tertiaryAddition;
4688 uint8_t tertiaryTop = coll->tertiaryTop;
4689 uint8_t tertiaryBottom = coll->tertiaryBottom;
4690 uint8_t tertiaryCommon = coll->tertiaryCommon;
4691 uint8_t caseBits = 0;
4692
4693 UBool wasShifted = FALSE;
4694 UBool notIsContinuation = FALSE;
4695
 /* count2/count3/count4 hold the length of the current run of "common" weights */
 /* on the secondary/tertiary/quaternary level that has not been written yet. */
 /* leadPrimary remembers the lead byte of the primary run being compressed (0 = none). */
4696 uint32_t count2 = 0, count3 = 0, count4 = 0;
4697 uint8_t leadPrimary = 0;
4698
4699 for(;;) {
4700 order = ucol_IGetNextCE(coll, &s, status);
4701 if(order == UCOL_NO_MORE_CES) {
4702 break;
4703 }
4704
4705 if(order == 0) {
4706 continue;
4707 }
4708
4709 notIsContinuation = !isContinuation(order);
4710
4711 if(notIsContinuation) {
4712 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
4713 } else {
4714 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4715 }
4716
 /* Peel the CE apart from the low end: each `order >>= 8` drops one byte, so after */
 /* these statements `order` itself holds only the two primary bytes. */
4717 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4718 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4719 primary1 = (uint8_t)(order >> 8);
4720
 /* The unpermuted lead byte is kept because isCompressible() below is asked about it, */
 /* while the permuted value is what gets written into the key. */
4721 uint8_t originalPrimary1 = primary1;
4722 if(notIsContinuation && coll->leadBytePermutationTable != NULL) {
4723 primary1 = coll->leadBytePermutationTable[primary1];
4724 }
4725
 /* `order` (two primary bytes at this point) is what is compared with variableTopValue. */
4726 if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4727 || (!notIsContinuation && wasShifted)))
4728 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
4729 {
4730 /* and other ignorables should be removed if following a shifted code point */
4731 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4732 /* we should just completely ignore it */
4733 continue;
4734 }
4735 if(compareQuad == 0) {
 /* Write out the pending run of common quaternaries before the shifted primary bytes. */
4736 if(count4 > 0) {
4737 while (count4 > UCOL_BOT_COUNT4) {
4738 quads.Append((uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4));
4739 count4 -= UCOL_BOT_COUNT4;
4740 }
4741 quads.Append((uint8_t)(UCOL_COMMON_BOT4 + (count4-1)));
4742 count4 = 0;
4743 }
4744 /* We are dealing with a variable and we're treating them as shifted */
4745 /* This is a shifted ignorable */
4746 if(primary1 != 0) { /* we need to check this since we could be in continuation */
4747 quads.Append(primary1);
4748 }
4749 if(primary2 != 0) {
4750 quads.Append(primary2);
4751 }
4752 }
4753 wasShifted = TRUE;
4754 } else {
4755 wasShifted = FALSE;
4756 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4757 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */
4758 /* regular and simple sortkey calc */
4759 if(primary1 != UCOL_IGNORABLE) {
4760 if(notIsContinuation) {
 /* Primary compression: while consecutive CEs share a compressible lead byte, */
 /* only primary2 is written. When the lead byte changes, a boundary byte */
 /* (UCOL_BYTE_UNSHIFTED_MAX or _MIN, depending on direction) closes the run. */
4761 if(leadPrimary == primary1) {
4762 primaries.Append(primary2);
4763 } else {
4764 if(leadPrimary != 0) {
4765 primaries.Append((uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN));
4766 }
4767 if(primary2 == UCOL_IGNORABLE) {
4768 /* one byter, not compressed */
4769 primaries.Append(primary1);
4770 leadPrimary = 0;
4771 } else if(isCompressible(coll, originalPrimary1)) {
4772 /* compress */
4773 primaries.Append(leadPrimary = primary1, primary2);
4774 } else {
4775 leadPrimary = 0;
4776 primaries.Append(primary1, primary2);
4777 }
4778 }
4779 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4780 if(primary2 == UCOL_IGNORABLE) {
4781 primaries.Append(primary1);
4782 } else {
4783 primaries.Append(primary1, primary2);
4784 }
4785 }
4786 }
4787
4788 if(secondary > compareSec) {
4789 if(!isFrenchSec) {
4790 /* This is compression code. */
 /* Common secondaries are only counted. The run is written as a single byte */
 /* per chunk when a different secondary arrives: counted down from */
 /* UCOL_COMMON_TOP2 if that secondary is above common, up from UCOL_COMMON_BOT2 otherwise. */
4791 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4792 ++count2;
4793 } else {
4794 if (count2 > 0) {
4795 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4796 while (count2 > UCOL_TOP_COUNT2) {
4797 secondaries.Append((uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
4798 count2 -= (uint32_t)UCOL_TOP_COUNT2;
4799 }
4800 secondaries.Append((uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
4801 } else {
4802 while (count2 > UCOL_BOT_COUNT2) {
4803 secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4804 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4805 }
4806 secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4807 }
4808 count2 = 0;
4809 }
4810 secondaries.Append(secondary);
4811 }
4812 } else {
4813 /* Do the special handling for French secondaries */
4814 /* We need to get continuation elements and do intermediate restore */
4815 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
 /* lastSecondaryLength counts the secondaries of the current CE plus its */
 /* continuations. When the next non-continuation CE starts, that group is */
 /* reversed in place so that the reversal of the whole level done later */
 /* puts the group back into its original order. */
4816 if(notIsContinuation) {
4817 if (lastSecondaryLength > 1) {
4818 uint8_t *frenchStartPtr = secondaries.GetLastFewBytes(lastSecondaryLength);
4819 if (frenchStartPtr != NULL) {
4820 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4821 uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
4822 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4823 }
4824 }
4825 lastSecondaryLength = 1;
4826 } else {
4827 ++lastSecondaryLength;
4828 }
4829 secondaries.Append(secondary);
4830 }
4831 }
4832
4833 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4834 // do the case level if we need to do it. We don't want to calculate
4835 // case level for primary ignorables if we have only primary strength and case level
4836 // otherwise we would break well formedness of CEs
 // Case information is packed bit by bit into the last byte of `cases`.
 // The `|= 0 << (--caseShift)` statements leave the byte unchanged; they are
 // executed for the --caseShift side effect, i.e. they emit a 0 bit.
4837 doCaseShift(cases, caseShift);
4838 if(notIsContinuation) {
4839 caseBits = (uint8_t)(tertiary & 0xC0);
4840
4841 if(tertiary != 0) {
4842 if(coll->caseFirst == UCOL_UPPER_FIRST) {
4843 if((caseBits & 0xC0) == 0) {
4844 cases.LastByte() |= 1 << (--caseShift);
4845 } else {
4846 cases.LastByte() |= 0 << (--caseShift);
4847 /* second bit */
4848 doCaseShift(cases, caseShift);
4849 cases.LastByte() |= ((caseBits>>6)&1) << (--caseShift);
4850 }
4851 } else {
4852 if((caseBits & 0xC0) == 0) {
4853 cases.LastByte() |= 0 << (--caseShift);
4854 } else {
4855 cases.LastByte() |= 1 << (--caseShift);
4856 /* second bit */
4857 doCaseShift(cases, caseShift);
4858 cases.LastByte() |= ((caseBits>>7)&1) << (--caseShift);
4859 }
4860 }
4861 }
4862 }
4863 } else {
4864 if(notIsContinuation) {
4865 tertiary ^= caseSwitch;
4866 }
4867 }
4868
4869 tertiary &= tertiaryMask;
4870 if(tertiary > compareTer) {
4871 /* This is compression code. */
4872 /* sequence size check is included in the if clause */
4873 if (tertiary == tertiaryCommon && notIsContinuation) {
4874 ++count3;
4875 } else {
4876 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
4877 tertiary += tertiaryAddition;
4878 } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
4879 tertiary -= tertiaryAddition;
4880 }
4881 if (count3 > 0) {
4882 if ((tertiary > tertiaryCommon)) {
4883 while (count3 > coll->tertiaryTopCount) {
4884 tertiaries.Append((uint8_t)(tertiaryTop - coll->tertiaryTopCount));
4885 count3 -= (uint32_t)coll->tertiaryTopCount;
4886 }
4887 tertiaries.Append((uint8_t)(tertiaryTop - (count3-1)));
4888 } else {
4889 while (count3 > coll->tertiaryBottomCount) {
4890 tertiaries.Append((uint8_t)(tertiaryBottom + coll->tertiaryBottomCount));
4891 count3 -= (uint32_t)coll->tertiaryBottomCount;
4892 }
4893 tertiaries.Append((uint8_t)(tertiaryBottom + (count3-1)));
4894 }
4895 count3 = 0;
4896 }
4897 tertiaries.Append(tertiary);
4898 }
4899 }
4900
4901 if(/*qShifted*/(compareQuad==0) && notIsContinuation) {
4902 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4903 if(count4>0) { // Close this part
4904 while (count4 > UCOL_BOT_COUNT4) {
4905 quads.Append((uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4));
4906 count4 -= UCOL_BOT_COUNT4;
4907 }
4908 quads.Append((uint8_t)(UCOL_COMMON_BOT4 + (count4-1)));
4909 count4 = 0;
4910 }
4911 quads.Append(UCOL_HIRAGANA_QUAD); // Add the Hiragana
4912 } else { // This wasn't Hiragana, so we can continue adding stuff
4913 count4++;
4914 }
4915 }
4916 }
4917 }
4918
4919 /* Here, we are generally done with processing */
4920 /* bailing out would not be too productive */
4921
 /* Assembly: flush the pending common-weight runs, then append each enabled level */
 /* to `result` behind a UCOL_LEVELTERMINATOR. Note that the quaternary and identical */
 /* levels are emitted inside the `compareTer == 0` block. */
4922 if(U_SUCCESS(*status)) {
4923 /* we have done all the CE's, now let's put them together to form a key */
4924 if(compareSec == 0) {
4925 if (count2 > 0) {
4926 while (count2 > UCOL_BOT_COUNT2) {
4927 secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4928 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4929 }
4930 secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4931 }
4932 result.Append(UCOL_LEVELTERMINATOR);
 /* The staged secondaries are appended unchanged when French ordering is off */
 /* or when the secondaries sink reports it is not OK; otherwise they go through packFrench(). */
4933 if(!isFrenchSec || !secondaries.IsOk()) {
4934 result.Append(secondaries);
4935 } else {
4936 // If there are any unresolved continuation secondaries,
4937 // reverse them here so that we can reverse the whole secondary thing.
4938 if (lastSecondaryLength > 1) {
4939 uint8_t *frenchStartPtr = secondaries.GetLastFewBytes(lastSecondaryLength);
4940 if (frenchStartPtr != NULL) {
4941 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4942 uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
4943 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4944 }
4945 }
4946 packFrench(secondaries.GetUnsignedBuffer(), secondaries.NumberOfBytesAppended(), result);
4947 }
4948 }
4949
4950 if(doCase) {
4951 result.Append(UCOL_LEVELTERMINATOR);
4952 result.Append(cases);
4953 }
4954
4955 if(compareTer == 0) {
4956 if (count3 > 0) {
4957 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
4958 while (count3 >= coll->tertiaryTopCount) {
4959 tertiaries.Append((uint8_t)(tertiaryTop - coll->tertiaryTopCount));
4960 count3 -= (uint32_t)coll->tertiaryTopCount;
4961 }
4962 tertiaries.Append((uint8_t)(tertiaryTop - count3));
4963 } else {
4964 while (count3 > coll->tertiaryBottomCount) {
4965 tertiaries.Append((uint8_t)(tertiaryBottom + coll->tertiaryBottomCount));
4966 count3 -= (uint32_t)coll->tertiaryBottomCount;
4967 }
4968 tertiaries.Append((uint8_t)(tertiaryBottom + (count3-1)));
4969 }
4970 }
4971 result.Append(UCOL_LEVELTERMINATOR);
4972 result.Append(tertiaries);
4973
4974 if(compareQuad == 0/*qShifted == TRUE*/) {
4975 if(count4 > 0) {
4976 while (count4 > UCOL_BOT_COUNT4) {
4977 quads.Append((uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4));
4978 count4 -= UCOL_BOT_COUNT4;
4979 }
4980 quads.Append((uint8_t)(UCOL_COMMON_BOT4 + (count4-1)));
4981 }
4982 result.Append(UCOL_LEVELTERMINATOR);
4983 result.Append(quads);
4984 }
4985
4986 if(compareIdent) {
4987 result.Append(UCOL_LEVELTERMINATOR);
4988 u_writeIdenticalLevelRun(s.string, len, result);
4989 }
4990 }
4991 result.Append(0);
4992 }
4993
4994 /* To avoid memory leak, free the offset buffer if necessary. */
4995 ucol_freeOffsetBuffer(&s);
4996}
4997
4998
4999U_CFUNC void U_CALLCONV
5000ucol_calcSortKeySimpleTertiary(const UCollator *coll,
5001 const UChar *source,
5002 int32_t sourceLength,
5003 SortKeyByteSink &result,
5004 UErrorCode *status)
5005{
5006 U_ALIGN_CODE(16);
5007
5008 if(U_FAILURE(*status)) {
5009 return;
5010 }
5011
5012 /* Stack allocated buffers for buffers we use */
5013 char second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
5014
5015 SortKeyByteSink &primaries = result;
5016 SortKeyByteSink secondaries(second, LENGTHOF(second));
5017 SortKeyByteSink tertiaries(tert, LENGTHOF(tert));
5018
5019 UnicodeString normSource;
5020
5021 int32_t len = sourceLength;
5022
5023 /* If we need to normalize, we'll do it all at once at the beginning! */
5024 if(coll->normalizationMode != UCOL_OFF) {
5025 normSource.setTo(len < 0, source, len);
5026 const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status);
5027 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
5028 if(qcYesLength != normSource.length()) {
5029 UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
5030 normSource.truncate(qcYesLength);
5031 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
5032 source = normSource.getBuffer();
5033 len = normSource.length();
5034 }
5035 }
5036 collIterate s;
5037 IInit_collIterate(coll, (UChar *)source, len, &s, status);
5038 if(U_FAILURE(*status)) {
5039 return;
5040 }
5041 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized.
5042
5043 uint32_t order = 0;
5044
5045 uint8_t primary1 = 0;
5046 uint8_t primary2 = 0;
5047 uint8_t secondary = 0;
5048 uint8_t tertiary = 0;
5049 uint8_t caseSwitch = coll->caseSwitch;
5050 uint8_t tertiaryMask = coll->tertiaryMask;
5051 int8_t tertiaryAddition = coll->tertiaryAddition;
5052 uint8_t tertiaryTop = coll->tertiaryTop;
5053 uint8_t tertiaryBottom = coll->tertiaryBottom;
5054 uint8_t tertiaryCommon = coll->tertiaryCommon;
5055
5056 UBool notIsContinuation = FALSE;
5057
5058 uint32_t count2 = 0, count3 = 0;
5059 uint8_t leadPrimary = 0;
5060
5061 for(;;) {
5062 order = ucol_IGetNextCE(coll, &s, status);
5063
5064 if(order == 0) {
5065 continue;
5066 }
5067
5068 if(order == UCOL_NO_MORE_CES) {
5069 break;
5070 }
5071
5072 notIsContinuation = !isContinuation(order);
5073
5074 if(notIsContinuation) {
5075 tertiary = (uint8_t)((order & tertiaryMask));
5076 } else {
5077 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5078 }
5079
5080 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5081 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5082 primary1 = (uint8_t)(order >> 8);
5083
5084 uint8_t originalPrimary1 = primary1;
5085 if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
5086 primary1 = coll->leadBytePermutationTable[primary1];
5087 }
5088
5089 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5090 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */
5091 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
5092 /* regular and simple sortkey calc */
5093 if(primary1 != UCOL_IGNORABLE) {
5094 if(notIsContinuation) {
5095 if(leadPrimary == primary1) {
5096 primaries.Append(primary2);
5097 } else {
5098 if(leadPrimary != 0) {
5099 primaries.Append((uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN));
5100 }
5101 if(primary2 == UCOL_IGNORABLE) {
5102 /* one byter, not compressed */
5103 primaries.Append(primary1);
5104 leadPrimary = 0;
5105 } else if(isCompressible(coll, originalPrimary1)) {
5106 /* compress */
5107 primaries.Append(leadPrimary = primary1, primary2);
5108 } else {
5109 leadPrimary = 0;
5110 primaries.Append(primary1, primary2);
5111 }
5112 }
5113 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5114 if(primary2 == UCOL_IGNORABLE) {
5115 primaries.Append(primary1);
5116 } else {
5117 primaries.Append(primary1, primary2);
5118 }
5119 }
5120 }
5121
5122 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5123 /* This is compression code. */
5124 if (secondary == UCOL_COMMON2 && notIsContinuation) {
5125 ++count2;
5126 } else {
5127 if (count2 > 0) {
5128 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5129 while (count2 > UCOL_TOP_COUNT2) {
5130 secondaries.Append((uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
5131 count2 -= (uint32_t)UCOL_TOP_COUNT2;
5132 }
5133 secondaries.Append((uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
5134 } else {
5135 while (count2 > UCOL_BOT_COUNT2) {
5136 secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
5137 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5138 }
5139 secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
5140 }
5141 count2 = 0;
5142 }
5143 secondaries.Append(secondary);
5144 }
5145 }
5146
5147 if(notIsContinuation) {
5148 tertiary ^= caseSwitch;
5149 }
5150
5151 if(tertiary > 0) {
5152 /* This is compression code. */
5153 /* sequence size check is included in the if clause */
5154 if (tertiary == tertiaryCommon && notIsContinuation) {
5155 ++count3;
5156 } else {
5157 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5158 tertiary += tertiaryAddition;
5159 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5160 tertiary -= tertiaryAddition;
5161 }
5162 if (count3 > 0) {
5163 if ((tertiary > tertiaryCommon)) {
5164 while (count3 > coll->tertiaryTopCount) {
5165 tertiaries.Append((uint8_t)(tertiaryTop - coll->tertiaryTopCount));
5166 count3 -= (uint32_t)coll->tertiaryTopCount;
5167 }
5168 tertiaries.Append((uint8_t)(tertiaryTop - (count3-1)));
5169 } else {
5170 while (count3 > coll->tertiaryBottomCount) {
5171 tertiaries.Append((uint8_t)(tertiaryBottom + coll->tertiaryBottomCount));
5172 count3 -= (uint32_t)coll->tertiaryBottomCount;
5173 }
5174 tertiaries.Append((uint8_t)(tertiaryBottom + (count3-1)));
5175 }
5176 count3 = 0;
5177 }
5178 tertiaries.Append(tertiary);
5179 }
5180 }
5181 }
5182
5183 if(U_SUCCESS(*status)) {
5184 /* we have done all the CE's, now let's put them together to form a key */
5185 if (count2 > 0) {
5186 while (count2 > UCOL_BOT_COUNT2) {
5187 secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
5188 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5189 }
5190 secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
5191 }
5192 result.Append(UCOL_LEVELTERMINATOR);
5193 result.Append(secondaries);
5194
5195 if (count3 > 0) {
5196 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5197 while (count3 >= coll->tertiaryTopCount) {
5198 tertiaries.Append((uint8_t)(tertiaryTop - coll->tertiaryTopCount));
5199 count3 -= (uint32_t)coll->tertiaryTopCount;
5200 }
5201 tertiaries.Append((uint8_t)(tertiaryTop - count3));
5202 } else {
5203 while (count3 > coll->tertiaryBottomCount) {
5204 tertiaries.Append((uint8_t)(tertiaryBottom + coll->tertiaryBottomCount));
5205 count3 -= (uint32_t)coll->tertiaryBottomCount;
5206 }
5207 tertiaries.Append((uint8_t)(tertiaryBottom + (count3-1)));
5208 }
5209 }
5210 result.Append(UCOL_LEVELTERMINATOR);
5211 result.Append(tertiaries);
5212
5213 result.Append(0);
5214 }
5215
5216 /* To avoid memory leak, free the offset buffer if necessary. */
5217 ucol_freeOffsetBuffer(&s);
5218
5219 if (U_SUCCESS(*status) && !result.IsOk()) {
5220 *status = U_BUFFER_OVERFLOW_ERROR;
5221 }
5222}
5223
5224static inline
5225UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5226 UBool notIsContinuation = !isContinuation(CE);
5227 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5228 if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5229 || (!notIsContinuation && *wasShifted)))
5230 || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
5231 {
5232 // The stuff below should probably be in the sortkey code... maybe not...
5233 if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
5234 /* we should just completely ignore it */
5235 *wasShifted = TRUE;
5236 //continue;
5237 }
5238 //*wasShifted = TRUE;
5239 return TRUE;
5240 } else {
5241 *wasShifted = FALSE;
5242 return FALSE;
5243 }
5244}
5245static inline
5246void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5247 if(level < maxLevel) {
5248 dest[i++] = UCOL_LEVELTERMINATOR;
5249 } else {
5250 dest[i++] = 0;
5251 }
5252}
5253
/**
 * Level identifiers used by partial sort key generation.
 * The numeric values are explicit because the current level is stored as a
 * number in the caller-visible state word.
 */
enum {
    UCOL_PSK_PRIMARY    = 0,
    UCOL_PSK_SECONDARY  = 1,
    UCOL_PSK_CASE       = 2,
    UCOL_PSK_TERTIARY   = 3,
    UCOL_PSK_QUATERNARY = 4,
    /** extra level, not used - but we have three bits to blow */
    UCOL_PSK_QUIN       = 5,
    UCOL_PSK_IDENTICAL  = 6,
    /** level for the end of the sort key; will just produce zeros */
    UCOL_PSK_NULL       = 7,
    /** number of level identifiers */
    UCOL_PSK_LIMIT      = 8
};
5266
/**
 * Layout of the collation processing state kept in state[1].
 * To read a field, shift the state word right by the field's *_SHIFT value
 * and AND the result with the field's *_MASK value.
 */
enum {
    /** level identifier: one of the UCOL_PSK_* level values, three bits wide */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK  = 0x7,

    /**
     * Number of bytes of a primary or quaternary already written. Can only be
     * 0 or 1, since we get up to two bytes from primary or quaternary.
     * This field is also used to denote that the French secondary level is
     * finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK  = 0x1,

    /** was the last value shifted: 0 or 1 (Boolean) */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK  = 0x1,

    /**
     * How many French bytes have we already written (two bits).
     * When we do French we need to reverse secondary values. However,
     * continuations need to stay the same. So if you had abc1c2c3de, you need
     * to have edc1c2c3ba.
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK  = 0x3,

    /** two-bit field at bits 7 and 8 */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK  = 0x3,

    /**
     * Field starting at bit 9.
     * NOTE(review): this mask is 19 bits wide, while the ucol_nextSortKeyPart
     * documentation describes the field as bits 9..31 - confirm which is intended.
     */
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK  = 0x7FFFF
};
5292
// Macro calculating the number of expansion CEs available in iterator state `s`,
// i.e. the distance between its CEpos and toReturn members.
// The expansion is parenthesized as a whole so that the macro can be used
// safely inside larger expressions (comparisons, arithmetic, negation).
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5295
5296
5297/** main sortkey part procedure. On the first call,
5298 * you should pass in a collator, an iterator, empty state
5299 * state[0] == state[1] == 0, a buffer to hold results
5300 * number of bytes you need and an error code pointer.
5301 * Make sure your buffer is big enough to hold the wanted
5302 * number of sortkey bytes. I don't check.
5303 * The only meaningful status you can get back is
5304 * U_BUFFER_OVERFLOW_ERROR, which basically means that you
5305 * have been dealt a raw deal and that you probably won't
5306 * be able to use partial sortkey generation for this
5307 * particular combination of string and collator. This
5308 * is highly unlikely, but you should still check the error code.
5309 * Any other status means that you're not in a sane situation
5310 * anymore. After the first call, preserve state values and
5311 * use them on subsequent calls to obtain more bytes of a sortkey.
5312 * Use until the number of bytes written is smaller than the requested
5313 * number of bytes. Generated sortkey is not compatible with the
5314 * one generated by ucol_getSortKey, as we don't do any compression.
5315 * However, levels are still terminated by a 1 (one) and the sortkey
5316 * is terminated by a 0 (zero). Identical level is the same as in the
5317 * regular sortkey - internal bocu-1 implementation is used.
5318 * For curious, although you cannot do much about this, here is
5319 * the structure of state words.
5320 * state[0] - iterator state. Depends on the iterator implementation,
5321 * but allows the iterator to continue where it stopped in
5322 * the last iteration.
5323 * state[1] - collation processing state. Here is the distribution
5324 * of the bits:
5325 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5326 * quaternary, quin (we don't use this one), identical and
5327 * null (producing only zeroes - first one to terminate the
5328 * sortkey and subsequent to fill the buffer).
5329 * 3 - byte count. Number of bytes written on the primary level.
5330 * 4 - was shifted. Whether the previous iteration finished in the
5331 * shifted state.
5332 * 5, 6 - French continuation bytes written. See the comment in the enum
5333 * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on
5334 * the identical level.
5335 * 9..31 - CEs consumed. Number of getCE or next32 operations performed
5336 * since the last successful update of the iterator state.
5337 */
5338U_CAPI int32_t U_EXPORT2
5339ucol_nextSortKeyPart(const UCollator *coll,
5340 UCharIterator *iter,
5341 uint32_t state[2],
5342 uint8_t *dest, int32_t count,
5343 UErrorCode *status)
5344{
5345 /* error checking */
5346 if(status==NULL || U_FAILURE(*status)) {
5347 return 0;
5348 }
5349 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
5350 if( coll==NULL || iter==NULL ||
5351 state==NULL ||
5352 count<0 || (count>0 && dest==NULL)
5353 ) {
5354 *status=U_ILLEGAL_ARGUMENT_ERROR;
5355 UTRACE_EXIT_STATUS(status);
5356 return 0;
5357 }
5358
5359 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5360 coll, iter, state[0], state[1], dest, count);
5361
5362 if(count==0) {
5363 /* nothing to do */
5364 UTRACE_EXIT_VALUE(0);
5365 return 0;
5366 }
5367 /** Setting up situation according to the state we got from the previous iteration */
5368 // The state of the iterator from the previous invocation
5369 uint32_t iterState = state[0];
5370 // Has the last iteration ended in the shifted state
5371 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
5372 // What is the current level of the sortkey?
5373 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
5374 // Have we written only one byte from a two byte primary in the previous iteration?
5375 // Also on secondary level - have we finished with the French secondary?
5376 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
5377 // number of bytes in the continuation buffer for French
5378 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
5379 // Number of bytes already written from a bocsu sequence. Since
5380 // the longes bocsu sequence is 4 long, this can be up to 3.
5381 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
5382 // Number of elements that need to be consumed in this iteration because
5383 // the iterator returned UITER_NO_STATE at the end of the last iteration,
5384 // so we had to save the last valid state.
5385 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
5386
5387 /** values that depend on the collator attributes */
5388 // strength of the collator.
5389 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
5390 // maximal level of the partial sortkey. Need to take whether case level is done
5391 int32_t maxLevel = 0;
5392 if(strength < UCOL_TERTIARY) {
5393 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5394 maxLevel = UCOL_PSK_CASE;
5395 } else {
5396 maxLevel = strength;
5397 }
5398 } else {
5399 if(strength == UCOL_TERTIARY) {
5400 maxLevel = UCOL_PSK_TERTIARY;
5401 } else if(strength == UCOL_QUATERNARY) {
5402 maxLevel = UCOL_PSK_QUATERNARY;
5403 } else { // identical
5404 maxLevel = UCOL_IDENTICAL;
5405 }
5406 }
5407 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5408 uint8_t UCOL_HIRAGANA_QUAD =
5409 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
5410 // Boundary value that decides whether a CE is shifted or not
5411 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
5412 // Are we doing French collation?
5413 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
5414
5415 /** initializing the collation state */
5416 UBool notIsContinuation = FALSE;
5417 uint32_t CE = UCOL_NO_MORE_CES;
5418
5419 collIterate s;
5420 IInit_collIterate(coll, NULL, -1, &s, status);
5421 if(U_FAILURE(*status)) {
5422 UTRACE_EXIT_STATUS(*status);
5423 return 0;
5424 }
5425 s.iterator = iter;
5426 s.flags |= UCOL_USE_ITERATOR;
5427 // This variable tells us whether we have produced some other levels in this iteration
5428 // before we moved to the identical level. In that case, we need to switch the
5429 // type of the iterator.
5430 UBool doingIdenticalFromStart = FALSE;
5431 // Normalizing iterator
5432 // The division for the array length may truncate the array size to
5433 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5434 // for all platforms anyway.
5435 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
5436 UNormIterator *normIter = NULL;
5437 // If the normalization is turned on for the collator and we are below identical level
5438 // we will use a FCD normalizing iterator
5439 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
5440 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5441 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
5442 s.flags &= ~UCOL_ITER_NORM;
5443 if(U_FAILURE(*status)) {
5444 UTRACE_EXIT_STATUS(*status);
5445 return 0;
5446 }
5447 } else if(level == UCOL_PSK_IDENTICAL) {
5448 // for identical level, we need a NFD iterator. We need to instantiate it here, since we
5449 // will be updating the state - and this cannot be done on an ordinary iterator.
5450 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5451 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5452 s.flags &= ~UCOL_ITER_NORM;
5453 if(U_FAILURE(*status)) {
5454 UTRACE_EXIT_STATUS(*status);
5455 return 0;
5456 }
5457 doingIdenticalFromStart = TRUE;
5458 }
5459
5460 // This is the tentative new state of the iterator. The problem
5461 // is that the iterator might return an undefined state, in
5462 // which case we should save the last valid state and increase
5463 // the iterator skip value.
5464 uint32_t newState = 0;
5465
5466 // First, we set the iterator to the last valid position
5467 // from the last iteration. This was saved in state[0].
5468 if(iterState == 0) {
5469 /* initial state */
5470 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
5471 s.iterator->move(s.iterator, 0, UITER_LIMIT);
5472 } else {
5473 s.iterator->move(s.iterator, 0, UITER_START);
5474 }
5475 } else {
5476 /* reset to previous state */
5477 s.iterator->setState(s.iterator, iterState, status);
5478 if(U_FAILURE(*status)) {
5479 UTRACE_EXIT_STATUS(*status);
5480 return 0;
5481 }
5482 }
5483
5484
5485
5486 // This variable tells us whether we can attempt to update the state
5487 // of iterator. Situations where we don't want to update iterator state
5488 // are the existence of expansion CEs that are not yet processed, and
5489 // finishing the case level without enough space in the buffer to insert
5490 // a level terminator.
5491 UBool canUpdateState = TRUE;
5492
5493 // Consume all the CEs that were consumed at the end of the previous
5494 // iteration without updating the iterator state. On identical level,
5495 // consume the code points.
5496 int32_t counter = cces;
5497 if(level < UCOL_PSK_IDENTICAL) {
5498 while(counter-->0) {
5499 // If we're doing French and we are on the secondary level,
5500 // we go backwards.
5501 if(level == UCOL_PSK_SECONDARY && doingFrench) {
5502 CE = ucol_IGetPrevCE(coll, &s, status);
5503 } else {
5504 CE = ucol_IGetNextCE(coll, &s, status);
5505 }
5506 if(CE==UCOL_NO_MORE_CES) {
5507 /* should not happen */
5508 *status=U_INTERNAL_PROGRAM_ERROR;
5509 UTRACE_EXIT_STATUS(*status);
5510 return 0;
5511 }
5512 if(uprv_numAvailableExpCEs(s)) {
5513 canUpdateState = FALSE;
5514 }
5515 }
5516 } else {
5517 while(counter-->0) {
5518 uiter_next32(s.iterator);
5519 }
5520 }
5521
5522 // French secondary needs to know whether the iterator state of zero came from previous level OR
5523 // from a new invocation...
5524 UBool wasDoingPrimary = FALSE;
5525 // destination buffer byte counter. When this guy
5526 // gets to count, we're done with the iteration
5527 int32_t i = 0;
5528 // used to count the zero bytes written after we
5529 // have finished with the sort key
5530 int32_t j = 0;
5531
5532
5533 // Hm.... I think we're ready to plunge in. Basic story is as following:
5534 // we have a fall through case based on level. This is used for initial
5535 // positioning on iteration start. Every level processor contains a
5536 // for(;;) which will be broken when we exhaust all the CEs. Other
5537 // way to exit is a goto saveState, which happens when we have filled
5538 // out our buffer.
5539 switch(level) {
5540 case UCOL_PSK_PRIMARY:
5541 wasDoingPrimary = TRUE;
5542 for(;;) {
5543 if(i==count) {
5544 goto saveState;
5545 }
5546 // We should save the state only if we
5547 // are sure that we are done with the
5548 // previous iterator state
5549 if(canUpdateState && byteCountOrFrenchDone == 0) {
5550 newState = s.iterator->getState(s.iterator);
5551 if(newState != UITER_NO_STATE) {
5552 iterState = newState;
5553 cces = 0;
5554 }
5555 }
5556 CE = ucol_IGetNextCE(coll, &s, status);
5557 cces++;
5558 if(CE==UCOL_NO_MORE_CES) {
5559 // Add the level separator
5560 terminatePSKLevel(level, maxLevel, i, dest);
5561 byteCountOrFrenchDone=0;
5562 // Restart the iteration an move to the
5563 // second level
5564 s.iterator->move(s.iterator, 0, UITER_START);
5565 cces = 0;
5566 level = UCOL_PSK_SECONDARY;
5567 break;
5568 }
5569 if(!isContinuation(CE)){
5570 if(coll->leadBytePermutationTable != NULL){
5571 CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF);
5572 }
5573 }
5574 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5575 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
5576 if(CE != 0) {
5577 if(byteCountOrFrenchDone == 0) {
5578 // get the second byte of primary
5579 dest[i++]=(uint8_t)(CE >> 8);
5580 } else {
5581 byteCountOrFrenchDone = 0;
5582 }
5583 if((CE &=0xff)!=0) {
5584 if(i==count) {
5585 /* overflow */
5586 byteCountOrFrenchDone = 1;
5587 cces--;
5588 goto saveState;
5589 }
5590 dest[i++]=(uint8_t)CE;
5591 }
5592 }
5593 }
5594 if(uprv_numAvailableExpCEs(s)) {
5595 canUpdateState = FALSE;
5596 } else {
5597 canUpdateState = TRUE;
5598 }
5599 }
5600 /* fall through to next level */
5601 case UCOL_PSK_SECONDARY:
5602 if(strength >= UCOL_SECONDARY) {
5603 if(!doingFrench) {
5604 for(;;) {
5605 if(i == count) {
5606 goto saveState;
5607 }
5608 // We should save the state only if we
5609 // are sure that we are done with the
5610 // previous iterator state
5611 if(canUpdateState) {
5612 newState = s.iterator->getState(s.iterator);
5613 if(newState != UITER_NO_STATE) {
5614 iterState = newState;
5615 cces = 0;
5616 }
5617 }
5618 CE = ucol_IGetNextCE(coll, &s, status);
5619 cces++;
5620 if(CE==UCOL_NO_MORE_CES) {
5621 // Add the level separator
5622 terminatePSKLevel(level, maxLevel, i, dest);
5623 byteCountOrFrenchDone = 0;
5624 // Restart the iteration an move to the
5625 // second level
5626 s.iterator->move(s.iterator, 0, UITER_START);
5627 cces = 0;
5628 level = UCOL_PSK_CASE;
5629 break;
5630 }
5631 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5632 CE >>= 8; /* get secondary */
5633 if(CE != 0) {
5634 dest[i++]=(uint8_t)CE;
5635 }
5636 }
5637 if(uprv_numAvailableExpCEs(s)) {
5638 canUpdateState = FALSE;
5639 } else {
5640 canUpdateState = TRUE;
5641 }
5642 }
5643 } else { // French secondary processing
5644 uint8_t frenchBuff[UCOL_MAX_BUFFER];
5645 int32_t frenchIndex = 0;
5646 // Here we are going backwards.
5647 // If the iterator is at the beggining, it should be
5648 // moved to end.
5649 if(wasDoingPrimary) {
5650 s.iterator->move(s.iterator, 0, UITER_LIMIT);
5651 cces = 0;
5652 }
5653 for(;;) {
5654 if(i == count) {
5655 goto saveState;
5656 }
5657 if(canUpdateState) {
5658 newState = s.iterator->getState(s.iterator);
5659 if(newState != UITER_NO_STATE) {
5660 iterState = newState;
5661 cces = 0;
5662 }
5663 }
5664 CE = ucol_IGetPrevCE(coll, &s, status);
5665 cces++;
5666 if(CE==UCOL_NO_MORE_CES) {
5667 // Add the level separator
5668 terminatePSKLevel(level, maxLevel, i, dest);
5669 byteCountOrFrenchDone = 0;
5670 // Restart the iteration an move to the next level
5671 s.iterator->move(s.iterator, 0, UITER_START);
5672 level = UCOL_PSK_CASE;
5673 break;
5674 }
5675 if(isContinuation(CE)) { // if it's a continuation, we want to save it and
5676 // reverse when we get a first non-continuation CE.
5677 CE >>= 8;
5678 frenchBuff[frenchIndex++] = (uint8_t)CE;
5679 } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
5680 CE >>= 8; /* get secondary */
5681 if(!frenchIndex) {
5682 if(CE != 0) {
5683 dest[i++]=(uint8_t)CE;
5684 }
5685 } else {
5686 frenchBuff[frenchIndex++] = (uint8_t)CE;
5687 frenchIndex -= usedFrench;
5688 usedFrench = 0;
5689 while(i < count && frenchIndex) {
5690 dest[i++] = frenchBuff[--frenchIndex];
5691 usedFrench++;
5692 }
5693 }
5694 }
5695 if(uprv_numAvailableExpCEs(s)) {
5696 canUpdateState = FALSE;
5697 } else {
5698 canUpdateState = TRUE;
5699 }
5700 }
5701 }
5702 } else {
5703 level = UCOL_PSK_CASE;
5704 }
5705 /* fall through to next level */
5706 case UCOL_PSK_CASE:
5707 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5708 uint32_t caseShift = UCOL_CASE_SHIFT_START;
5709 uint8_t caseByte = UCOL_CASE_BYTE_START;
5710 uint8_t caseBits = 0;
5711
5712 for(;;) {
5713 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START);
5714 if(i == count) {
5715 goto saveState;
5716 }
5717 // We should save the state only if we
5718 // are sure that we are done with the
5719 // previous iterator state
5720 if(canUpdateState) {
5721 newState = s.iterator->getState(s.iterator);
5722 if(newState != UITER_NO_STATE) {
5723 iterState = newState;
5724 cces = 0;
5725 }
5726 }
5727 CE = ucol_IGetNextCE(coll, &s, status);
5728 cces++;
5729 if(CE==UCOL_NO_MORE_CES) {
5730 // On the case level we might have an unfinished
5731 // case byte. Add one if it's started.
5732 if(caseShift != UCOL_CASE_SHIFT_START) {
5733 dest[i++] = caseByte;
5734 }
5735 cces = 0;
5736 // We have finished processing CEs on this level.
5737 // However, we don't know if we have enough space
5738 // to add a case level terminator.
5739 if(i < count) {
5740 // Add the level separator
5741 terminatePSKLevel(level, maxLevel, i, dest);
5742 // Restart the iteration and move to the
5743 // next level
5744 s.iterator->move(s.iterator, 0, UITER_START);
5745 level = UCOL_PSK_TERTIARY;
5746 } else {
5747 canUpdateState = FALSE;
5748 }
5749 break;
5750 }
5751
5752 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5753 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
5754 // do the case level if we need to do it. We don't want to calculate
5755 // case level for primary ignorables if we have only primary strength and case level
5756 // otherwise we would break well formedness of CEs
5757 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5758 caseBits = (uint8_t)(CE & 0xC0);
5759 // this copies the case level logic from the
5760 // sort key generation code
5761 if(CE != 0) {
5762 if (caseShift == 0) {
5763 dest[i++] = caseByte;
5764 caseShift = UCOL_CASE_SHIFT_START;
5765 caseByte = UCOL_CASE_BYTE_START;
5766 }
5767 if(coll->caseFirst == UCOL_UPPER_FIRST) {
5768 if((caseBits & 0xC0) == 0) {
5769 caseByte |= 1 << (--caseShift);
5770 } else {
5771 caseByte |= 0 << (--caseShift);
5772 /* second bit */
5773 if(caseShift == 0) {
5774 dest[i++] = caseByte;
5775 caseShift = UCOL_CASE_SHIFT_START;
5776 caseByte = UCOL_CASE_BYTE_START;
5777 }
5778 caseByte |= ((caseBits>>6)&1) << (--caseShift);
5779 }
5780 } else {
5781 if((caseBits & 0xC0) == 0) {
5782 caseByte |= 0 << (--caseShift);
5783 } else {
5784 caseByte |= 1 << (--caseShift);
5785 /* second bit */
5786 if(caseShift == 0) {
5787 dest[i++] = caseByte;
5788 caseShift = UCOL_CASE_SHIFT_START;
5789 caseByte = UCOL_CASE_BYTE_START;
5790 }
5791 caseByte |= ((caseBits>>7)&1) << (--caseShift);
5792 }
5793 }
5794 }
5795
5796 }
5797 }
5798 // Not sure this is correct for the case level - revisit
5799 if(uprv_numAvailableExpCEs(s)) {
5800 canUpdateState = FALSE;
5801 } else {
5802 canUpdateState = TRUE;
5803 }
5804 }
5805 } else {
5806 level = UCOL_PSK_TERTIARY;
5807 }
5808 /* fall through to next level */
5809 case UCOL_PSK_TERTIARY:
5810 if(strength >= UCOL_TERTIARY) {
5811 for(;;) {
5812 if(i == count) {
5813 goto saveState;
5814 }
5815 // We should save the state only if we
5816 // are sure that we are done with the
5817 // previous iterator state
5818 if(canUpdateState) {
5819 newState = s.iterator->getState(s.iterator);
5820 if(newState != UITER_NO_STATE) {
5821 iterState = newState;
5822 cces = 0;
5823 }
5824 }
5825 CE = ucol_IGetNextCE(coll, &s, status);
5826 cces++;
5827 if(CE==UCOL_NO_MORE_CES) {
5828 // Add the level separator
5829 terminatePSKLevel(level, maxLevel, i, dest);
5830 byteCountOrFrenchDone = 0;
5831 // Restart the iteration an move to the
5832 // second level
5833 s.iterator->move(s.iterator, 0, UITER_START);
5834 cces = 0;
5835 level = UCOL_PSK_QUATERNARY;
5836 break;
5837 }
5838 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5839 notIsContinuation = !isContinuation(CE);
5840
5841 if(notIsContinuation) {
5842 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5843 CE ^= coll->caseSwitch;
5844 CE &= coll->tertiaryMask;
5845 } else {
5846 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
5847 }
5848
5849 if(CE != 0) {
5850 dest[i++]=(uint8_t)CE;
5851 }
5852 }
5853 if(uprv_numAvailableExpCEs(s)) {
5854 canUpdateState = FALSE;
5855 } else {
5856 canUpdateState = TRUE;
5857 }
5858 }
5859 } else {
5860 // if we're not doing tertiary
5861 // skip to the end
5862 level = UCOL_PSK_NULL;
5863 }
5864 /* fall through to next level */
5865 case UCOL_PSK_QUATERNARY:
5866 if(strength >= UCOL_QUATERNARY) {
5867 for(;;) {
5868 if(i == count) {
5869 goto saveState;
5870 }
5871 // We should save the state only if we
5872 // are sure that we are done with the
5873 // previous iterator state
5874 if(canUpdateState) {
5875 newState = s.iterator->getState(s.iterator);
5876 if(newState != UITER_NO_STATE) {
5877 iterState = newState;
5878 cces = 0;
5879 }
5880 }
5881 CE = ucol_IGetNextCE(coll, &s, status);
5882 cces++;
5883 if(CE==UCOL_NO_MORE_CES) {
5884 // Add the level separator
5885 terminatePSKLevel(level, maxLevel, i, dest);
5886 //dest[i++] = UCOL_LEVELTERMINATOR;
5887 byteCountOrFrenchDone = 0;
5888 // Restart the iteration an move to the
5889 // second level
5890 s.iterator->move(s.iterator, 0, UITER_START);
5891 cces = 0;
5892 level = UCOL_PSK_QUIN;
5893 break;
5894 }
5895 if(CE==0)
5896 continue;
5897 if(isShiftedCE(CE, LVT, &wasShifted)) {
5898 CE >>= 16; /* get primary */
5899 if(CE != 0) {
5900 if(byteCountOrFrenchDone == 0) {
5901 dest[i++]=(uint8_t)(CE >> 8);
5902 } else {
5903 byteCountOrFrenchDone = 0;
5904 }
5905 if((CE &=0xff)!=0) {
5906 if(i==count) {
5907 /* overflow */
5908 byteCountOrFrenchDone = 1;
5909 goto saveState;
5910 }
5911 dest[i++]=(uint8_t)CE;
5912 }
5913 }
5914 } else {
5915 notIsContinuation = !isContinuation(CE);
5916 if(notIsContinuation) {
5917 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
5918 dest[i++] = UCOL_HIRAGANA_QUAD;
5919 } else {
5920 dest[i++] = 0xFF;
5921 }
5922 }
5923 }
5924 if(uprv_numAvailableExpCEs(s)) {
5925 canUpdateState = FALSE;
5926 } else {
5927 canUpdateState = TRUE;
5928 }
5929 }
5930 } else {
5931 // if we're not doing quaternary
5932 // skip to the end
5933 level = UCOL_PSK_NULL;
5934 }
5935 /* fall through to next level */
5936 case UCOL_PSK_QUIN:
5937 level = UCOL_PSK_IDENTICAL;
5938 /* fall through to next level */
5939 case UCOL_PSK_IDENTICAL:
5940 if(strength >= UCOL_IDENTICAL) {
5941 UChar32 first, second;
5942 int32_t bocsuBytesWritten = 0;
5943 // We always need to do identical on
5944 // the NFD form of the string.
5945 if(normIter == NULL) {
5946 // we arrived from the level below and
5947 // normalization was not turned on.
5948 // therefore, we need to make a fresh NFD iterator
5949 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5950 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5951 } else if(!doingIdenticalFromStart) {
5952 // there is an iterator, but we did some other levels.
5953 // therefore, we have a FCD iterator - need to make
5954 // a NFD one.
5955 // normIter being at the beginning does not guarantee
5956 // that the underlying iterator is at the beginning
5957 iter->move(iter, 0, UITER_START);
5958 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5959 }
5960 // At this point we have a NFD iterator that is positioned
5961 // in the right place
5962 if(U_FAILURE(*status)) {
5963 UTRACE_EXIT_STATUS(*status);
5964 return 0;
5965 }
5966 first = uiter_previous32(s.iterator);
5967 // maybe we're at the start of the string
5968 if(first == U_SENTINEL) {
5969 first = 0;
5970 } else {
5971 uiter_next32(s.iterator);
5972 }
5973
5974 j = 0;
5975 for(;;) {
5976 if(i == count) {
5977 if(j+1 < bocsuBytesWritten) {
5978 bocsuBytesUsed = j+1;
5979 }
5980 goto saveState;
5981 }
5982
5983 // On identical level, we will always save
5984 // the state if we reach this point, since
5985 // we don't depend on getNextCE for content
5986 // all the content is in our buffer and we
5987 // already either stored the full buffer OR
5988 // otherwise we won't arrive here.
5989 newState = s.iterator->getState(s.iterator);
5990 if(newState != UITER_NO_STATE) {
5991 iterState = newState;
5992 cces = 0;
5993 }
5994
5995 uint8_t buff[4];
5996 second = uiter_next32(s.iterator);
5997 cces++;
5998
5999 // end condition for identical level
6000 if(second == U_SENTINEL) {
6001 terminatePSKLevel(level, maxLevel, i, dest);
6002 level = UCOL_PSK_NULL;
6003 break;
6004 }
6005 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
6006 first = second;
6007
6008 j = 0;
6009 if(bocsuBytesUsed != 0) {
6010 while(bocsuBytesUsed-->0) {
6011 j++;
6012 }
6013 }
6014
6015 while(i < count && j < bocsuBytesWritten) {
6016 dest[i++] = buff[j++];
6017 }
6018 }
6019
6020 } else {
6021 level = UCOL_PSK_NULL;
6022 }
6023 /* fall through to next level */
6024 case UCOL_PSK_NULL:
6025 j = i;
6026 while(j<count) {
6027 dest[j++]=0;
6028 }
6029 break;
6030 default:
6031 *status = U_INTERNAL_PROGRAM_ERROR;
6032 UTRACE_EXIT_STATUS(*status);
6033 return 0;
6034 }
6035
6036saveState:
6037 // Now we need to return stuff. First we want to see whether we have
6038 // done everything for the current state of iterator.
6039 if(byteCountOrFrenchDone
6040 || canUpdateState == FALSE
6041 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
6042 {
6043 // Any of above mean that the previous transaction
6044 // wasn't finished and that we should store the
6045 // previous iterator state.
6046 state[0] = iterState;
6047 } else {
6048 // The transaction is complete. We will continue in the next iteration.
6049 state[0] = s.iterator->getState(s.iterator);
6050 cces = 0;
6051 }
6052 // Store the number of bocsu bytes written.
6053 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
6054 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6055 }
6056 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
6057
6058 // Next we put in the level of comparison
6059 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6060
6061 // If we are doing French, we need to store whether we have just finished the French level
6062 if(level == UCOL_PSK_SECONDARY && doingFrench) {
6063 state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6064 } else {
6065 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6066 }
6067
6068 // Was the latest CE shifted
6069 if(wasShifted) {
6070 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6071 }
6072 // Check for cces overflow
6073 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
6074 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6075 }
6076 // Store cces
6077 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
6078
6079 // Check for French overflow
6080 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6081 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6082 }
6083 // Store number of bytes written in the French secondary continuation sequence
6084 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6085
6086
6087 // If we have used normalizing iterator, get rid of it
6088 if(normIter != NULL) {
6089 unorm_closeIter(normIter);
6090 }
6091
6092 /* To avoid memory leak, free the offset buffer if necessary. */
6093 ucol_freeOffsetBuffer(&s);
6094
6095 // Return number of meaningful sortkey bytes.
6096 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6097 dest,i, state[0], state[1]);
6098 UTRACE_EXIT_VALUE(i);
6099 return i;
6100}
6101
6102/**
6103 * Produce a bound for a given sortkey and a number of levels.
6104 */
6105U_CAPI int32_t U_EXPORT2
6106ucol_getBound(const uint8_t *source,
6107 int32_t sourceLength,
6108 UColBoundMode boundType,
6109 uint32_t noOfLevels,
6110 uint8_t *result,
6111 int32_t resultLength,
6112 UErrorCode *status)
6113{
6114 // consistency checks
6115 if(status == NULL || U_FAILURE(*status)) {
6116 return 0;
6117 }
6118 if(source == NULL) {
6119 *status = U_ILLEGAL_ARGUMENT_ERROR;
6120 return 0;
6121 }
6122
6123 int32_t sourceIndex = 0;
6124 // Scan the string until we skip enough of the key OR reach the end of the key
6125 do {
6126 sourceIndex++;
6127 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6128 noOfLevels--;
6129 }
6130 } while (noOfLevels > 0
6131 && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6132
6133 if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6134 && noOfLevels > 0) {
6135 *status = U_SORT_KEY_TOO_SHORT_WARNING;
6136 }
6137
6138
6139 // READ ME: this code assumes that the values for boundType
6140 // enum will not changes. They are set so that the enum value
6141 // corresponds to the number of extra bytes each bound type
6142 // needs.
6143 if(result != NULL && resultLength >= sourceIndex+boundType) {
6144 uprv_memcpy(result, source, sourceIndex);
6145 switch(boundType) {
6146 // Lower bound just gets terminated. No extra bytes
6147 case UCOL_BOUND_LOWER: // = 0
6148 break;
6149 // Upper bound needs one extra byte
6150 case UCOL_BOUND_UPPER: // = 1
6151 result[sourceIndex++] = 2;
6152 break;
6153 // Upper long bound needs two extra bytes
6154 case UCOL_BOUND_UPPER_LONG: // = 2
6155 result[sourceIndex++] = 0xFF;
6156 result[sourceIndex++] = 0xFF;
6157 break;
6158 default:
6159 *status = U_ILLEGAL_ARGUMENT_ERROR;
6160 return 0;
6161 }
6162 result[sourceIndex++] = 0;
6163
6164 return sourceIndex;
6165 } else {
6166 return sourceIndex+boundType+1;
6167 }
6168}
6169
6170/****************************************************************************/
6171/* Following are the functions that deal with the properties of a collator */
6172/* there are new APIs and some compatibility APIs */
6173/****************************************************************************/
6174
6175static inline void
6176ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6177 int32_t *primShift, int32_t *secShift, int32_t *terShift)
6178{
6179 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6180 UBool reverseSecondary = FALSE;
6181 UBool continuation = isContinuation(CE);
6182 if(!continuation) {
6183 tertiary = (uint8_t)((CE & coll->tertiaryMask));
6184 tertiary ^= coll->caseSwitch;
6185 reverseSecondary = TRUE;
6186 } else {
6187 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6188 tertiary &= UCOL_REMOVE_CASE;
6189 reverseSecondary = FALSE;
6190 }
6191
6192 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6193 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6194 primary1 = (uint8_t)(CE >> 8);
6195
6196 if(primary1 != 0) {
6197 if (coll->leadBytePermutationTable != NULL && !continuation) {
6198 primary1 = coll->leadBytePermutationTable[primary1];
6199 }
6200
6201 coll->latinOneCEs[ch] |= (primary1 << *primShift);
6202 *primShift -= 8;
6203 }
6204 if(primary2 != 0) {
6205 if(*primShift < 0) {
6206 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6207 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6208 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6209 return;
6210 }
6211 coll->latinOneCEs[ch] |= (primary2 << *primShift);
6212 *primShift -= 8;
6213 }
6214 if(secondary != 0) {
6215 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6216 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6217 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6218 } else { // normal case
6219 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6220 }
6221 *secShift -= 8;
6222 }
6223 if(tertiary != 0) {
6224 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6225 *terShift -= 8;
6226 }
6227}
6228
6229static inline UBool
6230ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6231 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6232 if(newTable == NULL) {
6233 *status = U_MEMORY_ALLOCATION_ERROR;
6234 coll->latinOneFailed = TRUE;
6235 return FALSE;
6236 }
6237 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6238 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6239 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6240 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6241 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6242 coll->latinOneTableLen = size;
6243 uprv_free(coll->latinOneCEs);
6244 coll->latinOneCEs = newTable;
6245 return TRUE;
6246}
6247
6248static UBool
6249ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
6250 UBool result = TRUE;
6251 if(coll->latinOneCEs == NULL) {
6252 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
6253 if(coll->latinOneCEs == NULL) {
6254 *status = U_MEMORY_ALLOCATION_ERROR;
6255 return FALSE;
6256 }
6257 coll->latinOneTableLen = UCOL_LATINONETABLELEN;
6258 }
6259 UChar ch = 0;
6260 UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
6261 // Check for null pointer
6262 if (U_FAILURE(*status)) {
6263 return FALSE;
6264 }
6265 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
6266
6267 int32_t primShift = 24, secShift = 24, terShift = 24;
6268 uint32_t CE = 0;
6269 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
6270
6271 // TODO: make safe if you get more than you wanted...
6272 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
6273 primShift = 24; secShift = 24; terShift = 24;
6274 if(ch < 0x100) {
6275 CE = coll->latinOneMapping[ch];
6276 } else {
6277 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
6278 if(CE == UCOL_NOT_FOUND && coll->UCA) {
6279 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
6280 }
6281 }
6282 if(CE < UCOL_NOT_FOUND) {
6283 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6284 } else {
6285 switch (getCETag(CE)) {
6286 case EXPANSION_TAG:
6287 case DIGIT_TAG:
6288 ucol_setText(it, &ch, 1, status);
6289 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
6290 if(primShift < 0 || secShift < 0 || terShift < 0) {
6291 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6292 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6293 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6294 break;
6295 }
6296 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6297 }
6298 break;
6299 case CONTRACTION_TAG:
6300 // here is the trick
6301 // F2 is contraction. We do something very similar to contractions
6302 // but have two indices, one in the real contraction table and the
6303 // other to where we stuffed things. This hopes that we don't have
6304 // many contractions (this should work for latin-1 tables).
6305 {
6306 if((CE & 0x00FFF000) != 0) {
6307 *status = U_UNSUPPORTED_ERROR;
6308 goto cleanup_after_failure;
6309 }
6310
6311 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
6312
6313 CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
6314
6315 coll->latinOneCEs[ch] = CE;
6316 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
6317 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
6318
6319 // We're going to jump into contraction table, pick the elements
6320 // and use them
6321 do {
6322 CE = *(coll->contractionCEs +
6323 (UCharOffset - coll->contractionIndex));
6324 if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
6325 uint32_t size;
6326 uint32_t i; /* general counter */
6327 uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
6328 size = getExpansionCount(CE);
6329 //CE = *CEOffset++;
6330 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
6331 for(i = 0; i<size; i++) {
6332 if(primShift < 0 || secShift < 0 || terShift < 0) {
6333 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6334 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6335 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6336 break;
6337 }
6338 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6339 }
6340 } else { /* else, we do */
6341 while(*CEOffset != 0) {
6342 if(primShift < 0 || secShift < 0 || terShift < 0) {
6343 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6344 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6345 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6346 break;
6347 }
6348 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6349 }
6350 }
6351 contractionOffset++;
6352 } else if(CE < UCOL_NOT_FOUND) {
6353 ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
6354 } else {
6355 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6356 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6357 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6358 contractionOffset++;
6359 }
6360 UCharOffset++;
6361 primShift = 24; secShift = 24; terShift = 24;
6362 if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
6363 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
6364 goto cleanup_after_failure;
6365 }
6366 }
6367 } while(*UCharOffset != 0xFFFF);
6368 }
6369 break;;
6370 case SPEC_PROC_TAG:
6371 {
6372 // 0xB7 is a precontext character defined in UCA5.1, a special
6373 // handle is implemeted in order to save LatinOne table for
6374 // most locales.
6375 if (ch==0xb7) {
6376 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6377 }
6378 else {
6379 goto cleanup_after_failure;
6380 }
6381 }
6382 break;
6383 default:
6384 goto cleanup_after_failure;
6385 }
6386 }
6387 }
6388 // compact table
6389 if(contractionOffset < coll->latinOneTableLen) {
6390 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
6391 goto cleanup_after_failure;
6392 }
6393 }
6394 ucol_closeElements(it);
6395 return result;
6396
6397cleanup_after_failure:
6398 // status should already be set before arriving here.
6399 coll->latinOneFailed = TRUE;
6400 ucol_closeElements(it);
6401 return FALSE;
6402}
6403
6404void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
6405 if(U_SUCCESS(*status)) {
6406 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6407 coll->caseSwitch = UCOL_CASE_SWITCH;
6408 } else {
6409 coll->caseSwitch = UCOL_NO_CASE_SWITCH;
6410 }
6411
6412 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
6413 coll->tertiaryMask = UCOL_REMOVE_CASE;
6414 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6415 coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
6416 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
6417 coll->tertiaryBottom = UCOL_COMMON_BOT3;
6418 } else {
6419 coll->tertiaryMask = UCOL_KEEP_CASE;
6420 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
6421 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6422 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
6423 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
6424 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
6425 } else {
6426 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6427 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
6428 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
6429 }
6430 }
6431
6432 /* Set the compression values */
6433 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBottom - 1);
6434 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
6435 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
6436
6437 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
6438 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
6439 {
6440 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
6441 } else {
6442 coll->sortKeyGen = ucol_calcSortKey;
6443 }
6444 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
6445 && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed)
6446 {
6447 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
6448 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
6449 //fprintf(stderr, "F");
6450 coll->latinOneUse = TRUE;
6451 } else {
6452 coll->latinOneUse = FALSE;
6453 }
6454 if(*status == U_UNSUPPORTED_ERROR) {
6455 *status = U_ZERO_ERROR;
6456 }
6457 } else { // latin1Table exists and it doesn't need to be regenerated, just use it
6458 coll->latinOneUse = TRUE;
6459 }
6460 } else {
6461 coll->latinOneUse = FALSE;
6462 }
6463 }
6464}
6465
6466U_CAPI uint32_t U_EXPORT2
6467ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
6468 if(U_FAILURE(*status) || coll == NULL) {
6469 return 0;
6470 }
6471 if(len == -1) {
6472 len = u_strlen(varTop);
6473 }
6474 if(len == 0) {
6475 *status = U_ILLEGAL_ARGUMENT_ERROR;
6476 return 0;
6477 }
6478
6479 if(coll->delegate!=NULL) {
6480 return ((Collator*)coll->delegate)->setVariableTop(varTop, len, *status);
6481 }
6482
6483
6484 collIterate s;
6485 IInit_collIterate(coll, varTop, len, &s, status);
6486 if(U_FAILURE(*status)) {
6487 return 0;
6488 }
6489
6490 uint32_t CE = ucol_IGetNextCE(coll, &s, status);
6491
6492 /* here we check if we have consumed all characters */
6493 /* you can put in either one character or a contraction */
6494 /* you shouldn't put more... */
6495 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
6496 *status = U_CE_NOT_FOUND_ERROR;
6497 return 0;
6498 }
6499
6500 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
6501
6502 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
6503 *status = U_PRIMARY_TOO_LONG_ERROR;
6504 return 0;
6505 }
6506 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
6507 coll->variableTopValueisDefault = FALSE;
6508 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
6509 }
6510
6511 /* To avoid memory leak, free the offset buffer if necessary. */
6512 ucol_freeOffsetBuffer(&s);
6513
6514 return CE & UCOL_PRIMARYMASK;
6515}
6516
6517U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
6518 if(U_FAILURE(*status) || coll == NULL) {
6519 return 0;
6520 }
6521 if(coll->delegate!=NULL) {
6522 return ((const Collator*)coll->delegate)->getVariableTop(*status);
6523 }
6524 return coll->variableTopValue<<16;
6525}
6526
6527U_CAPI void U_EXPORT2
6528ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
6529 if(U_FAILURE(*status) || coll == NULL) {
6530 return;
6531 }
6532
6533 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
6534 coll->variableTopValueisDefault = FALSE;
6535 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
6536 }
6537}
6538/* Attribute setter API */
6539U_CAPI void U_EXPORT2
6540ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
6541 if(U_FAILURE(*status) || coll == NULL) {
6542 return;
6543 }
6544
6545 if(coll->delegate != NULL) {
6546 ((Collator*)coll->delegate)->setAttribute(attr,value,*status);
6547 return;
6548 }
6549
6550 UColAttributeValue oldFrench = coll->frenchCollation;
6551 UColAttributeValue oldCaseFirst = coll->caseFirst;
6552 switch(attr) {
6553 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
6554 if(value == UCOL_ON) {
6555 coll->numericCollation = UCOL_ON;
6556 coll->numericCollationisDefault = FALSE;
6557 } else if (value == UCOL_OFF) {
6558 coll->numericCollation = UCOL_OFF;
6559 coll->numericCollationisDefault = FALSE;
6560 } else if (value == UCOL_DEFAULT) {
6561 coll->numericCollationisDefault = TRUE;
6562 coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
6563 } else {
6564 *status = U_ILLEGAL_ARGUMENT_ERROR;
6565 }
6566 break;
6567 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
6568 if(value == UCOL_ON) {
6569 coll->hiraganaQ = UCOL_ON;
6570 coll->hiraganaQisDefault = FALSE;
6571 } else if (value == UCOL_OFF) {
6572 coll->hiraganaQ = UCOL_OFF;
6573 coll->hiraganaQisDefault = FALSE;
6574 } else if (value == UCOL_DEFAULT) {
6575 coll->hiraganaQisDefault = TRUE;
6576 coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
6577 } else {
6578 *status = U_ILLEGAL_ARGUMENT_ERROR;
6579 }
6580 break;
6581 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6582 if(value == UCOL_ON) {
6583 coll->frenchCollation = UCOL_ON;
6584 coll->frenchCollationisDefault = FALSE;
6585 } else if (value == UCOL_OFF) {
6586 coll->frenchCollation = UCOL_OFF;
6587 coll->frenchCollationisDefault = FALSE;
6588 } else if (value == UCOL_DEFAULT) {
6589 coll->frenchCollationisDefault = TRUE;
6590 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
6591 } else {
6592 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6593 }
6594 break;
6595 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6596 if(value == UCOL_SHIFTED) {
6597 coll->alternateHandling = UCOL_SHIFTED;
6598 coll->alternateHandlingisDefault = FALSE;
6599 } else if (value == UCOL_NON_IGNORABLE) {
6600 coll->alternateHandling = UCOL_NON_IGNORABLE;
6601 coll->alternateHandlingisDefault = FALSE;
6602 } else if (value == UCOL_DEFAULT) {
6603 coll->alternateHandlingisDefault = TRUE;
6604 coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
6605 } else {
6606 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6607 }
6608 break;
6609 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6610 if(value == UCOL_LOWER_FIRST) {
6611 coll->caseFirst = UCOL_LOWER_FIRST;
6612 coll->caseFirstisDefault = FALSE;
6613 } else if (value == UCOL_UPPER_FIRST) {
6614 coll->caseFirst = UCOL_UPPER_FIRST;
6615 coll->caseFirstisDefault = FALSE;
6616 } else if (value == UCOL_OFF) {
6617 coll->caseFirst = UCOL_OFF;
6618 coll->caseFirstisDefault = FALSE;
6619 } else if (value == UCOL_DEFAULT) {
6620 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
6621 coll->caseFirstisDefault = TRUE;
6622 } else {
6623 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6624 }
6625 break;
6626 case UCOL_CASE_LEVEL: /* do we have an extra case level */
6627 if(value == UCOL_ON) {
6628 coll->caseLevel = UCOL_ON;
6629 coll->caseLevelisDefault = FALSE;
6630 } else if (value == UCOL_OFF) {
6631 coll->caseLevel = UCOL_OFF;
6632 coll->caseLevelisDefault = FALSE;
6633 } else if (value == UCOL_DEFAULT) {
6634 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
6635 coll->caseLevelisDefault = TRUE;
6636 } else {
6637 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6638 }
6639 break;
6640 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6641 if(value == UCOL_ON) {
6642 coll->normalizationMode = UCOL_ON;
6643 coll->normalizationModeisDefault = FALSE;
6644 initializeFCD(status);
6645 } else if (value == UCOL_OFF) {
6646 coll->normalizationMode = UCOL_OFF;
6647 coll->normalizationModeisDefault = FALSE;
6648 } else if (value == UCOL_DEFAULT) {
6649 coll->normalizationModeisDefault = TRUE;
6650 coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
6651 if(coll->normalizationMode == UCOL_ON) {
6652 initializeFCD(status);
6653 }
6654 } else {
6655 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6656 }
6657 break;
6658 case UCOL_STRENGTH: /* attribute for strength */
6659 if (value == UCOL_DEFAULT) {
6660 coll->strengthisDefault = TRUE;
6661 coll->strength = (UColAttributeValue)coll->options->strength;
6662 } else if (value <= UCOL_IDENTICAL) {
6663 coll->strengthisDefault = FALSE;
6664 coll->strength = value;
6665 } else {
6666 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6667 }
6668 break;
6669 case UCOL_ATTRIBUTE_COUNT:
6670 default:
6671 *status = U_ILLEGAL_ARGUMENT_ERROR;
6672 break;
6673 }
6674 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
6675 coll->latinOneRegenTable = TRUE;
6676 } else {
6677 coll->latinOneRegenTable = FALSE;
6678 }
6679 ucol_updateInternalState(coll, status);
6680}
6681
6682U_CAPI UColAttributeValue U_EXPORT2
6683ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
6684 if(U_FAILURE(*status) || coll == NULL) {
6685 return UCOL_DEFAULT;
6686 }
6687
6688 if(coll->delegate != NULL) {
6689 return ((Collator*)coll->delegate)->getAttribute(attr,*status);
6690 }
6691
6692 switch(attr) {
6693 case UCOL_NUMERIC_COLLATION:
6694 return coll->numericCollation;
6695 case UCOL_HIRAGANA_QUATERNARY_MODE:
6696 return coll->hiraganaQ;
6697 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6698 return coll->frenchCollation;
6699 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6700 return coll->alternateHandling;
6701 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6702 return coll->caseFirst;
6703 case UCOL_CASE_LEVEL: /* do we have an extra case level */
6704 return coll->caseLevel;
6705 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6706 return coll->normalizationMode;
6707 case UCOL_STRENGTH: /* attribute for strength */
6708 return coll->strength;
6709 case UCOL_ATTRIBUTE_COUNT:
6710 default:
6711 *status = U_ILLEGAL_ARGUMENT_ERROR;
6712 break;
6713 }
6714 return UCOL_DEFAULT;
6715}
6716
6717U_CAPI void U_EXPORT2
6718ucol_setStrength( UCollator *coll,
6719 UCollationStrength strength)
6720{
6721 UErrorCode status = U_ZERO_ERROR;
6722 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
6723}
6724
6725U_CAPI UCollationStrength U_EXPORT2
6726ucol_getStrength(const UCollator *coll)
6727{
6728 UErrorCode status = U_ZERO_ERROR;
6729 return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
6730}
6731
6732U_DRAFT int32_t U_EXPORT2
6733ucol_getReorderCodes(const UCollator *coll,
6734 int32_t *dest,
6735 int32_t destCapacity,
6736 UErrorCode *status) {
6737 if (U_FAILURE(*status)) {
6738 return 0;
6739 }
6740
6741 if(coll->delegate!=NULL) {
6742 return ((const Collator*)coll->delegate)->getReorderCodes(dest, destCapacity, *status);
6743 }
6744
6745 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
6746 *status = U_ILLEGAL_ARGUMENT_ERROR;
6747 return 0;
6748 }
6749
6750#ifdef UCOL_DEBUG
6751 printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength);
6752 printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLength);
6753#endif
6754
6755 if (coll->reorderCodesLength > destCapacity) {
6756 *status = U_BUFFER_OVERFLOW_ERROR;
6757 return coll->reorderCodesLength;
6758 }
6759 for (int32_t i = 0; i < coll->reorderCodesLength; i++) {
6760 dest[i] = coll->reorderCodes[i];
6761 }
6762 return coll->reorderCodesLength;
6763}
6764
6765U_DRAFT void U_EXPORT2
6766ucol_setReorderCodes(UCollator* coll,
6767 const int32_t* reorderCodes,
6768 int32_t reorderCodesLength,
6769 UErrorCode *status) {
6770 if (U_FAILURE(*status)) {
6771 return;
6772 }
6773
6774 if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) {
6775 *status = U_ILLEGAL_ARGUMENT_ERROR;
6776 return;
6777 }
6778
6779 if(coll->delegate!=NULL) {
6780 ((Collator*)coll->delegate)->setReorderCodes(reorderCodes, reorderCodesLength, *status);
6781 return;
6782 }
6783
6784 if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
6785 uprv_free(coll->reorderCodes);
6786 }
6787 coll->reorderCodes = NULL;
6788 coll->reorderCodesLength = 0;
6789 if (reorderCodesLength == 0) {
6790 if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
6791 uprv_free(coll->leadBytePermutationTable);
6792 }
6793 coll->leadBytePermutationTable = NULL;
6794 return;
6795 }
6796 coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t));
6797 if (coll->reorderCodes == NULL) {
6798 *status = U_MEMORY_ALLOCATION_ERROR;
6799 return;
6800 }
6801 coll->freeReorderCodesOnClose = TRUE;
6802 for (int32_t i = 0; i < reorderCodesLength; i++) {
6803 coll->reorderCodes[i] = reorderCodes[i];
6804 }
6805 coll->reorderCodesLength = reorderCodesLength;
6806 ucol_buildPermutationTable(coll, status);
6807}
6808
6809U_DRAFT int32_t U_EXPORT2
6810ucol_getEquivalentReorderCodes(int32_t reorderCode,
6811 int32_t* dest,
6812 int32_t destCapacity,
6813 UErrorCode *pErrorCode) {
6814 bool equivalentCodesSet[USCRIPT_CODE_LIMIT];
6815 uint16_t leadBytes[256];
6816 int leadBytesCount;
6817 int leadByteIndex;
6818 int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT];
6819 int reorderCodesForLeadByteCount;
6820 int reorderCodeIndex;
6821
6822 int32_t equivalentCodesCount = 0;
6823 int setIndex;
6824
6825 if (U_FAILURE(*pErrorCode)) {
6826 return 0;
6827 }
6828
6829 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
6830 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
6831 return 0;
6832 }
6833
6834 uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool));
6835
6836 const UCollator* uca = ucol_initUCA(pErrorCode);
6837 if (U_FAILURE(*pErrorCode)) {
6838 return 0;
6839 }
6840 leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes, 256);
6841 for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) {
6842 reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte(
6843 uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE_LIMIT);
6844 for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCount; reorderCodeIndex++) {
6845 equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true;
6846 }
6847 }
6848
6849 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
6850 if (equivalentCodesSet[setIndex] == true) {
6851 equivalentCodesCount++;
6852 }
6853 }
6854
6855 if (destCapacity == 0) {
6856 return equivalentCodesCount;
6857 }
6858
6859 equivalentCodesCount = 0;
6860 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
6861 if (equivalentCodesSet[setIndex] == true) {
6862 dest[equivalentCodesCount++] = setIndex;
6863 if (equivalentCodesCount >= destCapacity) {
6864 break;
6865 }
6866 }
6867 }
6868 return equivalentCodesCount;
6869}
6870
6871
6872/****************************************************************************/
6873/* Following are misc functions */
6874/* there are new APIs and some compatibility APIs */
6875/****************************************************************************/
6876
6877U_CAPI void U_EXPORT2
6878ucol_getVersion(const UCollator* coll,
6879 UVersionInfo versionInfo)
6880{
6881 if(coll->delegate!=NULL) {
6882 ((const Collator*)coll->delegate)->getVersion(versionInfo);
6883 return;
6884 }
6885 /* RunTime version */
6886 uint8_t rtVersion = UCOL_RUNTIME_VERSION;
6887 /* Builder version*/
6888 uint8_t bdVersion = coll->image->version[0];
6889
6890 /* Charset Version. Need to get the version from cnv files
6891 * makeconv should populate cnv files with version and
6892 * an api has to be provided in ucnv.h to obtain this version
6893 */
6894 uint8_t csVersion = 0;
6895
6896 /* combine the version info */
6897 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
6898
6899 /* Tailoring rules */
6900 versionInfo[0] = (uint8_t)(cmbVersion>>8);
6901 versionInfo[1] = (uint8_t)cmbVersion;
6902 versionInfo[2] = coll->image->version[1];
6903 if(coll->UCA) {
6904 /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */
6905 versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07);
6906 } else {
6907 versionInfo[3] = 0;
6908 }
6909}
6910
6911
6912/* This internal API checks whether a character is tailored or not */
6913U_CAPI UBool U_EXPORT2
6914ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
6915 if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
6916 return FALSE;
6917 }
6918
6919 uint32_t CE = UCOL_NOT_FOUND;
6920 const UChar *ContractionStart = NULL;
6921 if(u < 0x100) { /* latin-1 */
6922 CE = coll->latinOneMapping[u];
6923 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
6924 return FALSE;
6925 }
6926 } else { /* regular */
6927 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
6928 }
6929
6930 if(isContraction(CE)) {
6931 ContractionStart = (UChar *)coll->image+getContractOffset(CE);
6932 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
6933 }
6934
6935 return (UBool)(CE != UCOL_NOT_FOUND);
6936}
6937
6938
6939/****************************************************************************/
6940/* Following are the string compare functions */
6941/* */
6942/****************************************************************************/
6943
6944
6945/* ucol_checkIdent internal function. Does byte level string compare. */
6946/* Used by strcoll if strength == identical and strings */
6947/* are otherwise equal. */
6948/* */
6949/* Comparison must be done on NFD normalized strings. */
6950/* FCD is not good enough. */
6951
6952static
6953UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
6954{
6955 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
6956 // of same type, but that doesn't really mean that it will stay that way.
6957 int32_t comparison;
6958
6959 if (sColl->flags & UCOL_USE_ITERATOR) {
6960 // The division for the array length may truncate the array size to
6961 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
6962 // for all platforms anyway.
6963 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6964 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6965 UNormIterator *sNIt = NULL, *tNIt = NULL;
6966 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
6967 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
6968 sColl->iterator->move(sColl->iterator, 0, UITER_START);
6969 tColl->iterator->move(tColl->iterator, 0, UITER_START);
6970 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
6971 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
6972 comparison = u_strCompareIter(sIt, tIt, TRUE);
6973 unorm_closeIter(sNIt);
6974 unorm_closeIter(tNIt);
6975 } else {
6976 int32_t sLen = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1;
6977 const UChar *sBuf = sColl->string;
6978 int32_t tLen = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1;
6979 const UChar *tBuf = tColl->string;
6980
6981 if (normalize) {
6982 *status = U_ZERO_ERROR;
6983 // Note: We could use Normalizer::compare() or similar, but for short strings
6984 // which may not be in FCD it might be faster to just NFD them.
6985 // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than
6986 // NFD'ing immediately might be faster for long strings,
6987 // but string comparison is usually done on relatively short strings.
6988 sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen),
6989 sColl->writableBuffer,
6990 *status);
6991 tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen),
6992 tColl->writableBuffer,
6993 *status);
6994 if(U_FAILURE(*status)) {
6995 return UCOL_LESS;
6996 }
6997 comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer);
6998 } else {
6999 comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);
7000 }
7001 }
7002
7003 if (comparison < 0) {
7004 return UCOL_LESS;
7005 } else if (comparison == 0) {
7006 return UCOL_EQUAL;
7007 } else /* comparison > 0 */ {
7008 return UCOL_GREATER;
7009 }
7010}
7011
/* CEBuf - a growable buffer of CEs used within ucol_strcoll, together  */
/*         with some inline helper functions that operate on it.        */

#define UCOL_CEBUF_SIZE 512

struct ucol_CEBuf {
    uint32_t    *buf;                           /* start of the storage in use            */
    uint32_t    *endp;                          /* one past the last slot of the storage  */
    uint32_t    *pos;                           /* next slot to be written                */
    uint32_t     localArray[UCOL_CEBUF_SIZE];   /* built-in storage                       */
};
typedef struct ucol_CEBuf ucol_CEBuf;
7022
7023
7024static
7025inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
7026 (b)->buf = (b)->pos = (b)->localArray;
7027 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
7028}
7029
7030static
7031void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
7032 uint32_t oldSize;
7033 uint32_t newSize;
7034 uint32_t *newBuf;
7035
7036 ci->flags |= UCOL_ITER_ALLOCATED;
7037 oldSize = (uint32_t)(b->pos - b->buf);
7038 newSize = oldSize * 2;
7039 newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
7040 if(newBuf == NULL) {
7041 *status = U_MEMORY_ALLOCATION_ERROR;
7042 }
7043 else {
7044 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7045 if (b->buf != b->localArray) {
7046 uprv_free(b->buf);
7047 }
7048 b->buf = newBuf;
7049 b->endp = b->buf + newSize;
7050 b->pos = b->buf + oldSize;
7051 }
7052}
7053
7054static
7055inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
7056 if (b->pos == b->endp) {
7057 ucol_CEBuf_Expand(b, ci, status);
7058 }
7059 if (U_SUCCESS(*status)) {
7060 *(b)->pos++ = ce;
7061 }
7062}
7063
7064/* This is a trick string compare function that goes in and uses sortkeys to compare */
7065/* It is used when compare gets in trouble and needs to bail out */
7066static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7067 collIterate *tColl,
7068 UErrorCode *status)
7069{
7070 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7071 uint8_t *sourceKeyP = sourceKey;
7072 uint8_t *targetKeyP = targetKey;
7073 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7074 const UCollator *coll = sColl->coll;
7075 const UChar *source = NULL;
7076 const UChar *target = NULL;
7077 int32_t result = UCOL_EQUAL;
7078 UnicodeString sourceString, targetString;
7079 int32_t sourceLength;
7080 int32_t targetLength;
7081
7082 if(sColl->flags & UCOL_USE_ITERATOR) {
7083 sColl->iterator->move(sColl->iterator, 0, UITER_START);
7084 tColl->iterator->move(tColl->iterator, 0, UITER_START);
7085 UChar32 c;
7086 while((c=sColl->iterator->next(sColl->iterator))>=0) {
7087 sourceString.append((UChar)c);
7088 }
7089 while((c=tColl->iterator->next(tColl->iterator))>=0) {
7090 targetString.append((UChar)c);
7091 }
7092 source = sourceString.getBuffer();
7093 sourceLength = sourceString.length();
7094 target = targetString.getBuffer();
7095 targetLength = targetString.length();
7096 } else { // no iterators
7097 sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1;
7098 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1;
7099 source = sColl->string;
7100 target = tColl->string;
7101 }
7102
7103
7104
7105 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7106 if(sourceKeyLen > UCOL_MAX_BUFFER) {
7107 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7108 if(sourceKeyP == NULL) {
7109 *status = U_MEMORY_ALLOCATION_ERROR;
7110 goto cleanup_and_do_compare;
7111 }
7112 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7113 }
7114
7115 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7116 if(targetKeyLen > UCOL_MAX_BUFFER) {
7117 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7118 if(targetKeyP == NULL) {
7119 *status = U_MEMORY_ALLOCATION_ERROR;
7120 goto cleanup_and_do_compare;
7121 }
7122 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7123 }
7124
7125 result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7126
7127cleanup_and_do_compare:
7128 if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
7129 uprv_free(sourceKeyP);
7130 }
7131
7132 if(targetKeyP != NULL && targetKeyP != targetKey) {
7133 uprv_free(targetKeyP);
7134 }
7135
7136 if(result<0) {
7137 return UCOL_LESS;
7138 } else if(result>0) {
7139 return UCOL_GREATER;
7140 } else {
7141 return UCOL_EQUAL;
7142 }
7143}
7144
7145
7146static UCollationResult
7147ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
7148{
7149 U_ALIGN_CODE(16);
7150
7151 const UCollator *coll = sColl->coll;
7152
7153
7154 // setting up the collator parameters
7155 UColAttributeValue strength = coll->strength;
7156 UBool initialCheckSecTer = (strength >= UCOL_SECONDARY);
7157
7158 UBool checkSecTer = initialCheckSecTer;
7159 UBool checkTertiary = (strength >= UCOL_TERTIARY);
7160 UBool checkQuad = (strength >= UCOL_QUATERNARY);
7161 UBool checkIdent = (strength == UCOL_IDENTICAL);
7162 UBool checkCase = (coll->caseLevel == UCOL_ON);
7163 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7164 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7165 UBool qShifted = shifted && checkQuad;
7166 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7167
7168 if(doHiragana && shifted) {
7169 return (ucol_compareUsingSortKeys(sColl, tColl, status));
7170 }
7171 uint8_t caseSwitch = coll->caseSwitch;
7172 uint8_t tertiaryMask = coll->tertiaryMask;
7173
7174 // This is the lowest primary value that will not be ignored if shifted
7175 uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7176
7177 UCollationResult result = UCOL_EQUAL;
7178 UCollationResult hirResult = UCOL_EQUAL;
7179
7180 // Preparing the CE buffers. They will be filled during the primary phase
7181 ucol_CEBuf sCEs;
7182 ucol_CEBuf tCEs;
7183 UCOL_INIT_CEBUF(&sCEs);
7184 UCOL_INIT_CEBUF(&tCEs);
7185
7186 uint32_t secS = 0, secT = 0;
7187 uint32_t sOrder=0, tOrder=0;
7188
7189 // Non shifted primary processing is quite simple
7190 if(!shifted) {
7191 for(;;) {
7192
7193 // We fetch CEs until we hit a non ignorable primary or end.
7194 do {
7195 // We get the next CE
7196 sOrder = ucol_IGetNextCE(coll, sColl, status);
7197 // Stuff it in the buffer
7198 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7199 // And keep just the primary part.
7200 sOrder &= UCOL_PRIMARYMASK;
7201 } while(sOrder == 0);
7202
7203 // see the comments on the above block
7204 do {
7205 tOrder = ucol_IGetNextCE(coll, tColl, status);
7206 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7207 tOrder &= UCOL_PRIMARYMASK;
7208 } while(tOrder == 0);
7209
7210 // if both primaries are the same
7211 if(sOrder == tOrder) {
7212 // and there are no more CEs, we advance to the next level
7213 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7214 break;
7215 }
7216 if(doHiragana && hirResult == UCOL_EQUAL) {
7217 if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
7218 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
7219 ? UCOL_LESS:UCOL_GREATER;
7220 }
7221 }
7222 } else {
7223 // only need to check one for continuation
7224 // if one is then the other must be or the preceding CE would be a prefix of the other
7225 if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) {
7226 sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
7227 tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
7228 }
7229 // if two primaries are different, we are done
7230 result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER;
7231 goto commonReturn;
7232 }
7233 } // no primary difference... do the rest from the buffers
7234 } else { // shifted - do a slightly more complicated processing :)
7235 for(;;) {
7236 UBool sInShifted = FALSE;
7237 UBool tInShifted = FALSE;
7238 // This version of code can be refactored. However, it seems easier to understand this way.
7239 // Source loop. Sam as the target loop.
7240 for(;;) {
7241 sOrder = ucol_IGetNextCE(coll, sColl, status);
7242 if(sOrder == UCOL_NO_MORE_CES) {
7243 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7244 break;
7245 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
7246 /* UCA amendment - ignore ignorables that follow shifted code points */
7247 continue;
7248 } else if(isContinuation(sOrder)) {
7249 if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7250 if(sInShifted) {
7251 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7252 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7253 continue;
7254 } else {
7255 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7256 break;
7257 }
7258 } else { /* Just lower level values */
7259 if(sInShifted) {
7260 continue;
7261 } else {
7262 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7263 continue;
7264 }
7265 }
7266 } else { /* regular */
7267 if(coll->leadBytePermutationTable != NULL){
7268 sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
7269 }
7270 if((sOrder & UCOL_PRIMARYMASK) > LVT) {
7271 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7272 break;
7273 } else {
7274 if((sOrder & UCOL_PRIMARYMASK) > 0) {
7275 sInShifted = TRUE;
7276 sOrder &= UCOL_PRIMARYMASK;
7277 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7278 continue;
7279 } else {
7280 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7281 sInShifted = FALSE;
7282 continue;
7283 }
7284 }
7285 }
7286 }
7287 sOrder &= UCOL_PRIMARYMASK;
7288 sInShifted = FALSE;
7289
7290 for(;;) {
7291 tOrder = ucol_IGetNextCE(coll, tColl, status);
7292 if(tOrder == UCOL_NO_MORE_CES) {
7293 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7294 break;
7295 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
7296 /* UCA amendment - ignore ignorables that follow shifted code points */
7297 continue;
7298 } else if(isContinuation(tOrder)) {
7299 if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7300 if(tInShifted) {
7301 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7302 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7303 continue;
7304 } else {
7305 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7306 break;
7307 }
7308 } else { /* Just lower level values */
7309 if(tInShifted) {
7310 continue;
7311 } else {
7312 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7313 continue;
7314 }
7315 }
7316 } else { /* regular */
7317 if(coll->leadBytePermutationTable != NULL){
7318 tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
7319 }
7320 if((tOrder & UCOL_PRIMARYMASK) > LVT) {
7321 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7322 break;
7323 } else {
7324 if((tOrder & UCOL_PRIMARYMASK) > 0) {
7325 tInShifted = TRUE;
7326 tOrder &= UCOL_PRIMARYMASK;
7327 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7328 continue;
7329 } else {
7330 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7331 tInShifted = FALSE;
7332 continue;
7333 }
7334 }
7335 }
7336 }
7337 tOrder &= UCOL_PRIMARYMASK;
7338 tInShifted = FALSE;
7339
7340 if(sOrder == tOrder) {
7341 /*
7342 if(doHiragana && hirResult == UCOL_EQUAL) {
7343 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7344 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7345 ? UCOL_LESS:UCOL_GREATER;
7346 }
7347 }
7348 */
7349 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7350 break;
7351 } else {
7352 sOrder = 0;
7353 tOrder = 0;
7354 continue;
7355 }
7356 } else {
7357 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
7358 goto commonReturn;
7359 }
7360 } /* no primary difference... do the rest from the buffers */
7361 }
7362
7363 /* now, we're gonna reexamine collected CEs */
7364 uint32_t *sCE;
7365 uint32_t *tCE;
7366
7367 /* This is the secondary level of comparison */
7368 if(checkSecTer) {
7369 if(!isFrenchSec) { /* normal */
7370 sCE = sCEs.buf;
7371 tCE = tCEs.buf;
7372 for(;;) {
7373 while (secS == 0) {
7374 secS = *(sCE++) & UCOL_SECONDARYMASK;
7375 }
7376
7377 while(secT == 0) {
7378 secT = *(tCE++) & UCOL_SECONDARYMASK;
7379 }
7380
7381 if(secS == secT) {
7382 if(secS == UCOL_NO_MORE_CES_SECONDARY) {
7383 break;
7384 } else {
7385 secS = 0; secT = 0;
7386 continue;
7387 }
7388 } else {
7389 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7390 goto commonReturn;
7391 }
7392 }
7393 } else { /* do the French */
7394 uint32_t *sCESave = NULL;
7395 uint32_t *tCESave = NULL;
7396 sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
7397 tCE = tCEs.pos-2;
7398 for(;;) {
7399 while (secS == 0 && sCE >= sCEs.buf) {
7400 if(sCESave == NULL) {
7401 secS = *(sCE--);
7402 if(isContinuation(secS)) {
7403 while(isContinuation(secS = *(sCE--)))
7404 ;
7405 /* after this, secS has the start of continuation, and sCEs points before that */
7406 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
7407 sCE+=2; /* need to point to the first continuation CP */
7408 /* However, now you can just continue doing stuff */
7409 }
7410 } else {
7411 secS = *(sCE++);
7412 if(!isContinuation(secS)) { /* This means we have finished with this cont */
7413 sCE = sCESave; /* reset the pointer to before continuation */
7414 sCESave = NULL;
7415 secS = 0; /* Fetch a fresh CE before the continuation sequence. */
7416 continue;
7417 }
7418 }
7419 secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7420 }
7421
7422 while(secT == 0 && tCE >= tCEs.buf) {
7423 if(tCESave == NULL) {
7424 secT = *(tCE--);
7425 if(isContinuation(secT)) {
7426 while(isContinuation(secT = *(tCE--)))
7427 ;
7428 /* after this, secS has the start of continuation, and sCEs points before that */
7429 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
7430 tCE+=2; /* need to point to the first continuation CP */
7431 /* However, now you can just continue doing stuff */
7432 }
7433 } else {
7434 secT = *(tCE++);
7435 if(!isContinuation(secT)) { /* This means we have finished with this cont */
7436 tCE = tCESave; /* reset the pointer to before continuation */
7437 tCESave = NULL;
7438 secT = 0; /* Fetch a fresh CE before the continuation sequence. */
7439 continue;
7440 }
7441 }
7442 secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7443 }
7444
7445 if(secS == secT) {
7446 if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
7447 break;
7448 } else {
7449 secS = 0; secT = 0;
7450 continue;
7451 }
7452 } else {
7453 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7454 goto commonReturn;
7455 }
7456 }
7457 }
7458 }
7459
7460 /* doing the case bit */
7461 if(checkCase) {
7462 sCE = sCEs.buf;
7463 tCE = tCEs.buf;
7464 for(;;) {
7465 while((secS & UCOL_REMOVE_CASE) == 0) {
7466 if(!isContinuation(*sCE++)) {
7467 secS =*(sCE-1);
7468 if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7469 // primary ignorables should not be considered on the case level when the strength is primary
7470 // otherwise, the CEs stop being well-formed
7471 secS &= UCOL_TERT_CASE_MASK;
7472 secS ^= caseSwitch;
7473 } else {
7474 secS = 0;
7475 }
7476 } else {
7477 secS = 0;
7478 }
7479 }
7480
7481 while((secT & UCOL_REMOVE_CASE) == 0) {
7482 if(!isContinuation(*tCE++)) {
7483 secT = *(tCE-1);
7484 if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7485 // primary ignorables should not be considered on the case level when the strength is primary
7486 // otherwise, the CEs stop being well-formed
7487 secT &= UCOL_TERT_CASE_MASK;
7488 secT ^= caseSwitch;
7489 } else {
7490 secT = 0;
7491 }
7492 } else {
7493 secT = 0;
7494 }
7495 }
7496
7497 if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
7498 result = UCOL_LESS;
7499 goto commonReturn;
7500 } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
7501 result = UCOL_GREATER;
7502 goto commonReturn;
7503 }
7504
7505 if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
7506 break;
7507 } else {
7508 secS = 0;
7509 secT = 0;
7510 }
7511 }
7512 }
7513
7514 /* Tertiary level */
7515 if(checkTertiary) {
7516 secS = 0;
7517 secT = 0;
7518 sCE = sCEs.buf;
7519 tCE = tCEs.buf;
7520 for(;;) {
7521 while((secS & UCOL_REMOVE_CASE) == 0) {
7522 secS = *(sCE++) & tertiaryMask;
7523 if(!isContinuation(secS)) {
7524 secS ^= caseSwitch;
7525 } else {
7526 secS &= UCOL_REMOVE_CASE;
7527 }
7528 }
7529
7530 while((secT & UCOL_REMOVE_CASE) == 0) {
7531 secT = *(tCE++) & tertiaryMask;
7532 if(!isContinuation(secT)) {
7533 secT ^= caseSwitch;
7534 } else {
7535 secT &= UCOL_REMOVE_CASE;
7536 }
7537 }
7538
7539 if(secS == secT) {
7540 if((secS & UCOL_REMOVE_CASE) == 1) {
7541 break;
7542 } else {
7543 secS = 0; secT = 0;
7544 continue;
7545 }
7546 } else {
7547 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7548 goto commonReturn;
7549 }
7550 }
7551 }
7552
7553
7554 if(qShifted /*checkQuad*/) {
7555 UBool sInShifted = TRUE;
7556 UBool tInShifted = TRUE;
7557 secS = 0;
7558 secT = 0;
7559 sCE = sCEs.buf;
7560 tCE = tCEs.buf;
7561 for(;;) {
7562 while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) {
7563 secS = *(sCE++);
7564 if(isContinuation(secS)) {
7565 if(!sInShifted) {
7566 continue;
7567 }
7568 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
7569 secS = UCOL_PRIMARYMASK;
7570 sInShifted = FALSE;
7571 } else {
7572 sInShifted = TRUE;
7573 }
7574 }
7575 secS &= UCOL_PRIMARYMASK;
7576
7577
7578 while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) {
7579 secT = *(tCE++);
7580 if(isContinuation(secT)) {
7581 if(!tInShifted) {
7582 continue;
7583 }
7584 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
7585 secT = UCOL_PRIMARYMASK;
7586 tInShifted = FALSE;
7587 } else {
7588 tInShifted = TRUE;
7589 }
7590 }
7591 secT &= UCOL_PRIMARYMASK;
7592
7593 if(secS == secT) {
7594 if(secS == UCOL_NO_MORE_CES_PRIMARY) {
7595 break;
7596 } else {
7597 secS = 0; secT = 0;
7598 continue;
7599 }
7600 } else {
7601 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7602 goto commonReturn;
7603 }
7604 }
7605 } else if(doHiragana && hirResult != UCOL_EQUAL) {
7606 // If we're fine on quaternaries, we might be different
7607 // on Hiragana. This, however, might fail us in shifted.
7608 result = hirResult;
7609 goto commonReturn;
7610 }
7611
7612 /* For IDENTICAL comparisons, we use a bitwise character comparison */
7613 /* as a tiebreaker if all else is equal. */
7614 /* Getting here should be quite rare - strings are not identical - */
7615 /* that is checked first, but compared == through all other checks. */
7616 if(checkIdent)
7617 {
7618 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
7619 result = ucol_checkIdent(sColl, tColl, TRUE, status);
7620 }
7621
7622commonReturn:
7623 if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
7624 if (sCEs.buf != sCEs.localArray ) {
7625 uprv_free(sCEs.buf);
7626 }
7627 if (tCEs.buf != tCEs.localArray ) {
7628 uprv_free(tCEs.buf);
7629 }
7630 }
7631
7632 return result;
7633}
7634
7635static UCollationResult
7636ucol_strcollRegular(const UCollator *coll,
7637 const UChar *source, int32_t sourceLength,
7638 const UChar *target, int32_t targetLength,
7639 UErrorCode *status) {
7640 collIterate sColl, tColl;
7641 // Preparing the context objects for iterating over strings
7642 IInit_collIterate(coll, source, sourceLength, &sColl, status);
7643 IInit_collIterate(coll, target, targetLength, &tColl, status);
7644 if(U_FAILURE(*status)) {
7645 return UCOL_LESS;
7646 }
7647 return ucol_strcollRegular(&sColl, &tColl, status);
7648}
7649
7650static inline uint32_t
7651ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
7652 uint32_t CE, const UChar *s, int32_t *index, int32_t len)
7653{
7654 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
7655 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
7656 int32_t offset = 1;
7657 UChar schar = 0, tchar = 0;
7658
7659 for(;;) {
7660 if(len == -1) {
7661 if(s[*index] == 0) { // end of string
7662 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7663 } else {
7664 schar = s[*index];
7665 }
7666 } else {
7667 if(*index == len) {
7668 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7669 } else {
7670 schar = s[*index];
7671 }
7672 }
7673
7674 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
7675 offset++;
7676 }
7677
7678 if (schar == tchar) {
7679 (*index)++;
7680 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
7681 }
7682 else
7683 {
7684 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
7685 return UCOL_BAIL_OUT_CE;
7686 }
7687 // skip completely ignorables
7688 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
7689 if(isZeroCE == 0) { // we have to ignore completely ignorables
7690 (*index)++;
7691 continue;
7692 }
7693
7694 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7695 }
7696 }
7697}
7698
7699
7700/**
7701 * This is a fast strcoll, geared towards text in Latin-1.
7702 * It supports contractions of size two, French secondaries
7703 * and case switching. You can use it with strengths primary
7704 * to tertiary. It does not support shifted and case level.
7705 * It relies on the table built by setupLatin1Table. If it
7706 * doesn't understand something, it will go to the regular
7707 * strcoll.
 *
 * The comparison runs level by level (primary, then secondary, then
 * tertiary, as far as coll->strength asks for). Each level reads its CEs
 * from its own slice of coll->latinOneCEs; the slices are
 * coll->latinOneTableLen entries apart.
 *
 * @param coll    collator supplying strength, frenchCollation and the Latin-1 tables
 * @param source  source string
 * @param sLen    length of source, or -1 if source is NUL-terminated
 * @param target  target string
 * @param tLen    length of target, or -1 if target is NUL-terminated
 * @param status  error code, only passed on to ucol_strcollRegular
 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER; when the fast path gives up,
 *         the result of ucol_strcollRegular
7708 */
7709static UCollationResult
7710ucol_strcollUseLatin1( const UCollator *coll,
7711 const UChar *source,
7712 int32_t sLen,
7713 const UChar *target,
7714 int32_t tLen,
7715 UErrorCode *status)
7716{
7717 U_ALIGN_CODE(16);
7718 int32_t strength = coll->strength;
7719
7720 int32_t sIndex = 0, tIndex = 0;
7721 UChar sChar = 0, tChar = 0;
 // Current (partially consumed) CE of each string; 0 means "fetch the next one".
7722 uint32_t sOrder=0, tOrder=0;
7723
7724 UBool endOfSource = FALSE;
7725
 // CE table for the level being compared; advanced by latinOneTableLen for each further level.
7726 uint32_t *elements = coll->latinOneCEs;
7727
7728 UBool haveContractions = FALSE; // if we have contractions in our string
7729 // we cannot do French secondary
7730
7731 // Do the primary level
7732 for(;;) {
7733 while(sOrder==0) { // this loop skips primary ignorables
7734 // sOrder=getNextlatinOneCE(source);
7735 if(sLen==-1) { // handling zero terminated strings
7736 sChar=source[sIndex++];
7737 if(sChar==0) {
7738 endOfSource = TRUE;
7739 break;
7740 }
7741 } else { // handling strings with known length
7742 if(sIndex==sLen) {
7743 endOfSource = TRUE;
7744 break;
7745 }
7746 sChar=source[sIndex++];
7747 }
7748 if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7749 //fprintf(stderr, "R");
7750 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7751 }
7752 sOrder = elements[sChar];
7753 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
7754 // specials can basically be either contractions or bail-out signs. If we get anything
7755 // else, we'll bail out anyway
7756 if(getCETag(sOrder) == CONTRACTION_TAG) {
7757 sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
7758 haveContractions = TRUE; // if there are contractions, we cannot do French secondary
7759 // However, if there are contractions in the table, but we always use just one char,
7760 // we might be able to do French. This should be checked out.
7761 }
7762 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
7763 //fprintf(stderr, "S");
7764 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7765 }
7766 }
7767 }
7768
7769 while(tOrder==0) { // this loop skips primary ignorables
7770 // tOrder=getNextlatinOneCE(target);
7771 if(tLen==-1) { // handling zero terminated strings
7772 tChar=target[tIndex++];
7773 if(tChar==0) {
7774 if(endOfSource) { // this is different than source loop,
7775 // as we already know that source loop is done here,
7776 // so we can either finish the primary loop if both
7777 // strings are done or announce the result if only
7778 // target is done. Same below.
7779 goto endOfPrimLoop;
7780 } else {
7781 return UCOL_GREATER;
7782 }
7783 }
7784 } else { // handling strings with known length
7785 if(tIndex==tLen) {
7786 if(endOfSource) {
7787 goto endOfPrimLoop;
7788 } else {
7789 return UCOL_GREATER;
7790 }
7791 }
7792 tChar=target[tIndex++];
7793 }
7794 if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7795 //fprintf(stderr, "R");
7796 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7797 }
7798 tOrder = elements[tChar];
7799 if(tOrder >= UCOL_NOT_FOUND) {
7800 // Handling specials, see the comments for source
7801 if(getCETag(tOrder) == CONTRACTION_TAG) {
7802 tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
7803 haveContractions = TRUE;
7804 }
7805 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
7806 //fprintf(stderr, "S");
7807 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7808 }
7809 }
7810 }
7811 if(endOfSource) { // source is finished, but target is not, say the result.
7812 return UCOL_LESS;
7813 }
7814
7815 if(sOrder == tOrder) { // if we have same CEs, we continue the loop
7816 sOrder = 0; tOrder = 0;
7817 continue;
7818 } else {
7819 // compare current top bytes
7820 if(((sOrder^tOrder)&0xFF000000)!=0) {
7821 // top bytes differ, return difference
7822 if(sOrder < tOrder) {
7823 return UCOL_LESS;
7824 } else if(sOrder > tOrder) {
7825 return UCOL_GREATER;
7826 }
7827 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
7828 // since we must return enum value
7829 }
7830
7831 // top bytes match, continue with following bytes
 // (a value that becomes 0 after the shift makes its loop above fetch the next CE)
7832 sOrder<<=8;
7833 tOrder<<=8;
7834 }
7835 }
7836
7837endOfPrimLoop:
7838 // after primary loop, we definitely know the sizes of strings,
7839 // so we set it and use simpler loop for secondaries and tertiaries
 // NOTE(review): for NUL-terminated input the index was already incremented when the
 // terminator was read above, so these lengths appear to include the terminator; the
 // later loops then rely on the table entry for U+0000 being 0 - confirm.
7840 sLen = sIndex; tLen = tIndex;
7841 if(strength >= UCOL_SECONDARY) {
7842 // adjust the table beginning
7843 elements += coll->latinOneTableLen;
7844 endOfSource = FALSE;
7845
7846 if(coll->frenchCollation == UCOL_OFF) { // non French
7847 // This loop is a simplified copy of primary loop
7848 // at this point we know that whole strings are latin-1, so we don't
7849 // check for that. We also know that we only have contractions as
7850 // specials.
7851 sIndex = 0; tIndex = 0;
7852 for(;;) {
7853 while(sOrder==0) {
7854 if(sIndex==sLen) {
7855 endOfSource = TRUE;
7856 break;
7857 }
7858 sChar=source[sIndex++];
7859 sOrder = elements[sChar];
7860 if(sOrder > UCOL_NOT_FOUND) {
7861 sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
7862 }
7863 }
7864
7865 while(tOrder==0) {
7866 if(tIndex==tLen) {
7867 if(endOfSource) {
7868 goto endOfSecLoop;
7869 } else {
7870 return UCOL_GREATER;
7871 }
7872 }
7873 tChar=target[tIndex++];
7874 tOrder = elements[tChar];
7875 if(tOrder > UCOL_NOT_FOUND) {
7876 tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
7877 }
7878 }
7879 if(endOfSource) {
7880 return UCOL_LESS;
7881 }
7882
7883 if(sOrder == tOrder) {
7884 sOrder = 0; tOrder = 0;
7885 continue;
7886 } else {
7887 // see primary loop for comments on this
7888 if(((sOrder^tOrder)&0xFF000000)!=0) {
7889 if(sOrder < tOrder) {
7890 return UCOL_LESS;
7891 } else if(sOrder > tOrder) {
7892 return UCOL_GREATER;
7893 }
7894 }
7895 sOrder<<=8;
7896 tOrder<<=8;
7897 }
7898 }
7899 } else { // French
7900 if(haveContractions) { // if we have contractions, we have to bail out
7901 // since we don't really know how to handle them here
7902 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7903 }
7904 // For French, we go backwards
7905 sIndex = sLen; tIndex = tLen;
7906 for(;;) {
7907 while(sOrder==0) {
7908 if(sIndex==0) {
7909 endOfSource = TRUE;
7910 break;
7911 }
7912 sChar=source[--sIndex];
7913 sOrder = elements[sChar];
7914 // don't even look for contractions
7915 }
7916
7917 while(tOrder==0) {
7918 if(tIndex==0) {
7919 if(endOfSource) {
7920 goto endOfSecLoop;
7921 } else {
7922 return UCOL_GREATER;
7923 }
7924 }
7925 tChar=target[--tIndex];
7926 tOrder = elements[tChar];
7927 // don't even look for contractions
7928 }
7929 if(endOfSource) {
7930 return UCOL_LESS;
7931 }
7932
7933 if(sOrder == tOrder) {
7934 sOrder = 0; tOrder = 0;
7935 continue;
7936 } else {
7937 // see the primary loop for comments
7938 if(((sOrder^tOrder)&0xFF000000)!=0) {
7939 if(sOrder < tOrder) {
7940 return UCOL_LESS;
7941 } else if(sOrder > tOrder) {
7942 return UCOL_GREATER;
7943 }
7944 }
7945 sOrder<<=8;
7946 tOrder<<=8;
7947 }
7948 }
7949 }
7950 }
7951
7952endOfSecLoop:
7953 if(strength >= UCOL_TERTIARY) {
7954 // tertiary loop is the same as secondary (except no French)
7955 elements += coll->latinOneTableLen;
7956 sIndex = 0; tIndex = 0;
7957 endOfSource = FALSE;
7958 for(;;) {
7959 while(sOrder==0) {
7960 if(sIndex==sLen) {
7961 endOfSource = TRUE;
7962 break;
7963 }
7964 sChar=source[sIndex++];
7965 sOrder = elements[sChar];
7966 if(sOrder > UCOL_NOT_FOUND) {
7967 sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
7968 }
7969 }
7970 while(tOrder==0) {
7971 if(tIndex==tLen) {
7972 if(endOfSource) {
7973 return UCOL_EQUAL; // if both strings are at the end, they are equal
7974 } else {
7975 return UCOL_GREATER;
7976 }
7977 }
7978 tChar=target[tIndex++];
7979 tOrder = elements[tChar];
7980 if(tOrder > UCOL_NOT_FOUND) {
7981 tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
7982 }
7983 }
7984 if(endOfSource) {
7985 return UCOL_LESS;
7986 }
7987 if(sOrder == tOrder) {
7988 sOrder = 0; tOrder = 0;
7989 continue;
7990 } else {
 // same byte-by-byte comparison as in the primary loop
7991 if(((sOrder^tOrder)&0xff000000)!=0) {
7992 if(sOrder < tOrder) {
7993 return UCOL_LESS;
7994 } else if(sOrder > tOrder) {
7995 return UCOL_GREATER;
7996 }
7997 }
7998 sOrder<<=8;
7999 tOrder<<=8;
8000 }
8001 }
8002 }
 // Reached when the requested strength is below tertiary and no level found a difference.
8003 return UCOL_EQUAL;
8004}
8005
8006
8007U_CAPI UCollationResult U_EXPORT2
8008ucol_strcollIter( const UCollator *coll,
8009 UCharIterator *sIter,
8010 UCharIterator *tIter,
8011 UErrorCode *status)
8012{
8013 if(!status || U_FAILURE(*status)) {
8014 return UCOL_EQUAL;
8015 }
8016
8017 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
8018 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
8019
8020 if (sIter == tIter) {
8021 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8022 return UCOL_EQUAL;
8023 }
8024 if(sIter == NULL || tIter == NULL || coll == NULL) {
8025 *status = U_ILLEGAL_ARGUMENT_ERROR;
8026 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8027 return UCOL_EQUAL;
8028 }
8029
8030 UCollationResult result = UCOL_EQUAL;
8031
8032 // Preparing the context objects for iterating over strings
8033 collIterate sColl, tColl;
8034 IInit_collIterate(coll, NULL, -1, &sColl, status);
8035 IInit_collIterate(coll, NULL, -1, &tColl, status);
8036 if(U_FAILURE(*status)) {
8037 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8038 return UCOL_EQUAL;
8039 }
8040 // The division for the array length may truncate the array size to
8041 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8042 // for all platforms anyway.
8043 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8044 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8045 UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8046
8047 sColl.iterator = sIter;
8048 sColl.flags |= UCOL_USE_ITERATOR;
8049 tColl.flags |= UCOL_USE_ITERATOR;
8050 tColl.iterator = tIter;
8051
8052 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8053 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8054 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8055 sColl.flags &= ~UCOL_ITER_NORM;
8056
8057 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8058 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8059 tColl.flags &= ~UCOL_ITER_NORM;
8060 }
8061
8062 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8063
8064 while((sChar = sColl.iterator->next(sColl.iterator)) ==
8065 (tChar = tColl.iterator->next(tColl.iterator))) {
8066 if(sChar == U_SENTINEL) {
8067 result = UCOL_EQUAL;
8068 goto end_compare;
8069 }
8070 }
8071
8072 if(sChar == U_SENTINEL) {
8073 tChar = tColl.iterator->previous(tColl.iterator);
8074 }
8075
8076 if(tChar == U_SENTINEL) {
8077 sChar = sColl.iterator->previous(sColl.iterator);
8078 }
8079
8080 sChar = sColl.iterator->previous(sColl.iterator);
8081 tChar = tColl.iterator->previous(tColl.iterator);
8082
8083 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8084 {
8085 // We are stopped in the middle of a contraction.
8086 // Scan backwards through the == part of the string looking for the start of the contraction.
8087 // It doesn't matter which string we scan, since they are the same in this region.
8088 do
8089 {
8090 sChar = sColl.iterator->previous(sColl.iterator);
8091 tChar = tColl.iterator->previous(tColl.iterator);
8092 }
8093 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8094 }
8095
8096
8097 if(U_SUCCESS(*status)) {
8098 result = ucol_strcollRegular(&sColl, &tColl, status);
8099 }
8100
8101end_compare:
8102 if(sNormIter || tNormIter) {
8103 unorm_closeIter(sNormIter);
8104 unorm_closeIter(tNormIter);
8105 }
8106
8107 UTRACE_EXIT_VALUE_STATUS(result, *status)
8108 return result;
8109}
8110
8111
8112/* */
8113/* ucol_strcoll Main public API string comparison function */
8114/* */
8115U_CAPI UCollationResult U_EXPORT2
8116ucol_strcoll( const UCollator *coll,
8117 const UChar *source,
8118 int32_t sourceLength,
8119 const UChar *target,
8120 int32_t targetLength)
8121{
8122 U_ALIGN_CODE(16);
8123
8124 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8125 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8126 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8127 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
8128 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
8129 }
8130
8131 if(source == NULL || target == NULL) {
8132 // do not crash, but return. Should have
8133 // status argument to return error.
8134 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8135 return UCOL_EQUAL;
8136 }
8137
8138 /* Quick check if source and target are same strings. */
8139 /* They should either both be NULL terminated or the explicit length should be set on both. */
8140 if (source==target && sourceLength==targetLength) {
8141 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8142 return UCOL_EQUAL;
8143 }
8144
8145 if(coll->delegate != NULL) {
8146 UErrorCode status = U_ZERO_ERROR;
8147 return ((const Collator*)coll->delegate)->compare(source,sourceLength,target,targetLength, status);
8148 }
8149
8150 /* Scan the strings. Find: */
8151 /* The length of any leading portion that is equal */
8152 /* Whether they are exactly equal. (in which case we just return) */
8153 const UChar *pSrc = source;
8154 const UChar *pTarg = target;
8155 int32_t equalLength;
8156
8157 if (sourceLength == -1 && targetLength == -1) {
8158 // Both strings are null terminated.
8159 // Scan through any leading equal portion.
8160 while (*pSrc == *pTarg && *pSrc != 0) {
8161 pSrc++;
8162 pTarg++;
8163 }
8164 if (*pSrc == 0 && *pTarg == 0) {
8165 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8166 return UCOL_EQUAL;
8167 }
8168 equalLength = (int32_t)(pSrc - source);
8169 }
8170 else
8171 {
8172 // One or both strings has an explicit length.
8173 const UChar *pSrcEnd = source + sourceLength;
8174 const UChar *pTargEnd = target + targetLength;
8175
8176 // Scan while the strings are bitwise ==, or until one is exhausted.
8177 for (;;) {
8178 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8179 break;
8180 }
8181 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8182 break;
8183 }
8184 if (*pSrc != *pTarg) {
8185 break;
8186 }
8187 pSrc++;
8188 pTarg++;
8189 }
8190 equalLength = (int32_t)(pSrc - source);
8191
8192 // If we made it all the way through both strings, we are done. They are ==
8193 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. */
8194 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also at end of dest string */
8195 {
8196 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8197 return UCOL_EQUAL;
8198 }
8199 }
8200 if (equalLength > 0) {
8201 /* There is an identical portion at the beginning of the two strings. */
8202 /* If the identical portion ends within a contraction or a comibining */
8203 /* character sequence, back up to the start of that sequence. */
8204
8205 // These values should already be set by the code above.
8206 //pSrc = source + equalLength; /* point to the first differing chars */
8207 //pTarg = target + equalLength;
8208 if ((pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) ||
8209 (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)))
8210 {
8211 // We are stopped in the middle of a contraction.
8212 // Scan backwards through the == part of the string looking for the start of the contraction.
8213 // It doesn't matter which string we scan, since they are the same in this region.
8214 do
8215 {
8216 equalLength--;
8217 pSrc--;
8218 }
8219 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8220 }
8221
8222 source += equalLength;
8223 target += equalLength;
8224 if (sourceLength > 0) {
8225 sourceLength -= equalLength;
8226 }
8227 if (targetLength > 0) {
8228 targetLength -= equalLength;
8229 }
8230 }
8231
8232 UErrorCode status = U_ZERO_ERROR;
8233 UCollationResult returnVal;
8234 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
8235 returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status);
8236 } else {
8237 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
8238 }
8239 UTRACE_EXIT_VALUE(returnVal);
8240 return returnVal;
8241}
8242
8243/* convenience function for comparing strings */
8244U_CAPI UBool U_EXPORT2
8245ucol_greater( const UCollator *coll,
8246 const UChar *source,
8247 int32_t sourceLength,
8248 const UChar *target,
8249 int32_t targetLength)
8250{
8251 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8252 == UCOL_GREATER);
8253}
8254
8255/* convenience function for comparing strings */
8256U_CAPI UBool U_EXPORT2
8257ucol_greaterOrEqual( const UCollator *coll,
8258 const UChar *source,
8259 int32_t sourceLength,
8260 const UChar *target,
8261 int32_t targetLength)
8262{
8263 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8264 != UCOL_LESS);
8265}
8266
8267/* convenience function for comparing strings */
8268U_CAPI UBool U_EXPORT2
8269ucol_equal( const UCollator *coll,
8270 const UChar *source,
8271 int32_t sourceLength,
8272 const UChar *target,
8273 int32_t targetLength)
8274{
8275 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8276 == UCOL_EQUAL);
8277}
8278
8279U_CAPI void U_EXPORT2
8280ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
8281 if(coll && coll->UCA) {
8282 uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
8283 }
8284}
8285
8286#endif /* #if !UCONFIG_NO_COLLATION */