2 *******************************************************************************
4 * Copyright (C) 2005-2008, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ucasemap.c
10 * tab size: 8 (not used)
13 * created on: 2005may06
14 * created by: Markus W. Scherer
16 * Case mapping service object and functions using it.
19 #include "unicode/utypes.h"
20 #include "unicode/uloc.h"
21 #include "unicode/ustring.h"
22 #include "unicode/ucasemap.h"
23 #if !UCONFIG_NO_BREAK_ITERATION
24 #include "unicode/ubrk.h"
25 #include "unicode/utext.h"
32 /* UCaseMap service object -------------------------------------------------- */
/*
 * ucasemap_open: create a UCaseMap service object.
 *
 * Visible steps: bail-out check on an incoming failure code, allocation of
 * sizeof(UCaseMap) bytes via uprv_malloc(), zero-fill via uprv_memset(),
 * storing the ucase_getSingleton() result in csm->csp, and locale setup via
 * ucasemap_setLocale(), followed by a second failure check.
 *
 * NOTE(review): this extract has lost lines (the local declaration of csm,
 * the bodies of both U_FAILURE branches, any NULL check after uprv_malloc,
 * the use of `options`, the return statements and closing braces). Confirm
 * allocation-failure handling and cleanup against the complete file.
 */
34 U_CAPI UCaseMap
* U_EXPORT2
35 ucasemap_open(const char *locale
, uint32_t options
, UErrorCode
*pErrorCode
) {
/* do nothing if the caller already has an error */
38 if(U_FAILURE(*pErrorCode
)) {
42 csm
=(UCaseMap
*)uprv_malloc(sizeof(UCaseMap
));
/* start from an all-zero object */
46 uprv_memset(csm
, 0, sizeof(UCaseMap
));
48 csm
->csp
=ucase_getSingleton(pErrorCode
);
49 ucasemap_setLocale(csm
, locale
, pErrorCode
);
/* either of the two calls above may have set *pErrorCode */
50 if(U_FAILURE(*pErrorCode
)) {
/*
 * ucasemap_close: release a UCaseMap.
 *
 * The only statement visible here closes the break iterator stored in
 * csm->iter, and only when break iteration is compiled in.
 *
 * NOTE(review): the return-type line, the matching #endif, any NULL check on
 * csm and the release of csm itself are not present in this extract.
 */
60 ucasemap_close(UCaseMap
*csm
) {
62 #if !UCONFIG_NO_BREAK_ITERATION
63 ubrk_close(csm
->iter
);
/*
 * ucasemap_getLocale: read-only accessor on a const UCaseMap, returning a
 * const char pointer.
 *
 * NOTE(review): the body is missing from this extract; presumably it returns
 * csm->locale (the buffer filled by ucasemap_setLocale) - confirm.
 */
69 U_CAPI
const char * U_EXPORT2
70 ucasemap_getLocale(const UCaseMap
*csm
) {
/*
 * ucasemap_getOptions: read-only accessor on a const UCaseMap, returning a
 * uint32_t.
 *
 * NOTE(review): the body is missing from this extract; presumably it returns
 * csm->options (the field written by ucasemap_setOptions) - confirm.
 */
74 U_CAPI
uint32_t U_EXPORT2
75 ucasemap_getOptions(const UCaseMap
*csm
) {
/*
 * ucasemap_setLocale: store a locale ID in csm->locale and refresh the
 * cached case-locale value in csm->locCache.
 *
 * Visible flow:
 *  - do nothing further if *pErrorCode already indicates failure;
 *  - write the uloc_getName() form of `locale` into the fixed-size
 *    csm->locale buffer;
 *  - if that overflowed, or the length equals the buffer size, clear the
 *    error and retry with just the language code from uloc_getLanguage();
 *  - a length equal to the buffer size is reported as
 *    U_BUFFER_OVERFLOW_ERROR;
 *  - on success, ucase_getCaseLocale() is called with &csm->locCache.
 *
 * NOTE(review): brace nesting, the declaration of `length`, the return-type
 * line and whatever happens on failure are not present in this extract.
 * The length==sizeof test is presumably there because no terminator would
 * fit - confirm.
 */
80 ucasemap_setLocale(UCaseMap
*csm
, const char *locale
, UErrorCode
*pErrorCode
) {
83 if(U_FAILURE(*pErrorCode
)) {
/* first attempt: the full locale name */
87 length
=uloc_getName(locale
, csm
->locale
, (int32_t)sizeof(csm
->locale
), pErrorCode
);
88 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
|| length
==sizeof(csm
->locale
)) {
/* discard the overflow before the second attempt */
89 *pErrorCode
=U_ZERO_ERROR
;
90 /* we only really need the language code for case mappings */
91 length
=uloc_getLanguage(locale
, csm
->locale
, (int32_t)sizeof(csm
->locale
), pErrorCode
);
/* a result that fills the whole buffer counts as an overflow */
93 if(length
==sizeof(csm
->locale
)) {
94 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
97 if(U_SUCCESS(*pErrorCode
)) {
/* refresh the cached case-locale value for the stored locale */
98 ucase_getCaseLocale(csm
->locale
, &csm
->locCache
);
/*
 * ucasemap_setOptions: overwrite csm->options with the given bit set.
 *
 * pErrorCode is not read or written by the visible statement.
 * NOTE(review): the closing brace is missing from this extract.
 */
104 U_CAPI
void U_EXPORT2
105 ucasemap_setOptions(UCaseMap
*csm
, uint32_t options
, UErrorCode
*pErrorCode
) {
106 csm
->options
=options
;
109 #if !UCONFIG_NO_BREAK_ITERATION
/*
 * ucasemap_getBreakIterator: read-only accessor on a const UCaseMap,
 * returning a const UBreakIterator pointer.
 *
 * NOTE(review): the body is missing from this extract; presumably it returns
 * csm->iter (the field written by ucasemap_setBreakIterator) - confirm.
 */
111 U_CAPI
const UBreakIterator
* U_EXPORT2
112 ucasemap_getBreakIterator(const UCaseMap
*csm
) {
/*
 * ucasemap_setBreakIterator: replace the break iterator held by the
 * UCaseMap.
 *
 * The iterator currently stored in csm->iter is closed with ubrk_close()
 * and iterToAdopt is stored in its place; as the parameter name says, the
 * UCaseMap adopts the new iterator (ucasemap_close() closes csm->iter).
 * pErrorCode is not read or written by the visible statements.
 * NOTE(review): the closing brace is missing from this extract.
 */
116 U_CAPI
void U_EXPORT2
117 ucasemap_setBreakIterator(UCaseMap
*csm
, UBreakIterator
*iterToAdopt
, UErrorCode
*pErrorCode
) {
/* dispose of the previously stored iterator before replacing it */
118 ubrk_close(csm
->iter
);
119 csm
->iter
=iterToAdopt
;
124 /* UTF-8 string case mappings ----------------------------------------------- */
126 /* TODO(markus): Move to a new, separate utf8case.c file. */
128 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
/*
 * appendResult: append one case-mapping result to the UTF-8 destination
 * buffer and return the updated destination index.
 *
 * dest/destIndex/destCapacity describe the output buffer and write position;
 * `result` is the value produced by a mapping function and `s` the UChar
 * string that accompanies a string result.
 *
 * Visible structure:
 *  - `result` is decoded; one branch handles values up to
 *    UCASE_MAX_STRING_LENGTH;
 *  - while destIndex<destCapacity the output is written: a single code
 *    point through U8_APPEND, a string through a conversion call that
 *    writes at dest+destIndex and reports its length in destLength;
 *  - otherwise nothing is written and only lengths are added (U8_LENGTH for
 *    a code point, a conversion call with a NULL/0 buffer for a string), so
 *    destIndex can end up greater than destCapacity.
 *
 * NOTE(review): the names of the conversion calls, the declarations of c
 * and isError, several branch heads and the return statement are missing
 * from this extract.
 */
129 static U_INLINE
int32_t
130 appendResult(uint8_t *dest
, int32_t destIndex
, int32_t destCapacity
,
131 int32_t result
, const UChar
*s
) {
133 int32_t length
, destLength
;
134 UErrorCode errorCode
;
136 /* decode the result */
138 /* (not) original code point */
141 } else if(result
<=UCASE_MAX_STRING_LENGTH
) {
/* there is still room: write into dest */
149 if(destIndex
<destCapacity
) {
150 /* append the result */
154 U8_APPEND(dest
, destIndex
, destCapacity
, c
, isError
);
156 /* overflow, nothing written */
157 destIndex
+=U8_LENGTH(c
);
/* local error code, reset before the string conversion */
161 errorCode
=U_ZERO_ERROR
;
163 (char *)(dest
+destIndex
), destCapacity
-destIndex
, &destLength
,
166 destIndex
+=destLength
;
167 /* we might have an overflow, but we know the actual length */
/* no room left: only count what would have been written */
172 destIndex
+=U8_LENGTH(c
);
174 errorCode
=U_ZERO_ERROR
;
176 NULL
, 0, &destLength
,
179 destIndex
+=destLength
;
/*
 * utf8_caseContextIterator: context callback handed to the full case
 * mapping functions (see the map() and ucase_toFullTitle() calls in this
 * file) so they can look at code points around the current one in UTF-8
 * text.
 *
 * `context` is a UCaseContext. `dir` selects the action: per the comments
 * below, one value restarts backward iteration at csc->cpStart, another
 * restarts forward iteration at csc->cpLimit, and otherwise the current
 * direction continues. Code points are read with U8_PREV (bounded below by
 * csc->start) or U8_NEXT (bounded above by csc->limit).
 *
 * NOTE(review): the tests on `dir`, the declaration of c, any bookkeeping
 * of the current direction and the return statements are missing from this
 * extract.
 */
185 static UChar32 U_CALLCONV
186 utf8_caseContextIterator(void *context
, int8_t dir
) {
187 UCaseContext
*csc
=(UCaseContext
*)context
;
191 /* reset for backward iteration */
192 csc
->index
=csc
->cpStart
;
195 /* reset for forward iteration */
196 csc
->index
=csc
->cpLimit
;
199 /* continue current iteration direction */
/* backward step: only while there is text before the index */
204 if(csc
->start
<csc
->index
) {
205 U8_PREV((const uint8_t *)csc
->p
, csc
->start
, csc
->index
, c
);
/* forward step: only while there is text after the index */
209 if(csc
->index
<csc
->limit
) {
210 U8_NEXT((const uint8_t *)csc
->p
, csc
->index
, csc
->limit
, c
);
218 * Case-maps [srcStart..srcLimit[ but takes
219 * context [0..srcLength[ into account.
/*
 * _caseMap: apply the full case mapping function `map` to the UTF-8 bytes
 * src[srcStart..srcLimit[ and write the result to dest.
 *
 * csc is the context object passed through to `map` together with
 * utf8_caseContextIterator; its cpStart/cpLimit fields are updated to
 * bracket the code point currently being mapped.
 *
 * Per code point: decode with U8_NEXT; in one case (condition not visible
 * here) the source bytes of that code point are copied unchanged while room
 * remains; otherwise `map` is called and its result is written either
 * through the single-byte fast path (result decodes to a value <=0x7f and
 * room remains) or through appendResult().
 *
 * destIndex may exceed destCapacity; that is reported as
 * U_BUFFER_OVERFLOW_ERROR after the loop.
 *
 * NOTE(review): the return-type line, the declarations of s, c, c2 and
 * locCache, the initialisation of srcIndex/destIndex, the condition
 * guarding the byte copy and the return statement are missing from this
 * extract. The byte copy is presumably for ill-formed input (negative c
 * from U8_NEXT) - confirm.
 */
222 _caseMap(const UCaseMap
*csm
, UCaseMapFull
*map
,
223 uint8_t *dest
, int32_t destCapacity
,
224 const uint8_t *src
, UCaseContext
*csc
,
225 int32_t srcStart
, int32_t srcLimit
,
226 UErrorCode
*pErrorCode
) {
229 int32_t srcIndex
, destIndex
;
/* local copy of the cached case-locale value; csm is const here */
232 locCache
=csm
->locCache
;
234 /* case mapping loop */
237 while(srcIndex
<srcLimit
) {
/* record the byte range of the current code point for the context iterator */
238 csc
->cpStart
=srcIndex
;
239 U8_NEXT(src
, srcIndex
, srcLimit
, c
);
240 csc
->cpLimit
=srcIndex
;
/* verbatim copy of the bytes just consumed, limited by the room left */
242 int32_t i
=csc
->cpStart
;
243 while(destIndex
<destCapacity
&& i
<srcIndex
) {
244 dest
[destIndex
++]=src
[i
++];
248 c
=map(csm
->csp
, c
, utf8_caseContextIterator
, csc
, &s
, csm
->locale
, &locCache
);
/*
 * Fast path: a negative result whose complement is <=0x7f, or a result above
 * UCASE_MAX_STRING_LENGTH that is itself <=0x7f, is stored as one byte.
 */
249 if((destIndex
<destCapacity
) && (c
<0 ? (c2
=~c
)<=0x7f : UCASE_MAX_STRING_LENGTH
<c
&& (c2
=c
)<=0x7f)) {
250 /* fast path version of appendResult() for ASCII results */
251 dest
[destIndex
++]=(uint8_t)c2
;
253 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
/* more output was counted than fits in dest */
257 if(destIndex
>destCapacity
) {
258 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
263 #if !UCONFIG_NO_BREAK_ITERATION
266 * Internal titlecasing function.
/*
 * _toTitle: titlecase UTF-8 text segment by segment.
 *
 * Visible flow:
 *  - wrap src in a UText (utext_openUTF8) and attach it to csm->iter with
 *    ubrk_setUText(); if csm->iter is NULL a UBRK_WORD iterator for
 *    csm->locale is opened and stored in csm, which is why csm is
 *    non-const here;
 *  - loop while prev<srcLength, taking the next boundary `index` from
 *    ubrk_first()/ubrk_next(); UBRK_DONE or an index beyond srcLength is
 *    special-cased;
 *  - split [prev..index[ into uncased prefix, one titlecased code point and
 *    a remainder, as described in the comment inside the loop;
 *  - unless U_TITLECASE_NO_BREAK_ADJUSTMENT is set, skip leading code
 *    points whose ucase_getType() is UCASE_NONE to find the one to
 *    titlecase;
 *  - titlecase through ucase_toFullTitle() + appendResult();
 *  - the remainder is lowercased (the call taking ucase_toFullLower) unless
 *    U_TITLECASE_NO_LOWERCASE is set, in which case it is copied unchanged;
 *  - destIndex>destCapacity is reported as U_BUFFER_OVERFLOW_ERROR.
 *
 * NOTE(review): the return-type line, the srcLength parameter, the
 * declarations of s, c and the first-segment flag, the remaining
 * ubrk_open() arguments, all else-branches for the overflow case, the body
 * of the Dutch IJ case, the utext_close() call and the return statement
 * are missing from this extract.
 */
269 _toTitle(UCaseMap
*csm
,
270 uint8_t *dest
, int32_t destCapacity
,
271 const uint8_t *src
, UCaseContext
*csc
,
273 UErrorCode
*pErrorCode
) {
274 UText utext
=UTEXT_INITIALIZER
;
277 int32_t prev
, titleStart
, titleLimit
, index
, destIndex
, length
;
/* the break iterator works on a UText view of the UTF-8 source */
280 utext_openUTF8(&utext
, (const char *)src
, srcLength
, pErrorCode
);
281 if(U_FAILURE(*pErrorCode
)) {
/* lazily create a word break iterator for the map's locale */
284 if(csm
->iter
==NULL
) {
285 csm
->iter
=ubrk_open(UBRK_WORD
, csm
->locale
,
289 ubrk_setUText(csm
->iter
, &utext
, pErrorCode
);
290 if(U_FAILURE(*pErrorCode
)) {
295 /* set up local variables */
300 /* titlecasing loop */
301 while(prev
<srcLength
) {
302 /* find next index where to titlecase */
305 index
=ubrk_first(csm
->iter
);
307 index
=ubrk_next(csm
->iter
);
/* no further boundary, or one past the end of the text */
309 if(index
==UBRK_DONE
|| index
>srcLength
) {
314 * Unicode 4 & 5 section 3.13 Default Case Operations:
316 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
317 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
318 * cased character F. If F exists, map F to default_title(F); then map each
319 * subsequent character C to default_lower(C).
321 * In this implementation, segment [prev..index[ into 3 parts:
322 * a) uncased characters (copy as-is) [prev..titleStart[
323 * b) first case letter (titlecase) [titleStart..titleLimit[
324 * c) subsequent characters (lowercase) [titleLimit..index[
327 /* find and copy uncased characters [prev..titleStart[ */
328 titleStart
=titleLimit
=prev
;
/* decode the first code point of the segment; titleLimit moves past it */
329 U8_NEXT(src
, titleLimit
, index
, c
);
330 if((csm
->options
&U_TITLECASE_NO_BREAK_ADJUSTMENT
)==0 && UCASE_NONE
==ucase_getType(csm
->csp
, c
)) {
331 /* Adjust the titlecasing index (titleStart) to the next cased character. */
333 titleStart
=titleLimit
;
334 if(titleLimit
==index
) {
336 * only uncased characters in [prev..index[
337 * stop with titleStart==titleLimit==index
341 U8_NEXT(src
, titleLimit
, index
, c
);
342 if(UCASE_NONE
!=ucase_getType(csm
->csp
, c
)) {
343 break; /* cased letter at [titleStart..titleLimit[ */
/* number of bytes in the uncased prefix */
346 length
=titleStart
-prev
;
/* copy the prefix only if it fits completely */
348 if((destIndex
+length
)<=destCapacity
) {
349 uprv_memcpy(dest
+destIndex
, src
+prev
, length
);
355 if(titleStart
<titleLimit
) {
356 /* titlecase c which is from [titleStart..titleLimit[ */
357 csc
->cpStart
=titleStart
;
358 csc
->cpLimit
=titleLimit
;
359 c
=ucase_toFullTitle(csm
->csp
, c
, utf8_caseContextIterator
, csc
, &s
, csm
->locale
, &csm
->locCache
);
360 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
363 /* Special case Dutch IJ titlecasing */
/*
 * Applies when the case locale is UCASE_LOC_DUTCH and the bytes at titleStart
 * are 'I'/'i' (0x49/0x69) followed by 'J'/'j' (0x4A/0x6A) within the segment.
 * NOTE(review): the statements between this condition and the appendResult()
 * call below are missing from this extract.
 */
364 if ( titleStart
+1 < index
&&
365 ucase_getCaseLocale(csm
->locale
,&csm
->locCache
) == UCASE_LOC_DUTCH
&&
366 ( src
[titleStart
] == 0x0049 || src
[titleStart
] == 0x0069 ) &&
367 ( src
[titleStart
+1] == 0x004A || src
[titleStart
+1] == 0x006A )) {
369 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
372 /* lowercase [titleLimit..index[ */
373 if(titleLimit
<index
) {
374 if((csm
->options
&U_TITLECASE_NO_LOWERCASE
)==0) {
375 /* Normal operation: Lowercase the rest of the word. */
378 csm
, ucase_toFullLower
,
379 dest
+destIndex
, destCapacity
-destIndex
,
384 /* Optionally just copy the rest of the word unchanged. */
385 length
=index
-titleLimit
;
/* copy the remainder only if it fits completely */
386 if((destIndex
+length
)<=destCapacity
) {
387 uprv_memcpy(dest
+destIndex
, src
+titleLimit
, length
);
/* more output was counted than fits in dest */
398 if(destIndex
>destCapacity
) {
399 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/*
 * utf8_foldCase: case-fold the UTF-8 bytes src[0..srcLength[ into dest.
 *
 * Per code point: decode with U8_NEXT; in one case (condition not visible
 * here) the source bytes of that code point are copied unchanged while room
 * remains; otherwise ucase_toFullFolding() is called with `options` and its
 * result is written either through the single-byte fast path (result
 * decodes to a value <=0x7f and room remains) or through appendResult().
 *
 * destIndex may exceed destCapacity; that is reported as
 * U_BUFFER_OVERFLOW_ERROR after the loop.
 *
 * NOTE(review): the return-type line, the `options` parameter, the
 * declarations of s, c, c2 and start, the assignment of start, the
 * condition guarding the byte copy and the return statement are missing
 * from this extract. The byte copy is presumably for ill-formed input
 * (negative c from U8_NEXT) - confirm.
 */
408 utf8_foldCase(const UCaseProps
*csp
,
409 uint8_t *dest
, int32_t destCapacity
,
410 const uint8_t *src
, int32_t srcLength
,
412 UErrorCode
*pErrorCode
) {
413 int32_t srcIndex
, destIndex
;
419 /* case mapping loop */
420 srcIndex
=destIndex
=0;
421 while(srcIndex
<srcLength
) {
423 U8_NEXT(src
, srcIndex
, srcLength
, c
);
/* verbatim copy of the bytes just consumed, limited by the room left */
425 while(destIndex
<destCapacity
&& start
<srcIndex
) {
426 dest
[destIndex
++]=src
[start
++];
430 c
=ucase_toFullFolding(csp
, c
, &s
, options
);
/*
 * Fast path: a negative result whose complement is <=0x7f, or a result above
 * UCASE_MAX_STRING_LENGTH that is itself <=0x7f, is stored as one byte.
 */
431 if((destIndex
<destCapacity
) && (c
<0 ? (c2
=~c
)<=0x7f : UCASE_MAX_STRING_LENGTH
<c
&& (c2
=c
)<=0x7f)) {
432 /* fast path version of appendResult() for ASCII results */
433 dest
[destIndex
++]=(uint8_t)c2
;
435 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
/* more output was counted than fits in dest */
439 if(destIndex
>destCapacity
) {
440 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
446 * Implement argument checking and buffer handling
447 * for string case mapping as a common function.
450 /* common internal function for public API functions */
/*
 * caseMap: shared argument checking, dispatch and output termination for
 * the UTF-8 case mapping entry points.
 *
 * Visible flow:
 *  - do nothing further if *pErrorCode already indicates failure;
 *  - reject a negative destCapacity, or a NULL dest with a positive
 *    capacity, with U_ILLEGAL_ARGUMENT_ERROR (further conditions of this
 *    check are not visible);
 *  - obtain srcLength with uprv_strlen() in one case;
 *  - reject overlapping src and dest ranges with U_ILLEGAL_ARGUMENT_ERROR;
 *  - dispatch on toWhichCase: FOLD_CASE -> utf8_foldCase(), TO_LOWER and
 *    TO_UPPER -> _caseMap() with ucase_toFullLower / ucase_toFullUpper,
 *    anything else -> _toTitle(), or U_UNSUPPORTED_ERROR when
 *    UCONFIG_NO_BREAK_ITERATION is set;
 *  - return the value of u_terminateChars() for dest and destLength.
 *
 * NOTE(review): the return-type line, the toWhichCase parameter, the
 * declaration of destLength, the rest of the argument check, the condition
 * for the uprv_strlen() call (presumably srcLength==-1), the setup of csc
 * beyond its initialiser and the trailing call arguments are missing from
 * this extract.
 */
453 caseMap(const UCaseMap
*csm
,
454 uint8_t *dest
, int32_t destCapacity
,
455 const uint8_t *src
, int32_t srcLength
,
457 UErrorCode
*pErrorCode
) {
460 /* check argument values */
461 if(U_FAILURE(*pErrorCode
)) {
464 if( destCapacity
<0 ||
465 (dest
==NULL
&& destCapacity
>0) ||
469 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
473 /* get the string length */
475 srcLength
=uprv_strlen((const char *)src
);
478 /* check for overlapping source and destination */
/* either range starting inside the other counts as overlap */
480 ((src
>=dest
&& src
<(dest
+destCapacity
)) ||
481 (dest
>=src
&& dest
<(src
+srcLength
)))
483 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
/* case folding needs no context object */
489 if(toWhichCase
==FOLD_CASE
) {
490 destLength
=utf8_foldCase(csm
->csp
, dest
, destCapacity
, src
, srcLength
,
491 csm
->options
, pErrorCode
);
/* the remaining mappings share one context object */
493 UCaseContext csc
={ NULL
};
498 if(toWhichCase
==TO_LOWER
) {
499 destLength
=_caseMap(csm
, ucase_toFullLower
,
504 } else if(toWhichCase
==TO_UPPER
) {
505 destLength
=_caseMap(csm
, ucase_toFullUpper
,
510 } else /* if(toWhichCase==TO_TITLE) */ {
511 #if UCONFIG_NO_BREAK_ITERATION
512 *pErrorCode
=U_UNSUPPORTED_ERROR
;
514 /* UCaseMap is actually non-const in toTitle() APIs. */
515 destLength
=_toTitle((UCaseMap
*)csm
, dest
, destCapacity
,
516 src
, &csc
, srcLength
,
/* the returned length comes from u_terminateChars() */
522 return u_terminateChars((char *)dest
, destCapacity
, destLength
, pErrorCode
);
525 /* public API functions */
/*
 * ucasemap_utf8ToLower: public UTF-8 lowercasing entry point.
 *
 * Forwards dest/src (cast from char to uint8_t pointers), their
 * capacity/length and pErrorCode together with the TO_LOWER selector.
 *
 * NOTE(review): the line naming the callee is missing from this extract;
 * presumably `return caseMap(csm,` - confirm.
 */
527 U_CAPI
int32_t U_EXPORT2
528 ucasemap_utf8ToLower(const UCaseMap
*csm
,
529 char *dest
, int32_t destCapacity
,
530 const char *src
, int32_t srcLength
,
531 UErrorCode
*pErrorCode
) {
533 (uint8_t *)dest
, destCapacity
,
534 (const uint8_t *)src
, srcLength
,
535 TO_LOWER
, pErrorCode
);
/*
 * ucasemap_utf8ToUpper: public UTF-8 uppercasing entry point.
 *
 * Forwards dest/src (cast from char to uint8_t pointers), their
 * capacity/length and pErrorCode together with the TO_UPPER selector.
 *
 * NOTE(review): the line naming the callee is missing from this extract;
 * presumably `return caseMap(csm,` - confirm.
 */
538 U_CAPI
int32_t U_EXPORT2
539 ucasemap_utf8ToUpper(const UCaseMap
*csm
,
540 char *dest
, int32_t destCapacity
,
541 const char *src
, int32_t srcLength
,
542 UErrorCode
*pErrorCode
) {
544 (uint8_t *)dest
, destCapacity
,
545 (const uint8_t *)src
, srcLength
,
546 TO_UPPER
, pErrorCode
);
549 #if !UCONFIG_NO_BREAK_ITERATION
/*
 * ucasemap_utf8ToTitle: public UTF-8 titlecasing entry point.
 *
 * Unlike the lower/upper/fold entry points it takes a non-const UCaseMap.
 * Forwards dest/src (cast from char to uint8_t pointers), their
 * capacity/length and pErrorCode together with the TO_TITLE selector.
 *
 * NOTE(review): the line naming the callee is missing from this extract;
 * presumably `return caseMap(csm,` - confirm.
 */
551 U_CAPI
int32_t U_EXPORT2
552 ucasemap_utf8ToTitle(UCaseMap
*csm
,
553 char *dest
, int32_t destCapacity
,
554 const char *src
, int32_t srcLength
,
555 UErrorCode
*pErrorCode
) {
557 (uint8_t *)dest
, destCapacity
,
558 (const uint8_t *)src
, srcLength
,
559 TO_TITLE
, pErrorCode
);
/*
 * ucasemap_utf8FoldCase: public UTF-8 case folding entry point.
 *
 * Forwards dest/src (cast from char to uint8_t pointers), their
 * capacity/length and pErrorCode together with the FOLD_CASE selector.
 *
 * NOTE(review): the line naming the callee is missing from this extract;
 * presumably `return caseMap(csm,` - confirm.
 */
564 U_CAPI
int32_t U_EXPORT2
565 ucasemap_utf8FoldCase(const UCaseMap
*csm
,
566 char *dest
, int32_t destCapacity
,
567 const char *src
, int32_t srcLength
,
568 UErrorCode
*pErrorCode
) {
570 (uint8_t *)dest
, destCapacity
,
571 (const uint8_t *)src
, srcLength
,
572 FOLD_CASE
, pErrorCode
);