/*
*******************************************************************************
*
*   Copyright (C) 2005-2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucasemap.cpp
*   tab size:   8 (not used)
*
*   created on: 2005may06
*   created by: Markus W. Scherer
*
*   Case mapping service object and functions using it.
*/
19 #include "unicode/utypes.h"
20 #include "unicode/brkiter.h"
21 #include "unicode/ubrk.h"
22 #include "unicode/uloc.h"
23 #include "unicode/ustring.h"
24 #include "unicode/ucasemap.h"
25 #if !UCONFIG_NO_BREAK_ITERATION
26 #include "unicode/utext.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf8.h"
30 #include "unicode/utf16.h"
/* UCaseMap service object -------------------------------------------------- */
/*
 * Creates a UCaseMap service object.
 *
 * Visible steps: bail-out test on an incoming failure code, allocation of
 * sizeof(UCaseMap) bytes with uprv_malloc(), zero-fill with uprv_memset(),
 * csp set from ucase_getSingleton(), locale set through ucasemap_setLocale(),
 * and a second failure test after that call.
 *
 * locale      forwarded unchanged to ucasemap_setLocale()
 * options     not referenced in the visible lines; presumably stored in
 *             csm->options - TODO confirm against upstream
 * pErrorCode  in/out error code, tested with U_FAILURE() on entry and again
 *             after ucasemap_setLocale()
 *
 * NOTE(review): this listing has lost source lines (the embedded line numbers
 * jump 41, 44, 48, 52, 54, 55, 56 and then stop). The declaration of csm, the
 * bodies of both U_FAILURE() branches, any check of the uprv_malloc() result,
 * and the final return are not shown. Restore them before building.
 */
40 U_CAPI UCaseMap
* U_EXPORT2
41 ucasemap_open(const char *locale
, uint32_t options
, UErrorCode
*pErrorCode
) {
44 if(U_FAILURE(*pErrorCode
)) {
48 csm
=(UCaseMap
*)uprv_malloc(sizeof(UCaseMap
));
52 uprv_memset(csm
, 0, sizeof(UCaseMap
));
54 csm
->csp
=ucase_getSingleton();
55 ucasemap_setLocale(csm
, locale
, pErrorCode
);
/* A failure reported by ucasemap_setLocale() is handled here; the handling itself is missing from the listing. */
56 if(U_FAILURE(*pErrorCode
)) {
/*
 * Releases a UCaseMap.
 *
 * When break iteration is compiled in (!UCONFIG_NO_BREAK_ITERATION), csm->iter
 * is cast to BreakIterator * and destroyed with delete rather than through
 * ubrk_close(), for the dependency reason given in the comment below.
 *
 * NOTE(review): original lines 65, 67 and 71 onward are absent from this
 * listing (return type/linkage, whatever guards the body, the matching #endif,
 * the release of csm itself, closing braces). Restore them before building.
 */
66 ucasemap_close(UCaseMap
*csm
) {
68 #if !UCONFIG_NO_BREAK_ITERATION
69 // Do not call ubrk_close() so that we do not depend on all of the BreakIterator code.
70 delete reinterpret_cast<BreakIterator
*>(csm
->iter
);
/*
 * Accessor that returns a const char * for the given case map.
 *
 * NOTE(review): the body (original lines 78-79) is missing from this listing.
 * Presumably it returns csm->locale, the buffer ucasemap_setLocale() fills -
 * TODO confirm against upstream.
 */
76 U_CAPI
const char * U_EXPORT2
77 ucasemap_getLocale(const UCaseMap
*csm
) {
/*
 * Accessor that returns a uint32_t for the given case map.
 *
 * NOTE(review): the body (original lines 83-84) is missing from this listing.
 * Presumably it returns csm->options, the field ucasemap_setOptions() writes -
 * TODO confirm against upstream.
 */
81 U_CAPI
uint32_t U_EXPORT2
82 ucasemap_getOptions(const UCaseMap
*csm
) {
/*
 * Sets the locale of a case map.
 *
 * Writes the result of uloc_getName(locale) into the fixed-size csm->locale
 * buffer. If that does not fit, it falls back to uloc_getLanguage(). On
 * success it calls ucase_getCaseLocale(csm->locale, &csm->locCache).
 *
 * csm         case map to modify; dereferenced without a NULL check in the
 *             visible lines
 * locale      locale ID passed to uloc_getName() / uloc_getLanguage()
 * pErrorCode  in/out error code; set to U_BUFFER_OVERFLOW_ERROR when the
 *             returned length equals sizeof(csm->locale)
 *
 * NOTE(review): lines are missing from this listing (embedded numbers skip
 * 86, 88-89, 91-93, 99, 102-103 and everything after 105), including the
 * return type, the declaration of length, the early-return body, closing
 * braces and whatever happens on the failure path.
 */
87 ucasemap_setLocale(UCaseMap
*csm
, const char *locale
, UErrorCode
*pErrorCode
) {
90 if(U_FAILURE(*pErrorCode
)) {
94 length
=uloc_getName(locale
, csm
->locale
, (int32_t)sizeof(csm
->locale
), pErrorCode
);
/* Overflow reported, or the returned length equals the buffer size: clear the error and retry below. */
95 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
|| length
==sizeof(csm
->locale
)) {
96 *pErrorCode
=U_ZERO_ERROR
;
97 /* we only really need the language code for case mappings */
98 length
=uloc_getLanguage(locale
, csm
->locale
, (int32_t)sizeof(csm
->locale
), pErrorCode
);
/* A length equal to the whole buffer is reported as an overflow. */
100 if(length
==sizeof(csm
->locale
)) {
101 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/* Only call ucase_getCaseLocale() with the stored locale when everything above succeeded. */
104 if(U_SUCCESS(*pErrorCode
)) {
105 ucase_getCaseLocale(csm
->locale
, &csm
->locCache
);
111 U_CAPI
void U_EXPORT2
112 ucasemap_setOptions(UCaseMap
*csm
, uint32_t options
, UErrorCode
* /*pErrorCode*/) {
113 csm
->options
=options
;
/* UTF-8 string case mappings ----------------------------------------------- */

/* TODO(markus): Move to a new, separate utf8case.c file. */

/* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
/*
 * Appends one full-case-mapping result to a UTF-8 destination buffer.
 *
 * dest          output buffer
 * destIndex     current write position; callers in this file assign the
 *               return value back to their own destIndex
 * destCapacity  size of dest
 * result        encoded mapping result; the visible branch distinguishes
 *               values <= UCASE_MAX_STRING_LENGTH from the rest
 * s             UChar string belonging to the result
 *
 * Two regimes are visible. While destIndex<destCapacity the result is
 * written (U8_APPEND for a single code point, a conversion into
 * dest+destIndex otherwise). Once the buffer is full only the length is
 * added (U8_LENGTH(c), or a conversion called with NULL/0).
 *
 * NOTE(review): many lines are missing from this listing (embedded numbers
 * skip 124, 127, 129, 131-132, 134-140, 143-145, 147, 150-152, 154, 156-157,
 * 160-163, 165, 167, 169-170 and everything after 171). The declarations of
 * c and isError, the names of the two conversion calls and the return
 * statement are not shown. Restore them before building.
 */
121 static inline int32_t
122 appendResult(uint8_t *dest
, int32_t destIndex
, int32_t destCapacity
,
123 int32_t result
, const UChar
*s
) {
125 int32_t length
, destLength
;
126 UErrorCode errorCode
;
128 /* decode the result */
130 /* (not) original code point */
133 } else if(result
<=UCASE_MAX_STRING_LENGTH
) {
/* Room left in dest: write the result. */
141 if(destIndex
<destCapacity
) {
142 /* append the result */
146 U8_APPEND(dest
, destIndex
, destCapacity
, c
, isError
);
148 /* overflow, nothing written */
149 destIndex
+=U8_LENGTH(c
);
/* A fresh local error code is used for the conversion; the caller's status is not touched here. */
153 errorCode
=U_ZERO_ERROR
;
155 (char *)(dest
+destIndex
), destCapacity
-destIndex
, &destLength
,
158 destIndex
+=destLength
;
159 /* we might have an overflow, but we know the actual length */
/* From here on: buffer already full, only the output length is accumulated. */
164 destIndex
+=U8_LENGTH(c
);
166 errorCode
=U_ZERO_ERROR
;
168 NULL
, 0, &destLength
,
171 destIndex
+=destLength
;
/*
 * Context-iterator callback over UTF-8 text. It is the function this file
 * passes to the mapping functions together with a UCaseContext.
 *
 * context  opaque pointer, cast to UCaseContext *
 * dir      iteration direction selector; the tests on dir are missing from
 *          this listing, see the existing comments for the three cases
 *          (reset backward, reset forward, continue)
 *
 * Backward steps read one code point with U8_PREV while csc->start<csc->index.
 * Forward steps read one with U8_NEXT while csc->index<csc->limit. Both
 * operate on csc->p.
 *
 * NOTE(review): the declaration of c, every test on dir and every return
 * statement are absent (embedded numbers skip 180-182, 185-186, 189-190,
 * 192-195, 198-200 and everything after 202). Restore them before building.
 */
177 static UChar32 U_CALLCONV
178 utf8_caseContextIterator(void *context
, int8_t dir
) {
179 UCaseContext
*csc
=(UCaseContext
*)context
;
183 /* reset for backward iteration */
184 csc
->index
=csc
->cpStart
;
187 /* reset for forward iteration */
188 csc
->index
=csc
->cpLimit
;
191 /* continue current iteration direction */
196 if(csc
->start
<csc
->index
) {
197 U8_PREV((const uint8_t *)csc
->p
, csc
->start
, csc
->index
, c
);
201 if(csc
->index
<csc
->limit
) {
202 U8_NEXT((const uint8_t *)csc
->p
, csc
->index
, csc
->limit
, c
);
/*
 * Case-maps [srcStart..srcLimit[ but takes
 * context [0..srcLength[ into account.
 */
/*
 * Generic UTF-8 case-mapping loop shared by the lower/upper/title code paths.
 *
 * csm           case map supplying csp, locale and locCache
 * map           full-mapping function applied to each code point
 * dest          output buffer
 * destCapacity  size of dest
 * src           source text
 * csc           context object; cpStart/cpLimit are updated per code point
 *               and it is handed to map() with utf8_caseContextIterator
 * srcStart, srcLimit  range of src to map
 * pErrorCode    set to U_BUFFER_OVERFLOW_ERROR when destIndex>destCapacity
 *
 * NOTE(review): lines are missing from this listing (embedded numbers skip
 * 213, 219-220, 222-223, 225, 227-228, 233, 237-239, 244, 246-248 and
 * everything after 250). The return type, the declarations of c, c2, s and
 * locCache, the initialisation of srcIndex/destIndex, the guard around the
 * raw byte copy and the return are not shown. Restore them before building.
 */
214 _caseMap(const UCaseMap
*csm
, UCaseMapFull
*map
,
215 uint8_t *dest
, int32_t destCapacity
,
216 const uint8_t *src
, UCaseContext
*csc
,
217 int32_t srcStart
, int32_t srcLimit
,
218 UErrorCode
*pErrorCode
) {
221 int32_t srcIndex
, destIndex
;
224 locCache
=csm
->locCache
;
226 /* case mapping loop */
229 while(srcIndex
<srcLimit
) {
/* Record the byte range of the current code point for the context iterator. */
230 csc
->cpStart
=srcIndex
;
231 U8_NEXT(src
, srcIndex
, srcLimit
, c
);
232 csc
->cpLimit
=srcIndex
;
/*
 * Raw copy of the bytes [cpStart..srcIndex[ of the code point just read.
 * NOTE(review): the condition guarding this copy is missing from the
 * listing; presumably it handles a U8_NEXT() failure (c<0) - confirm.
 * NOTE(review): destIndex advances only for bytes actually stored, so once
 * dest is full these bytes are not counted toward the reported length
 * unless the missing lines compensate - verify.
 */
234 int32_t i
=csc
->cpStart
;
235 while(destIndex
<destCapacity
&& i
<srcIndex
) {
236 dest
[destIndex
++]=src
[i
++];
240 c
=map(csm
->csp
, c
, utf8_caseContextIterator
, csc
, &s
, csm
->locale
, &locCache
);
/* c<0 takes ~c as the candidate code point; otherwise c itself must exceed UCASE_MAX_STRING_LENGTH. A candidate <=0x7f is stored as one byte. */
241 if((destIndex
<destCapacity
) && (c
<0 ? (c2
=~c
)<=0x7f : UCASE_MAX_STRING_LENGTH
<c
&& (c2
=c
)<=0x7f)) {
242 /* fast path version of appendResult() for ASCII results */
243 dest
[destIndex
++]=(uint8_t)c2
;
245 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
249 if(destIndex
>destCapacity
) {
250 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
255 #if !UCONFIG_NO_BREAK_ITERATION
/*
 * Titlecases UTF-8 text segment by segment (see the R3 toTitlecase
 * description further down).
 *
 * For each segment [prev..idx[ the visible code:
 *   - copies leading uncased bytes [prev..titleStart[ with uprv_memcpy()
 *     when they fit,
 *   - titlecases the first cased code point with ucase_toFullTitle() and
 *     appends it with appendResult(),
 *   - has a special case for Dutch "IJ"/"ij" at the start of the segment,
 *   - then either lowercases the remainder (ucase_toFullLower) or copies it
 *     unchanged when U_TITLECASE_NO_LOWERCASE is set.
 * U_BUFFER_OVERFLOW_ERROR is set when destIndex ends up above destCapacity.
 *
 * csm           case map; supplies csp, locale, locCache, options and iter
 * dest          output buffer
 * destCapacity  size of dest
 * src           source text
 * srcLength     length of src
 * pErrorCode    in/out error code
 *
 * Note: "index" in the older comments below refers to the local variable idx.
 *
 * NOTE(review): many lines are missing from this listing. Not shown: the
 * declarations of c and s, the code that obtains idx from the BreakIterator
 * bi (original lines 287-292), the assignment to c before the second Dutch
 * appendResult() call, the head of the lowercasing call whose arguments
 * appear at 361-362 (presumably _caseMap()), the destIndex updates after the
 * uprv_memcpy() calls, the comment delimiters around the two starred text
 * blocks, and the return. Restore them before building.
 */
257 U_CFUNC
int32_t U_CALLCONV
258 ucasemap_internalUTF8ToTitle(const UCaseMap
*csm
,
259 uint8_t *dest
, int32_t destCapacity
,
260 const uint8_t *src
, int32_t srcLength
,
261 UErrorCode
*pErrorCode
) {
264 int32_t prev
, titleStart
, titleLimit
, idx
, destIndex
, length
;
267 if(U_FAILURE(*pErrorCode
)) {
271 // Use the C++ abstract base class to minimize dependencies.
272 // TODO: Change UCaseMap.iter to store a BreakIterator directly.
273 BreakIterator
*bi
=reinterpret_cast<BreakIterator
*>(csm
->iter
);
275 /* set up local variables */
276 int32_t locCache
=csm
->locCache
;
277 UCaseContext csc
=UCASECONTEXT_INITIALIZER
;
284 /* titlecasing loop */
285 while(prev
<srcLength
) {
286 /* find next index where to titlecase */
/* A boundary of UBRK_DONE or one beyond srcLength gets special handling; the handling itself is missing from the listing. */
293 if(idx
==UBRK_DONE
|| idx
>srcLength
) {
298 * Unicode 4 & 5 section 3.13 Default Case Operations:
300 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
301 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
302 * cased character F. If F exists, map F to default_title(F); then map each
303 * subsequent character C to default_lower(C).
305 * In this implementation, segment [prev..index[ into 3 parts:
306 * a) uncased characters (copy as-is) [prev..titleStart[
307 * b) first case letter (titlecase) [titleStart..titleLimit[
308 * c) subsequent characters (lowercase) [titleLimit..index[
311 /* find and copy uncased characters [prev..titleStart[ */
312 titleStart
=titleLimit
=prev
;
313 U8_NEXT(src
, titleLimit
, idx
, c
);
/* Skip ahead to a cased character only when U_TITLECASE_NO_BREAK_ADJUSTMENT is not set and the first code point is uncased. */
314 if((csm
->options
&U_TITLECASE_NO_BREAK_ADJUSTMENT
)==0 && UCASE_NONE
==ucase_getType(csm
->csp
, c
)) {
315 /* Adjust the titlecasing index (titleStart) to the next cased character. */
317 titleStart
=titleLimit
;
318 if(titleLimit
==idx
) {
320 * only uncased characters in [prev..index[
321 * stop with titleStart==titleLimit==index
325 U8_NEXT(src
, titleLimit
, idx
, c
);
326 if(UCASE_NONE
!=ucase_getType(csm
->csp
, c
)) {
327 break; /* cased letter at [titleStart..titleLimit[ */
330 length
=titleStart
-prev
;
/* Copy only when the whole uncased run fits. */
332 if((destIndex
+length
)<=destCapacity
) {
333 uprv_memcpy(dest
+destIndex
, src
+prev
, length
);
339 if(titleStart
<titleLimit
) {
340 /* titlecase c which is from [titleStart..titleLimit[ */
341 csc
.cpStart
=titleStart
;
342 csc
.cpLimit
=titleLimit
;
343 c
=ucase_toFullTitle(csm
->csp
, c
, utf8_caseContextIterator
, &csc
, &s
, csm
->locale
, &locCache
);
344 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
346 /* Special case Dutch IJ titlecasing */
/* titleStart+1<idx keeps the src[titleStart+1] reads inside the current segment; 0x49/0x69 are 'I'/'i', 0x4A/0x6A are 'J'/'j'. */
347 if ( titleStart
+1 < idx
&&
348 ucase_getCaseLocale(csm
->locale
, &locCache
) == UCASE_LOC_DUTCH
&&
349 ( src
[titleStart
] == 0x0049 || src
[titleStart
] == 0x0069 ) &&
350 ( src
[titleStart
+1] == 0x004A || src
[titleStart
+1] == 0x006A )) {
352 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
355 /* lowercase [titleLimit..index[ */
357 if((csm
->options
&U_TITLECASE_NO_LOWERCASE
)==0) {
358 /* Normal operation: Lowercase the rest of the word. */
361 csm
, ucase_toFullLower
,
362 dest
+destIndex
, destCapacity
-destIndex
,
367 /* Optionally just copy the rest of the word unchanged. */
368 length
=idx
-titleLimit
;
369 if((destIndex
+length
)<=destCapacity
) {
370 uprv_memcpy(dest
+destIndex
, src
+titleLimit
, length
);
381 if(destIndex
>destCapacity
) {
382 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/*
 * Lowercasing mapper. It is the function ucasemap_utf8ToLower() passes to
 * ucasemap_mapUTF8() as stringCaseMapper.
 *
 * Sets up a UCaseContext from UCASECONTEXT_INITIALIZER and passes csm,
 * ucase_toFullLower, src, &csc and the range 0..srcLength to a call whose
 * head is not shown.
 *
 * NOTE(review): original lines 395-397, 399 and 401 onward are missing from
 * this listing (remaining context setup if any, the call head - presumably
 * "return _caseMap(" - the dest/destCapacity and pErrorCode arguments, and
 * the closing brace). Restore them before building.
 */
389 static int32_t U_CALLCONV
390 ucasemap_internalUTF8ToLower(const UCaseMap
*csm
,
391 uint8_t *dest
, int32_t destCapacity
,
392 const uint8_t *src
, int32_t srcLength
,
393 UErrorCode
*pErrorCode
) {
394 UCaseContext csc
=UCASECONTEXT_INITIALIZER
;
398 csm
, ucase_toFullLower
,
400 src
, &csc
, 0, srcLength
,
/*
 * Uppercasing mapper. It is the function ucasemap_utf8ToUpper() passes to
 * ucasemap_mapUTF8() as stringCaseMapper.
 *
 * Sets up a UCaseContext from UCASECONTEXT_INITIALIZER and passes csm,
 * ucase_toFullUpper, src, &csc and the range 0..srcLength to a call whose
 * head is not shown.
 *
 * NOTE(review): original lines 410-412, 414 and 416 onward are missing from
 * this listing (remaining context setup if any, the call head - presumably
 * "return _caseMap(" - the dest/destCapacity and pErrorCode arguments, and
 * the closing brace). Restore them before building.
 */
404 static int32_t U_CALLCONV
405 ucasemap_internalUTF8ToUpper(const UCaseMap
*csm
,
406 uint8_t *dest
, int32_t destCapacity
,
407 const uint8_t *src
, int32_t srcLength
,
408 UErrorCode
*pErrorCode
) {
409 UCaseContext csc
=UCASECONTEXT_INITIALIZER
;
413 csm
, ucase_toFullUpper
,
415 src
, &csc
, 0, srcLength
,
/*
 * Case-folds UTF-8 text code point by code point with ucase_toFullFolding().
 *
 * csp           case properties passed to ucase_toFullFolding()
 * dest          output buffer
 * destCapacity  size of dest
 * src           source text
 * srcLength     length of src
 * options       folding options passed to ucase_toFullFolding(); its
 *               parameter line (original 423) is missing from this listing,
 *               but the name is used below and the caller in this file
 *               passes csm->options in that position
 * pErrorCode    set to U_BUFFER_OVERFLOW_ERROR when destIndex>destCapacity
 *
 * NOTE(review): lines are missing from this listing (embedded numbers skip
 * 419, 423, 426-430, 434, 436, 439-441, 446, 448-450 and everything after
 * 452). The return type, the declarations of c, c2, s and start, the guard
 * around the raw byte copy and the return are not shown. Restore them
 * before building.
 */
420 utf8_foldCase(const UCaseProps
*csp
,
421 uint8_t *dest
, int32_t destCapacity
,
422 const uint8_t *src
, int32_t srcLength
,
424 UErrorCode
*pErrorCode
) {
425 int32_t srcIndex
, destIndex
;
431 /* case mapping loop */
432 srcIndex
=destIndex
=0;
433 while(srcIndex
<srcLength
) {
435 U8_NEXT(src
, srcIndex
, srcLength
, c
);
/*
 * Raw copy of source bytes [start..srcIndex[ while room remains.
 * NOTE(review): the guard for this copy and the assignment to start are
 * missing from the listing; presumably this handles a U8_NEXT() failure -
 * confirm. destIndex advances only for bytes actually stored.
 */
437 while(destIndex
<destCapacity
&& start
<srcIndex
) {
438 dest
[destIndex
++]=src
[start
++];
442 c
=ucase_toFullFolding(csp
, c
, &s
, options
);
/* Same single-byte shortcut as in _caseMap(): a candidate code point <=0x7f is stored directly. */
443 if((destIndex
<destCapacity
) && (c
<0 ? (c2
=~c
)<=0x7f : UCASE_MAX_STRING_LENGTH
<c
&& (c2
=c
)<=0x7f)) {
444 /* fast path version of appendResult() for ASCII results */
445 dest
[destIndex
++]=(uint8_t)c2
;
447 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
451 if(destIndex
>destCapacity
) {
452 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
457 static int32_t U_CALLCONV
458 ucasemap_internalUTF8Fold(const UCaseMap
*csm
,
459 uint8_t *dest
, int32_t destCapacity
,
460 const uint8_t *src
, int32_t srcLength
,
461 UErrorCode
*pErrorCode
) {
462 return utf8_foldCase(csm
->csp
, dest
, destCapacity
, src
, srcLength
, csm
->options
, pErrorCode
);
/*
 * Shared driver behind the public UTF-8 case-mapping functions.
 *
 * Visible steps: bail-out test on an incoming failure code; argument checks
 * that set U_ILLEGAL_ARGUMENT_ERROR (negative destCapacity, NULL dest with a
 * positive capacity, overlapping src/dest ranges); computing srcLength with
 * uprv_strlen(); calling stringCaseMapper; and returning the result of
 * u_terminateChars() on dest.
 *
 * csm               case map forwarded to stringCaseMapper
 * dest              output buffer
 * destCapacity      size of dest
 * src               source text
 * srcLength         length of src; recomputed with uprv_strlen() under a
 *                   condition not shown here (presumably srcLength==-1 -
 *                   TODO confirm)
 * stringCaseMapper  function that performs the actual mapping
 * pErrorCode        in/out error code
 *
 * NOTE(review): lines are missing from this listing (embedded numbers skip
 * 465, 471-472, 475-476, 479-481, 483-485, 487, 489-490, 492, 495, 497-499
 * and everything after 501). The return type, the declaration of destLength,
 * the remaining argument tests, the early returns, the heads of the
 * strlen/overlap conditions and the closing braces are not shown. Restore
 * them before building.
 */
466 ucasemap_mapUTF8(const UCaseMap
*csm
,
467 uint8_t *dest
, int32_t destCapacity
,
468 const uint8_t *src
, int32_t srcLength
,
469 UTF8CaseMapper
*stringCaseMapper
,
470 UErrorCode
*pErrorCode
) {
473 /* check argument values */
474 if(U_FAILURE(*pErrorCode
)) {
477 if( destCapacity
<0 ||
478 (dest
==NULL
&& destCapacity
>0) ||
482 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
486 /* get the string length */
488 srcLength
=(int32_t)uprv_strlen((const char *)src
);
491 /* check for overlapping source and destination */
/* Overlap means src starts inside [dest, dest+destCapacity) or dest starts inside [src, src+srcLength). */
493 ((src
>=dest
&& src
<(dest
+destCapacity
)) ||
494 (dest
>=src
&& dest
<(src
+srcLength
)))
496 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
500 destLength
=stringCaseMapper(csm
, dest
, destCapacity
, src
, srcLength
, pErrorCode
);
501 return u_terminateChars((char *)dest
, destCapacity
, destLength
, pErrorCode
);
/* public API functions */
506 U_CAPI
int32_t U_EXPORT2
507 ucasemap_utf8ToLower(const UCaseMap
*csm
,
508 char *dest
, int32_t destCapacity
,
509 const char *src
, int32_t srcLength
,
510 UErrorCode
*pErrorCode
) {
511 return ucasemap_mapUTF8(csm
,
512 (uint8_t *)dest
, destCapacity
,
513 (const uint8_t *)src
, srcLength
,
514 ucasemap_internalUTF8ToLower
, pErrorCode
);
517 U_CAPI
int32_t U_EXPORT2
518 ucasemap_utf8ToUpper(const UCaseMap
*csm
,
519 char *dest
, int32_t destCapacity
,
520 const char *src
, int32_t srcLength
,
521 UErrorCode
*pErrorCode
) {
522 return ucasemap_mapUTF8(csm
,
523 (uint8_t *)dest
, destCapacity
,
524 (const uint8_t *)src
, srcLength
,
525 ucasemap_internalUTF8ToUpper
, pErrorCode
);
528 U_CAPI
int32_t U_EXPORT2
529 ucasemap_utf8FoldCase(const UCaseMap
*csm
,
530 char *dest
, int32_t destCapacity
,
531 const char *src
, int32_t srcLength
,
532 UErrorCode
*pErrorCode
) {
533 return ucasemap_mapUTF8(csm
,
534 (uint8_t *)dest
, destCapacity
,
535 (const uint8_t *)src
, srcLength
,
536 ucasemap_internalUTF8Fold
, pErrorCode
);