2 *******************************************************************************
4 * Copyright (C) 2005-2016, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ucasemap.cpp
10 * tab size: 8 (not used)
13 * created on: 2005may06
14 * created by: Markus W. Scherer
16 * Case mapping service object and functions using it.
19 #include "unicode/utypes.h"
20 #include "unicode/brkiter.h"
21 #include "unicode/ubrk.h"
22 #include "unicode/uloc.h"
23 #include "unicode/ustring.h"
24 #include "unicode/ucasemap.h"
25 #if !UCONFIG_NO_BREAK_ITERATION
26 #include "unicode/utext.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf8.h"
30 #include "unicode/utf16.h"
38 /* UCaseMap service object -------------------------------------------------- */
/*
 * Creates a UCaseMap service object for the given locale.
 *
 * Visible steps: check the incoming error code, allocate the object with
 * uprv_malloc(), zero it, attach the case-properties singleton, then resolve
 * the locale through ucasemap_setLocale() and test the error code again.
 *
 * locale      locale ID, forwarded unchanged to ucasemap_setLocale()
 * options     options word; no line visible in this view consumes it
 * pErrorCode  in/out ICU error code
 *
 * NOTE(review): the allocation-failure check, the bodies of both U_FAILURE
 * branches and the return statements are not visible in this view.
 */
40 U_CAPI UCaseMap
* U_EXPORT2
41 ucasemap_open(const char *locale
, uint32_t options
, UErrorCode
*pErrorCode
) {
44 if(U_FAILURE(*pErrorCode
)) {
48 csm
=(UCaseMap
*)uprv_malloc(sizeof(UCaseMap
));
/* Start from an all-zero object before individual fields are filled in. */
52 uprv_memset(csm
, 0, sizeof(UCaseMap
));
54 csm
->csp
=ucase_getSingleton();
/* Resolving the locale reports through pErrorCode, which is tested right after. */
55 ucasemap_setLocale(csm
, locale
, pErrorCode
);
56 if(U_FAILURE(*pErrorCode
)) {
/*
 * Tears down a UCaseMap.
 *
 * When break iteration is compiled in, the iterator stored in csm->iter is
 * destroyed with a C++ delete through a BreakIterator pointer.
 *
 * NOTE(review): a NULL check on csm and the release of the UCaseMap memory
 * itself are not visible in this view.
 */
66 ucasemap_close(UCaseMap
*csm
) {
68 #if !UCONFIG_NO_BREAK_ITERATION
69 // Do not call ubrk_close() so that we do not depend on all of the BreakIterator code.
70 delete reinterpret_cast<BreakIterator
*>(csm
->iter
);
/*
 * Read-only accessor: takes a const UCaseMap and returns a const char *.
 * NOTE(review): the body is not visible in this view; presumably it returns
 * the locale ID held in csm->locale -- confirm.
 */
76 U_CAPI
const char * U_EXPORT2
77 ucasemap_getLocale(const UCaseMap
*csm
) {
/*
 * Read-only accessor: takes a const UCaseMap and returns a uint32_t.
 * NOTE(review): the body is not visible in this view; presumably it returns
 * the options word held in csm->options -- confirm.
 */
81 U_CAPI
uint32_t U_EXPORT2
82 ucasemap_getOptions(const UCaseMap
*csm
) {
/*
 * Stores the locale to be used for case mapping in csm->locale.
 *
 * The full locale name is tried first via uloc_getName(). If it does not fit
 * the fixed-size csm->locale buffer, only the language code is stored
 * instead. On success the case-locale cache csm->locCache is refreshed.
 *
 * csm         object to update
 * locale      locale ID, passed to uloc_getName() / uloc_getLanguage()
 * pErrorCode  in/out ICU error code; set to U_BUFFER_OVERFLOW_ERROR when even
 *             the language code fills the whole buffer
 */
87 ucasemap_setLocale(UCaseMap
*csm
, const char *locale
, UErrorCode
*pErrorCode
) {
90 if(U_FAILURE(*pErrorCode
)) {
94 length
=uloc_getName(locale
, csm
->locale
, (int32_t)sizeof(csm
->locale
), pErrorCode
);
/* A result that exactly fills the buffer is handled the same way as a reported overflow. */
95 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
|| length
==sizeof(csm
->locale
)) {
/* Clear the error so that the fallback call below starts clean. */
96 *pErrorCode
=U_ZERO_ERROR
;
97 /* we only really need the language code for case mappings */
98 length
=uloc_getLanguage(locale
, csm
->locale
, (int32_t)sizeof(csm
->locale
), pErrorCode
);
100 if(length
==sizeof(csm
->locale
)) {
101 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/* Refresh the cached case locale only once a usable locale string is stored. */
104 if(U_SUCCESS(*pErrorCode
)) {
105 ucase_getCaseLocale(csm
->locale
, &csm
->locCache
);
/*
 * Replaces the options word stored in the UCaseMap.
 *
 * csm      object to update
 * options  new value for csm->options, stored as given
 *
 * The UErrorCode parameter is left unnamed and is not used by the visible code.
 */
111 U_CAPI
void U_EXPORT2
112 ucasemap_setOptions(UCaseMap
*csm
, uint32_t options
, UErrorCode
* /*pErrorCode*/) {
113 csm
->options
=options
;
116 /* UTF-8 string case mappings ----------------------------------------------- */
118 /* TODO(markus): Move to a new, separate utf8case.c file. */
120 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
/*
 * Appends one case-mapping result to the UTF-8 destination buffer.
 *
 * dest          output buffer
 * destIndex     current write position in dest
 * destCapacity  size of dest in bytes
 * result        value produced by a case-mapping function; values up to
 *               UCASE_MAX_STRING_LENGTH take a different branch from larger ones
 * s             UChar text that accompanies result
 *
 * Returns -1 on the visible integer-overflow paths. destIndex is advanced
 * even when the output does not fit, so callers learn the required length.
 *
 * NOTE(review): several lines, including the names of the conversion calls
 * and the final return, are not visible in this view.
 */
121 static inline int32_t
122 appendResult(uint8_t *dest
, int32_t destIndex
, int32_t destCapacity
,
123 int32_t result
, const UChar
*s
) {
126 UErrorCode errorCode
;
128 /* decode the result */
130 /* (not) original code point */
133 } else if(result
<=UCASE_MAX_STRING_LENGTH
) {
/* Guard the later index arithmetic: destIndex+length must stay within int32_t. */
140 if(length
>(INT32_MAX
-destIndex
)) {
141 return -1; // integer overflow
144 if(destIndex
<destCapacity
) {
145 /* append the result */
/* U8_APPEND receives destCapacity and reports failure through isError. */
149 U8_APPEND(dest
, destIndex
, destCapacity
, c
, isError
);
151 /* overflow, nothing written */
157 errorCode
=U_ZERO_ERROR
;
/* Conversion into the space remaining after destIndex; the callee name is on a line not visible here. */
159 (char *)(dest
+destIndex
), destCapacity
-destIndex
, &destLength
,
/* U_BUFFER_OVERFLOW_ERROR is excluded from this failure test. */
162 if(U_FAILURE(errorCode
) && errorCode
!= U_BUFFER_OVERFLOW_ERROR
) {
165 if(destLength
>(INT32_MAX
-destIndex
)) {
166 return -1; // integer overflow
168 destIndex
+=destLength
;
169 /* we might have an overflow, but we know the actual length */
177 errorCode
=U_ZERO_ERROR
;
/* Same call shape with a NULL buffer of capacity 0: only destLength is obtained. */
179 NULL
, 0, &destLength
,
182 if(U_FAILURE(errorCode
) && errorCode
!= U_BUFFER_OVERFLOW_ERROR
) {
185 if(destLength
>(INT32_MAX
-destIndex
)) {
186 return -1; // integer overflow
188 destIndex
+=destLength
;
/*
 * Appends the UTF-8 encoding of a single UChar to dest.
 *
 * dest          output buffer
 * destIndex     current write position in dest
 * destCapacity  size of dest in bytes
 * c             code unit to encode; U8_LENGTH(c) bytes are needed
 *
 * Returns -1 when destIndex+length would overflow int32_t.
 *
 * The bytes are written whenever they fit, including the case where they end
 * exactly at destCapacity. The earlier `limit<destCapacity` test skipped the
 * write in that exact-fit case, leaving output bytes unwritten although the
 * caller still continues from the advanced index.
 */
194 static inline int32_t
195 appendUChar(uint8_t *dest
, int32_t destIndex
, int32_t destCapacity
, UChar c
) {
196 int32_t length
=U8_LENGTH(c
);
197 if(length
>(INT32_MAX
-destIndex
)) {
198 return -1; // integer overflow
200 int32_t limit
=destIndex
+length
;
/* U8_APPEND_UNSAFE does no bounds checking, so the capacity test must happen here. */
201 if(limit
<=destCapacity
) {
202 U8_APPEND_UNSAFE(dest
, destIndex
, c
);
/*
 * Context iterator callback for UTF-8 text, passed to the case-mapping
 * functions alongside a UCaseContext.
 *
 * context  opaque pointer, cast to UCaseContext *
 * dir      direction selector; per the comments below it distinguishes a
 *          backward reset, a forward reset and "continue in the current
 *          direction" (the tests on dir are on lines not visible here)
 *
 * Backward iteration restarts at cpStart and steps with U8_PREV while
 * start<index; forward iteration restarts at cpLimit and steps with U8_NEXT
 * while index<limit.
 *
 * NOTE(review): the return statements, including the end-of-text value, are
 * not visible in this view.
 */
207 static UChar32 U_CALLCONV
208 utf8_caseContextIterator(void *context
, int8_t dir
) {
209 UCaseContext
*csc
=(UCaseContext
*)context
;
213 /* reset for backward iteration */
214 csc
->index
=csc
->cpStart
;
217 /* reset for forward iteration */
218 csc
->index
=csc
->cpLimit
;
221 /* continue current iteration direction */
/* Step backward only while text remains before the current index. */
226 if(csc
->start
<csc
->index
) {
227 U8_PREV((const uint8_t *)csc
->p
, csc
->start
, csc
->index
, c
);
/* Step forward only while text remains after the current index. */
231 if(csc
->index
<csc
->limit
) {
232 U8_NEXT((const uint8_t *)csc
->p
, csc
->index
, csc
->limit
, c
);
240 * Case-maps [srcStart..srcLimit[ but takes
241 * context [0..srcLength[ into account.
/*
 * Shared UTF-8 case-mapping loop used for the lower/upper mappings and for
 * the lowercase tail in titlecasing.
 *
 * csm           supplies csp, locale and the cached case locale
 * map           full case-mapping function applied to each code point
 * dest          output buffer
 * destCapacity  size of dest in bytes
 * src           input text
 * csc           case context; cpStart/cpLimit are updated per code point
 * srcStart      start of the range to map (its use is on a line not visible here)
 * srcLimit      end of the range to map
 * pErrorCode    set to U_INDEX_OUTOFBOUNDS_ERROR or U_BUFFER_OVERFLOW_ERROR
 *               on the visible error paths
 *
 * NOTE(review): loop initialisation, the condition guarding the byte-copy
 * loop and the return statements are not visible in this view.
 */
244 _caseMap(const UCaseMap
*csm
, UCaseMapFull
*map
,
245 uint8_t *dest
, int32_t destCapacity
,
246 const uint8_t *src
, UCaseContext
*csc
,
247 int32_t srcStart
, int32_t srcLimit
,
248 UErrorCode
*pErrorCode
) {
249 const UChar
*s
= NULL
;
251 int32_t srcIndex
, destIndex
;
/* Work on a local copy; csm is const and map() takes a pointer to the cache. */
254 locCache
=csm
->locCache
;
256 /* case mapping loop */
259 while(srcIndex
<srcLimit
) {
/* Record the byte range of the current code point for the context iterator. */
260 csc
->cpStart
=srcIndex
;
261 U8_NEXT(src
, srcIndex
, srcLimit
, c
);
262 csc
->cpLimit
=srcIndex
;
/* Copy the bytes [cpStart..srcIndex[ unchanged, as far as dest has room. */
264 int32_t i
=csc
->cpStart
;
265 while(destIndex
<destCapacity
&& i
<srcIndex
) {
266 dest
[destIndex
++]=src
[i
++];
270 c
=map(csm
->csp
, c
, utf8_caseContextIterator
, csc
, &s
, csm
->locale
, &locCache
);
/*
 * A negative result carries the code point as its bitwise complement; a
 * result above UCASE_MAX_STRING_LENGTH is taken as a code point directly.
 * Either way, a value up to 0x7f is stored as a single byte.
 */
271 if((destIndex
<destCapacity
) && (c
<0 ? (c2
=~c
)<=0x7f : UCASE_MAX_STRING_LENGTH
<c
&& (c2
=c
)<=0x7f)) {
272 /* fast path version of appendResult() for ASCII results */
273 dest
[destIndex
++]=(uint8_t)c2
;
275 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
277 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
/* destIndex past the capacity means the output did not fit. */
283 if(destIndex
>destCapacity
) {
284 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
289 #if !UCONFIG_NO_BREAK_ITERATION
/*
 * Titlecases UTF-8 text segment by segment, using the break iterator stored
 * in csm->iter to find the segment boundaries.
 *
 * csm           supplies csp, locale, options, the cached case locale and iter
 * dest          output buffer
 * destCapacity  size of dest in bytes
 * src           input text
 * srcLength     length of src in bytes
 * pErrorCode    set to U_INDEX_OUTOFBOUNDS_ERROR or U_BUFFER_OVERFLOW_ERROR
 *               on the visible error paths
 *
 * Options read here: U_TITLECASE_NO_BREAK_ADJUSTMENT (do not skip ahead to
 * the first cased character) and U_TITLECASE_NO_LOWERCASE (copy the rest of
 * the segment instead of lowercasing it).
 *
 * NOTE(review): the break-iterator calls, several closing braces and the
 * return statements are not visible in this view.
 */
291 U_CFUNC
int32_t U_CALLCONV
292 ucasemap_internalUTF8ToTitle(const UCaseMap
*csm
,
293 uint8_t *dest
, int32_t destCapacity
,
294 const uint8_t *src
, int32_t srcLength
,
295 UErrorCode
*pErrorCode
) {
298 int32_t prev
, titleStart
, titleLimit
, idx
, destIndex
, length
;
301 if(U_FAILURE(*pErrorCode
)) {
305 // Use the C++ abstract base class to minimize dependencies.
306 // TODO: Change UCaseMap.iter to store a BreakIterator directly.
307 BreakIterator
*bi
=reinterpret_cast<BreakIterator
*>(csm
->iter
);
309 /* set up local variables */
310 int32_t locCache
=csm
->locCache
;
311 UCaseContext csc
=UCASECONTEXT_INITIALIZER
;
318 /* titlecasing loop */
319 while(prev
<srcLength
) {
320 /* find next index where to titlecase */
/* A finished iterator or a boundary past the text is detected here. */
327 if(idx
==UBRK_DONE
|| idx
>srcLength
) {
332 * Unicode 4 & 5 section 3.13 Default Case Operations:
334 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
335 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
336 * cased character F. If F exists, map F to default_title(F); then map each
337 * subsequent character C to default_lower(C).
339 * In this implementation, segment [prev..index[ into 3 parts:
340 * a) uncased characters (copy as-is) [prev..titleStart[
341 * b) first case letter (titlecase) [titleStart..titleLimit[
342 * c) subsequent characters (lowercase) [titleLimit..index[
345 /* find and copy uncased characters [prev..titleStart[ */
346 titleStart
=titleLimit
=prev
;
347 U8_NEXT(src
, titleLimit
, idx
, c
);
/* Skip ahead only when break adjustment is allowed and the first character is uncased. */
348 if((csm
->options
&U_TITLECASE_NO_BREAK_ADJUSTMENT
)==0 && UCASE_NONE
==ucase_getType(csm
->csp
, c
)) {
349 /* Adjust the titlecasing index (titleStart) to the next cased character. */
351 titleStart
=titleLimit
;
352 if(titleLimit
==idx
) {
354 * only uncased characters in [prev..index[
355 * stop with titleStart==titleLimit==index
359 U8_NEXT(src
, titleLimit
, idx
, c
);
360 if(UCASE_NONE
!=ucase_getType(csm
->csp
, c
)) {
361 break; /* cased letter at [titleStart..titleLimit[ */
364 length
=titleStart
-prev
;
/*
 * NOTE(review): this sum is formed without a visible INT32_MAX guard, while
 * the equivalent copy further down checks length>(INT32_MAX-destIndex)
 * first -- confirm destIndex+length cannot overflow here.
 */
366 if((destIndex
+length
)<=destCapacity
) {
367 uprv_memcpy(dest
+destIndex
, src
+prev
, length
);
373 if(titleStart
<titleLimit
) {
374 /* titlecase c which is from [titleStart..titleLimit[ */
375 csc
.cpStart
=titleStart
;
376 csc
.cpLimit
=titleLimit
;
377 c
=ucase_toFullTitle(csm
->csp
, c
, utf8_caseContextIterator
, &csc
, &s
, csm
->locale
, &locCache
);
378 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
380 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
384 /* Special case Dutch IJ titlecasing */
/*
 * For the Dutch case locale, when the segment starts with I/i (0x49/0x69)
 * followed by J/j (0x4A/0x6A), a capital J (0x4A) is appended.
 */
385 if (titleStart
+1 < idx
&&
386 ucase_getCaseLocale(csm
->locale
, &locCache
) == UCASE_LOC_DUTCH
&&
387 (src
[titleStart
] == 0x0049 || src
[titleStart
] == 0x0069) &&
388 (src
[titleStart
+1] == 0x004A || src
[titleStart
+1] == 0x006A)) {
389 destIndex
=appendUChar(dest
, destIndex
, destCapacity
, 0x004A);
392 /* lowercase [titleLimit..index[ */
394 if((csm
->options
&U_TITLECASE_NO_LOWERCASE
)==0) {
395 /* Normal operation: Lowercase the rest of the word. */
/* Lowercasing writes into the space remaining after destIndex; the callee name is on a line not visible here. */
398 csm
, ucase_toFullLower
,
399 dest
+destIndex
, destCapacity
-destIndex
,
403 if(U_FAILURE(*pErrorCode
)) {
407 /* Optionally just copy the rest of the word unchanged. */
408 length
=idx
-titleLimit
;
/* Guard the index arithmetic before forming destIndex+length. */
409 if(length
>(INT32_MAX
-destIndex
)) {
410 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
413 if((destIndex
+length
)<=destCapacity
) {
414 uprv_memcpy(dest
+destIndex
, src
+titleLimit
, length
);
/* destIndex past the capacity means the output did not fit. */
425 if(destIndex
>destCapacity
) {
426 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/*
 * Lowercasing mapper in the signature shape that ucasemap_mapUTF8() calls.
 *
 * Starts from an empty UCaseContext and hands ucase_toFullLower plus the
 * whole range [0..srcLength[ to a shared routine.
 *
 * NOTE(review): the callee name and the remaining arguments are on lines not
 * visible here; the argument order matches _caseMap() -- confirm.
 */
433 static int32_t U_CALLCONV
434 ucasemap_internalUTF8ToLower(const UCaseMap
*csm
,
435 uint8_t *dest
, int32_t destCapacity
,
436 const uint8_t *src
, int32_t srcLength
,
437 UErrorCode
*pErrorCode
) {
438 UCaseContext csc
=UCASECONTEXT_INITIALIZER
;
442 csm
, ucase_toFullLower
,
444 src
, &csc
, 0, srcLength
,
/*
 * Uppercasing mapper in the signature shape that ucasemap_mapUTF8() calls.
 *
 * Starts from an empty UCaseContext and hands ucase_toFullUpper plus the
 * whole range [0..srcLength[ to a shared routine.
 *
 * NOTE(review): the callee name and the remaining arguments are on lines not
 * visible here; the argument order matches _caseMap() -- confirm.
 */
448 static int32_t U_CALLCONV
449 ucasemap_internalUTF8ToUpper(const UCaseMap
*csm
,
450 uint8_t *dest
, int32_t destCapacity
,
451 const uint8_t *src
, int32_t srcLength
,
452 UErrorCode
*pErrorCode
) {
453 UCaseContext csc
=UCASECONTEXT_INITIALIZER
;
457 csm
, ucase_toFullUpper
,
459 src
, &csc
, 0, srcLength
,
/*
 * Case-folds UTF-8 text code point by code point with ucase_toFullFolding().
 *
 * csp           case properties passed to ucase_toFullFolding()
 * dest          output buffer
 * destCapacity  size of dest in bytes
 * src           input text
 * srcLength     length of src in bytes
 * pErrorCode    set to U_INDEX_OUTOFBOUNDS_ERROR or U_BUFFER_OVERFLOW_ERROR
 *               on the visible error paths
 *
 * NOTE(review): the parameter line declaring `options`, the condition
 * guarding the byte-copy loop and the return statements are not visible in
 * this view.
 */
464 utf8_foldCase(const UCaseProps
*csp
,
465 uint8_t *dest
, int32_t destCapacity
,
466 const uint8_t *src
, int32_t srcLength
,
468 UErrorCode
*pErrorCode
) {
469 int32_t srcIndex
, destIndex
;
475 /* case mapping loop */
476 srcIndex
=destIndex
=0;
477 while(srcIndex
<srcLength
) {
479 U8_NEXT(src
, srcIndex
, srcLength
, c
);
/* Copy the bytes [start..srcIndex[ unchanged, as far as dest has room. */
481 while(destIndex
<destCapacity
&& start
<srcIndex
) {
482 dest
[destIndex
++]=src
[start
++];
486 c
=ucase_toFullFolding(csp
, c
, &s
, options
);
/*
 * A negative result carries the code point as its bitwise complement; a
 * result above UCASE_MAX_STRING_LENGTH is taken as a code point directly.
 * Either way, a value up to 0x7f is stored as a single byte.
 */
487 if((destIndex
<destCapacity
) && (c
<0 ? (c2
=~c
)<=0x7f : UCASE_MAX_STRING_LENGTH
<c
&& (c2
=c
)<=0x7f)) {
488 /* fast path version of appendResult() for ASCII results */
489 dest
[destIndex
++]=(uint8_t)c2
;
491 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
493 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
/* destIndex past the capacity means the output did not fit. */
499 if(destIndex
>destCapacity
) {
500 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/*
 * Case-folding mapper in the signature shape that ucasemap_mapUTF8() calls.
 *
 * Unpacks csm->csp and csm->options and forwards everything else unchanged
 * to utf8_foldCase(), returning its result.
 */
505 static int32_t U_CALLCONV
506 ucasemap_internalUTF8Fold(const UCaseMap
*csm
,
507 uint8_t *dest
, int32_t destCapacity
,
508 const uint8_t *src
, int32_t srcLength
,
509 UErrorCode
*pErrorCode
) {
510 return utf8_foldCase(csm
->csp
, dest
, destCapacity
, src
, srcLength
, csm
->options
, pErrorCode
);
/*
 * Common driver behind the public UTF-8 case-mapping functions: validates
 * the arguments, runs the supplied mapper and terminates the output.
 *
 * csm               case map passed through to the mapper
 * dest              output buffer; may be NULL only when destCapacity is 0
 * destCapacity      size of dest in bytes; must not be negative
 * src               input text
 * srcLength         length of src; recomputed with uprv_strlen() on one path
 *                   (the condition selecting that path is not visible here)
 * stringCaseMapper  function that performs the actual mapping
 * pErrorCode        set to U_ILLEGAL_ARGUMENT_ERROR for bad or overlapping
 *                   arguments
 *
 * Returns the value of u_terminateChars() applied to the mapper's length.
 */
514 ucasemap_mapUTF8(const UCaseMap
*csm
,
515 uint8_t *dest
, int32_t destCapacity
,
516 const uint8_t *src
, int32_t srcLength
,
517 UTF8CaseMapper
*stringCaseMapper
,
518 UErrorCode
*pErrorCode
) {
521 /* check argument values */
522 if(U_FAILURE(*pErrorCode
)) {
525 if( destCapacity
<0 ||
526 (dest
==NULL
&& destCapacity
>0) ||
530 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
534 /* get the string length */
536 srcLength
=(int32_t)uprv_strlen((const char *)src
);
539 /* check for overlapping source and destination */
/* Overlap: either buffer's start lies inside the other buffer's range. */
541 ((src
>=dest
&& src
<(dest
+destCapacity
)) ||
542 (dest
>=src
&& dest
<(src
+srcLength
)))
544 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
548 destLength
=stringCaseMapper(csm
, dest
, destCapacity
, src
, srcLength
, pErrorCode
);
549 return u_terminateChars((char *)dest
, destCapacity
, destLength
, pErrorCode
);
552 /* public API functions */
/*
 * Public API: lowercases UTF-8 text.
 *
 * Casts the char buffers to uint8_t pointers and delegates to
 * ucasemap_mapUTF8() with ucasemap_internalUTF8ToLower as the mapper; the
 * return value and error reporting are those of ucasemap_mapUTF8().
 */
554 U_CAPI
int32_t U_EXPORT2
555 ucasemap_utf8ToLower(const UCaseMap
*csm
,
556 char *dest
, int32_t destCapacity
,
557 const char *src
, int32_t srcLength
,
558 UErrorCode
*pErrorCode
) {
559 return ucasemap_mapUTF8(csm
,
560 (uint8_t *)dest
, destCapacity
,
561 (const uint8_t *)src
, srcLength
,
562 ucasemap_internalUTF8ToLower
, pErrorCode
);
/*
 * Public API: uppercases UTF-8 text.
 *
 * Casts the char buffers to uint8_t pointers and delegates to
 * ucasemap_mapUTF8() with ucasemap_internalUTF8ToUpper as the mapper; the
 * return value and error reporting are those of ucasemap_mapUTF8().
 */
565 U_CAPI
int32_t U_EXPORT2
566 ucasemap_utf8ToUpper(const UCaseMap
*csm
,
567 char *dest
, int32_t destCapacity
,
568 const char *src
, int32_t srcLength
,
569 UErrorCode
*pErrorCode
) {
570 return ucasemap_mapUTF8(csm
,
571 (uint8_t *)dest
, destCapacity
,
572 (const uint8_t *)src
, srcLength
,
573 ucasemap_internalUTF8ToUpper
, pErrorCode
);
/*
 * Public API: case-folds UTF-8 text.
 *
 * Casts the char buffers to uint8_t pointers and delegates to
 * ucasemap_mapUTF8() with ucasemap_internalUTF8Fold as the mapper; the
 * return value and error reporting are those of ucasemap_mapUTF8().
 */
576 U_CAPI
int32_t U_EXPORT2
577 ucasemap_utf8FoldCase(const UCaseMap
*csm
,
578 char *dest
, int32_t destCapacity
,
579 const char *src
, int32_t srcLength
,
580 UErrorCode
*pErrorCode
) {
581 return ucasemap_mapUTF8(csm
,
582 (uint8_t *)dest
, destCapacity
,
583 (const uint8_t *)src
, srcLength
,
584 ucasemap_internalUTF8Fold
, pErrorCode
);