X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/73c04bcfe1096173b00431f0cdc742894b15eef0..b25be06635768807f8f693286fa73bb2297bb06c:/icuSources/common/ucasemap.c diff --git a/icuSources/common/ucasemap.c b/icuSources/common/ucasemap.c index 02f94762..9f94235a 100644 --- a/icuSources/common/ucasemap.c +++ b/icuSources/common/ucasemap.c @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2005, International Business Machines +* Copyright (C) 2005-2010, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -20,6 +20,10 @@ #include "unicode/uloc.h" #include "unicode/ustring.h" #include "unicode/ucasemap.h" +#if !UCONFIG_NO_BREAK_ITERATION +#include "unicode/ubrk.h" +#include "unicode/utext.h" +#endif #include "cmemory.h" #include "cstring.h" #include "ucase.h" @@ -27,14 +31,7 @@ /* UCaseMap service object -------------------------------------------------- */ -struct UCaseMap { - const UCaseProps *csp; - char locale[32]; - int32_t locCache; - uint32_t options; -}; - -U_DRAFT UCaseMap * U_EXPORT2 +U_CAPI UCaseMap * U_EXPORT2 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) { UCaseMap *csm; @@ -48,7 +45,7 @@ ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) { } uprv_memset(csm, 0, sizeof(UCaseMap)); - csm->csp=ucase_getSingleton(pErrorCode); + csm->csp=ucase_getSingleton(); ucasemap_setLocale(csm, locale, pErrorCode); if(U_FAILURE(*pErrorCode)) { uprv_free(csm); @@ -59,24 +56,27 @@ ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) { return csm; } -U_DRAFT void U_EXPORT2 +U_CAPI void U_EXPORT2 ucasemap_close(UCaseMap *csm) { if(csm!=NULL) { +#if !UCONFIG_NO_BREAK_ITERATION + ubrk_close(csm->iter); +#endif uprv_free(csm); } } -U_DRAFT const char * U_EXPORT2 +U_CAPI const char * U_EXPORT2 ucasemap_getLocale(const UCaseMap *csm) { return csm->locale; } -U_DRAFT uint32_t U_EXPORT2 +U_CAPI uint32_t U_EXPORT2 ucasemap_getOptions(const UCaseMap *csm) { return csm->options; } -U_DRAFT void U_EXPORT2 +U_CAPI void U_EXPORT2 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) { int32_t length; @@ -101,13 +101,30 @@ ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) { } } -U_DRAFT void U_EXPORT2 +U_CAPI void U_EXPORT2 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) { csm->options=options; } +#if !UCONFIG_NO_BREAK_ITERATION + +U_CAPI const UBreakIterator * U_EXPORT2 +ucasemap_getBreakIterator(const UCaseMap *csm) { + return csm->iter; +} + +U_CAPI void U_EXPORT2 +ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode *pErrorCode) { + ubrk_close(csm->iter); + csm->iter=iterToAdopt; +} + +#endif + /* UTF-8 string case mappings ----------------------------------------------- */ +/* TODO(markus): Move to a new, separate utf8case.c file. */ + /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */ static U_INLINE int32_t appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity, @@ -146,7 +163,7 @@ appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity, (char *)(dest+destIndex), destCapacity-destIndex, &destLength, s, length, &errorCode); - destIndex+=length; + destIndex+=destLength; /* we might have an overflow, but we know the actual length */ } } else { @@ -159,7 +176,7 @@ appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity, NULL, 0, &destLength, s, length, &errorCode); - destIndex+=length; + destIndex+=destLength; } } return destIndex; @@ -197,12 +214,6 @@ utf8_caseContextIterator(void *context, int8_t dir) { return U_SENTINEL; } -typedef int32_t U_CALLCONV -UCaseMapFull(const UCaseProps *csp, UChar32 c, - UCaseContextIterator *iter, void *context, - const UChar **pString, - const char *locale, int32_t *locCache); - /* * Case-maps [srcStart..srcLimit[ but takes * context [0..srcLength[ into account. @@ -214,7 +225,7 @@ _caseMap(const UCaseMap *csm, UCaseMapFull *map, int32_t srcStart, int32_t srcLimit, UErrorCode *pErrorCode) { const UChar *s; - UChar32 c; + UChar32 c, c2 = 0; int32_t srcIndex, destIndex; int32_t locCache; @@ -227,8 +238,202 @@ _caseMap(const UCaseMap *csm, UCaseMapFull *map, csc->cpStart=srcIndex; U8_NEXT(src, srcIndex, srcLimit, c); csc->cpLimit=srcIndex; + if(c<0) { + int32_t i=csc->cpStart; + while(destIndexcsp, c, utf8_caseContextIterator, csc, &s, csm->locale, &locCache); - destIndex=appendResult(dest, destIndex, destCapacity, c, s); + if((destIndexdestCapacity) { + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + return destIndex; +} + +#if !UCONFIG_NO_BREAK_ITERATION + +/* + * Internal titlecasing function. + */ +static int32_t +_toTitle(UCaseMap *csm, + uint8_t *dest, int32_t destCapacity, + const uint8_t *src, UCaseContext *csc, + int32_t srcLength, + UErrorCode *pErrorCode) { + UText utext=UTEXT_INITIALIZER; + const UChar *s; + UChar32 c; + int32_t prev, titleStart, titleLimit, idx, destIndex, length; + UBool isFirstIndex; + + utext_openUTF8(&utext, (const char *)src, srcLength, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return 0; + } + if(csm->iter==NULL) { + csm->iter=ubrk_open(UBRK_WORD, csm->locale, + NULL, 0, + pErrorCode); + } + ubrk_setUText(csm->iter, &utext, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + utext_close(&utext); + return 0; + } + + /* set up local variables */ + destIndex=0; + prev=0; + isFirstIndex=TRUE; + + /* titlecasing loop */ + while(previter); + } else { + idx=ubrk_next(csm->iter); + } + if(idx==UBRK_DONE || idx>srcLength) { + idx=srcLength; + } + + /* + * Unicode 4 & 5 section 3.13 Default Case Operations: + * + * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex + * #29, "Text Boundaries." Between each pair of word boundaries, find the first + * cased character F. If F exists, map F to default_title(F); then map each + * subsequent character C to default_lower(C). + * + * In this implementation, segment [prev..index[ into 3 parts: + * a) uncased characters (copy as-is) [prev..titleStart[ + * b) first case letter (titlecase) [titleStart..titleLimit[ + * c) subsequent characters (lowercase) [titleLimit..index[ + */ + if(prevoptions&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) { + /* Adjust the titlecasing index (titleStart) to the next cased character. */ + for(;;) { + titleStart=titleLimit; + if(titleLimit==idx) { + /* + * only uncased characters in [prev..index[ + * stop with titleStart==titleLimit==index + */ + break; + } + U8_NEXT(src, titleLimit, idx, c); + if(UCASE_NONE!=ucase_getType(csm->csp, c)) { + break; /* cased letter at [titleStart..titleLimit[ */ + } + } + length=titleStart-prev; + if(length>0) { + if((destIndex+length)<=destCapacity) { + uprv_memcpy(dest+destIndex, src+prev, length); + } + destIndex+=length; + } + } + + if(titleStartcpStart=titleStart; + csc->cpLimit=titleLimit; + c=ucase_toFullTitle(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &csm->locCache); + destIndex=appendResult(dest, destIndex, destCapacity, c, s); + + + /* Special case Dutch IJ titlecasing */ + if ( titleStart+1 < idx && + ucase_getCaseLocale(csm->locale,&csm->locCache) == UCASE_LOC_DUTCH && + ( src[titleStart] == 0x0049 || src[titleStart] == 0x0069 ) && + ( src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A )) { + c=0x004A; + destIndex=appendResult(dest, destIndex, destCapacity, c, s); + titleLimit++; + } + /* lowercase [titleLimit..index[ */ + if(titleLimitoptions&U_TITLECASE_NO_LOWERCASE)==0) { + /* Normal operation: Lowercase the rest of the word. */ + destIndex+= + _caseMap( + csm, ucase_toFullLower, + dest+destIndex, destCapacity-destIndex, + src, csc, + titleLimit, idx, + pErrorCode); + } else { + /* Optionally just copy the rest of the word unchanged. */ + length=idx-titleLimit; + if((destIndex+length)<=destCapacity) { + uprv_memcpy(dest+destIndex, src+titleLimit, length); + } + destIndex+=length; + } + } + } + } + + prev=idx; + } + + if(destIndex>destCapacity) { + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + utext_close(&utext); + return destIndex; +} + +#endif + +static int32_t +utf8_foldCase(const UCaseProps *csp, + uint8_t *dest, int32_t destCapacity, + const uint8_t *src, int32_t srcLength, + uint32_t options, + UErrorCode *pErrorCode) { + int32_t srcIndex, destIndex; + + const UChar *s; + UChar32 c, c2; + int32_t start; + + /* case mapping loop */ + srcIndex=destIndex=0; + while(srcIndexdestCapacity) { @@ -241,12 +446,6 @@ _caseMap(const UCaseMap *csm, UCaseMapFull *map, * Implement argument checking and buffer handling * for string case mapping as a common function. */ -enum { - TO_LOWER, - TO_UPPER, - TO_TITLE, - FOLD_CASE -}; /* common internal function for public API functions */ @@ -256,7 +455,6 @@ caseMap(const UCaseMap *csm, const uint8_t *src, int32_t srcLength, int32_t toWhichCase, UErrorCode *pErrorCode) { - UCaseContext csc={ NULL }; int32_t destLength; /* check argument values */ @@ -274,7 +472,7 @@ caseMap(const UCaseMap *csm, /* get the string length */ if(srcLength==-1) { - srcLength=uprv_strlen((const char *)src); + srcLength=(int32_t)uprv_strlen((const char *)src); } /* check for overlapping source and destination */ @@ -288,21 +486,38 @@ caseMap(const UCaseMap *csm, destLength=0; - csc.p=(void *)src; - csc.limit=srcLength; - - if(toWhichCase==TO_LOWER) { - destLength=_caseMap(csm, ucase_toFullLower, - dest, destCapacity, - src, &csc, - 0, srcLength, - pErrorCode); - } else /* if(toWhichCase==TO_UPPER) */ { - destLength=_caseMap(csm, ucase_toFullUpper, - dest, destCapacity, - src, &csc, - 0, srcLength, - pErrorCode); + if(toWhichCase==FOLD_CASE) { + destLength=utf8_foldCase(csm->csp, dest, destCapacity, src, srcLength, + csm->options, pErrorCode); + } else { + UCaseContext csc={ NULL }; + + csc.p=(void *)src; + csc.limit=srcLength; + + if(toWhichCase==TO_LOWER) { + destLength=_caseMap(csm, ucase_toFullLower, + dest, destCapacity, + src, &csc, + 0, srcLength, + pErrorCode); + } else if(toWhichCase==TO_UPPER) { + destLength=_caseMap(csm, ucase_toFullUpper, + dest, destCapacity, + src, &csc, + 0, srcLength, + pErrorCode); + } else /* if(toWhichCase==TO_TITLE) */ { +#if UCONFIG_NO_BREAK_ITERATION + *pErrorCode=U_UNSUPPORTED_ERROR; +#else + /* UCaseMap is actually non-const in toTitle() APIs. */ + UCaseMap *tmp = (UCaseMap *)csm; + destLength=_toTitle(tmp, dest, destCapacity, + src, &csc, srcLength, + pErrorCode); +#endif + } } return u_terminateChars((char *)dest, destCapacity, destLength, pErrorCode); @@ -310,7 +525,7 @@ caseMap(const UCaseMap *csm, /* public API functions */ -U_DRAFT int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 ucasemap_utf8ToLower(const UCaseMap *csm, char *dest, int32_t destCapacity, const char *src, int32_t srcLength, @@ -321,7 +536,7 @@ ucasemap_utf8ToLower(const UCaseMap *csm, TO_LOWER, pErrorCode); } -U_DRAFT int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 ucasemap_utf8ToUpper(const UCaseMap *csm, char *dest, int32_t destCapacity, const char *src, int32_t srcLength, @@ -331,3 +546,29 @@ ucasemap_utf8ToUpper(const UCaseMap *csm, (const uint8_t *)src, srcLength, TO_UPPER, pErrorCode); } + +#if !UCONFIG_NO_BREAK_ITERATION + +U_CAPI int32_t U_EXPORT2 +ucasemap_utf8ToTitle(UCaseMap *csm, + char *dest, int32_t destCapacity, + const char *src, int32_t srcLength, + UErrorCode *pErrorCode) { + return caseMap(csm, + (uint8_t *)dest, destCapacity, + (const uint8_t *)src, srcLength, + TO_TITLE, pErrorCode); +} + +#endif + +U_CAPI int32_t U_EXPORT2 +ucasemap_utf8FoldCase(const UCaseMap *csm, + char *dest, int32_t destCapacity, + const char *src, int32_t srcLength, + UErrorCode *pErrorCode) { + return caseMap(csm, + (uint8_t *)dest, destCapacity, + (const uint8_t *)src, srcLength, + FOLD_CASE, pErrorCode); +}