]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/common/ustrcase.c
ICU-461.18.tar.gz
[apple/icu.git] / icuSources / common / ustrcase.c
index d7531f2d3c4ee09f79e5b6701df1532361f4e7ec..4b62fb95c3a14cb4f98ab365a54548d42f8f76e0 100644 (file)
@@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 2001-2004, International Business Machines
+*   Copyright (C) 2001-2010, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
 #include "unicode/utypes.h"
 #include "unicode/uloc.h"
 #include "unicode/ustring.h"
+#include "unicode/ucasemap.h"
 #include "unicode/ubrk.h"
 #include "cmemory.h"
 #include "ucase.h"
-#include "unormimp.h"
 #include "ustr_imp.h"
 
 /* string casing ------------------------------------------------------------ */
@@ -114,26 +114,22 @@ utf16_caseContextIterator(void *context, int8_t dir) {
     return U_SENTINEL;
 }
 
-typedef int32_t U_CALLCONV
-UCaseMapFull(const UCaseProps *csp, UChar32 c,
-             UCaseContextIterator *iter, void *context,
-             const UChar **pString,
-             const char *locale, int32_t *locCache);
-
 /*
- * Lowercases [srcStart..srcLimit[ but takes
+ * Case-maps [srcStart..srcLimit[ but takes
  * context [0..srcLength[ into account.
  */
 static int32_t
-_caseMap(UCaseProps *csp, UCaseMapFull *map,
+_caseMap(const UCaseMap *csm, UCaseMapFull *map,
          UChar *dest, int32_t destCapacity,
          const UChar *src, UCaseContext *csc,
          int32_t srcStart, int32_t srcLimit,
-         const char *locale, int32_t *locCache,
          UErrorCode *pErrorCode) {
     const UChar *s;
-    UChar32 c;
+    UChar32 c, c2 = 0;
     int32_t srcIndex, destIndex;
+    int32_t locCache;
+
+    locCache=csm->locCache;
 
     /* case mapping loop */
     srcIndex=srcStart;
@@ -142,8 +138,13 @@ _caseMap(UCaseProps *csp, UCaseMapFull *map,
         csc->cpStart=srcIndex;
         U16_NEXT(src, srcIndex, srcLimit, c);
         csc->cpLimit=srcIndex;
-        c=map(csp, c, utf16_caseContextIterator, csc, &s, locale, locCache);
-        destIndex=appendResult(dest, destIndex, destCapacity, c, s);
+        c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache);
+        if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
+            /* fast path version of appendResult() for BMP results */
+            dest[destIndex++]=(UChar)c2;
+        } else {
+            destIndex=appendResult(dest, destIndex, destCapacity, c, s);
+        }
     }
 
     if(destIndex>destCapacity) {
@@ -152,26 +153,80 @@ _caseMap(UCaseProps *csp, UCaseMapFull *map,
     return destIndex;
 }
 
+static void
+setTempCaseMapLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
+    /*
+     * We could call ucasemap_setLocale(), but here we really only care about
+     * the initial language subtag, we need not return the real string via
+     * ucasemap_getLocale(), and we don't care about only getting "x" from
+     * "x-some-thing" etc.
+     *
+     * We ignore locales with a longer-than-3 initial subtag.
+     *
+     * We also do not fill in the locCache because it is rarely used,
+     * and not worth setting unless we reuse it for many case mapping operations.
+     * (That's why UCaseMap was created.)
+     */
+    int i;
+    char c;
+
+    /* the internal functions require locale!=NULL */
+    if(locale==NULL) {
+        locale=uloc_getDefault();
+    }
+    for(i=0; i<4 && (c=locale[i])!=0 && c!='-' && c!='_'; ++i) {
+        csm->locale[i]=c;
+    }
+    if(i<=3) {
+        csm->locale[i]=0;  /* Up to 3 non-separator characters. */
+    } else {
+        csm->locale[0]=0;  /* Longer-than-3 initial subtag: Ignore. */
+    }
+}
+
+/*
+ * Set parameters on an empty UCaseMap, for UCaseMap-less API functions.
+ * Do this fast because it is called with every function call.
+ */
+static U_INLINE void
+setTempCaseMap(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
+    if(csm->csp==NULL) {
+        csm->csp=ucase_getSingleton();
+    }
+    if(locale!=NULL && locale[0]==0) {
+        csm->locale[0]=0;
+    } else {
+        setTempCaseMapLocale(csm, locale, pErrorCode);
+    }
+}
+
 #if !UCONFIG_NO_BREAK_ITERATION
 
 /*
  * Internal titlecasing function.
- *
- * Must get titleIter!=NULL.
  */
 static int32_t
-_toTitle(UCaseProps *csp,
+_toTitle(UCaseMap *csm,
          UChar *dest, int32_t destCapacity,
          const UChar *src, UCaseContext *csc,
          int32_t srcLength,
-         UBreakIterator *titleIter,
-         const char *locale, int32_t *locCache,
          UErrorCode *pErrorCode) {
     const UChar *s;
     UChar32 c;
-    int32_t prev, index, destIndex;
+    int32_t prev, titleStart, titleLimit, idx, destIndex, length;
     UBool isFirstIndex;
 
+    if(csm->iter!=NULL) {
+        ubrk_setText(csm->iter, src, srcLength, pErrorCode);
+    } else {
+        csm->iter=ubrk_open(UBRK_WORD, csm->locale,
+                            src, srcLength,
+                            pErrorCode);
+    }
+    if(U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+
     /* set up local variables */
     destIndex=0;
     prev=0;
@@ -182,38 +237,97 @@ _toTitle(UCaseProps *csp,
         /* find next index where to titlecase */
         if(isFirstIndex) {
             isFirstIndex=FALSE;
-            index=ubrk_first(titleIter);
+            idx=ubrk_first(csm->iter);
         } else {
-            index=ubrk_next(titleIter);
+            idx=ubrk_next(csm->iter);
         }
-        if(index==UBRK_DONE || index>srcLength) {
-            index=srcLength;
+        if(idx==UBRK_DONE || idx>srcLength) {
+            idx=srcLength;
         }
 
-        /* lowercase [prev..index[ */
-        if(prev<index) {
-            destIndex+=
-                _caseMap(
-                    csp, ucase_toFullLower,
-                    dest+destIndex, destCapacity-destIndex,
-                    src, csc,
-                    prev, index,
-                    locale, locCache,
-                    pErrorCode);
-        }
+        /*
+         * Unicode 4 & 5 section 3.13 Default Case Operations:
+         *
+         * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
+         * #29, "Text Boundaries." Between each pair of word boundaries, find the first
+         * cased character F. If F exists, map F to default_title(F); then map each
+         * subsequent character C to default_lower(C).
+         *
+         * In this implementation, segment [prev..idx[ into 3 parts:
+         * a) uncased characters (copy as-is) [prev..titleStart[
+         * b) first cased letter (titlecase)        [titleStart..titleLimit[
+         * c) subsequent characters (lowercase)                 [titleLimit..idx[
+         */
+        if(prev<idx) {
+            /* find and copy uncased characters [prev..titleStart[ */
+            titleStart=titleLimit=prev;
+            U16_NEXT(src, titleLimit, idx, c);
+            if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
+                /* Adjust the titlecasing index (titleStart) to the next cased character. */
+                for(;;) {
+                    titleStart=titleLimit;
+                    if(titleLimit==idx) {
+                        /*
+                         * only uncased characters in [prev..idx[
+                         * stop with titleStart==titleLimit==idx
+                         */
+                        break;
+                    }
+                    U16_NEXT(src, titleLimit, idx, c);
+                    if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
+                        break; /* cased letter at [titleStart..titleLimit[ */
+                    }
+                }
+                length=titleStart-prev;
+                if(length>0) {
+                    if((destIndex+length)<=destCapacity) {
+                        uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR);
+                    }
+                    destIndex+=length;
+                }
+            }
 
-        if(index>=srcLength) {
-            break;
-        }
+            if(titleStart<titleLimit) {
+                /* titlecase c which is from [titleStart..titleLimit[ */
+                csc->cpStart=titleStart;
+                csc->cpLimit=titleLimit;
+                c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &csm->locCache);
+                destIndex=appendResult(dest, destIndex, destCapacity, c, s); 
+
+                /* Special case Dutch IJ titlecasing */
+                if ( titleStart+1 < idx && 
+                     ucase_getCaseLocale(csm->locale,&csm->locCache) == UCASE_LOC_DUTCH &&
+                     ( src[titleStart] == (UChar32) 0x0049 || src[titleStart] == (UChar32) 0x0069 ) &&
+                     ( src[titleStart+1] == (UChar32) 0x004A || src[titleStart+1] == (UChar32) 0x006A )) { 
+                            c=(UChar32) 0x004A;
+                            destIndex=appendResult(dest, destIndex, destCapacity, c, s);
+                            titleLimit++;
+                }
 
-        /* titlecase the character at the found index */
-        csc->cpStart=index;
-        U16_NEXT(src, index, srcLength, c);
-        csc->cpLimit=index;
-        c=ucase_toFullTitle(csp, c, utf16_caseContextIterator, csc, &s, locale, locCache);
-        destIndex=appendResult(dest, destIndex, destCapacity, c, s);
+                /* lowercase [titleLimit..idx[ */
+                if(titleLimit<idx) {
+                    if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
+                        /* Normal operation: Lowercase the rest of the word. */
+                        destIndex+=
+                            _caseMap(
+                                csm, ucase_toFullLower,
+                                dest+destIndex, destCapacity-destIndex,
+                                src, csc,
+                                titleLimit, idx,
+                                pErrorCode);
+                    } else {
+                        /* Optionally just copy the rest of the word unchanged. */
+                        length=idx-titleLimit;
+                        if((destIndex+length)<=destCapacity) {
+                            uprv_memcpy(dest+destIndex, src+titleLimit, length*U_SIZEOF_UCHAR);
+                        }
+                        destIndex+=length;
+                    }
+                }
+            }
+        }
 
-        prev=index;
+        prev=idx;
     }
 
     if(destIndex>destCapacity) {
@@ -222,70 +336,84 @@ _toTitle(UCaseProps *csp,
     return destIndex;
 }
 
+#endif
+
+/* functions available in the common library (for unistr_case.cpp) */
+
 U_CFUNC int32_t
-ustr_toTitle(UCaseProps *csp,
+ustr_toLower(const UCaseProps *csp,
              UChar *dest, int32_t destCapacity,
              const UChar *src, int32_t srcLength,
-             UBreakIterator *titleIter,
              const char *locale,
              UErrorCode *pErrorCode) {
+    UCaseMap csm={ NULL };
     UCaseContext csc={ NULL };
-    int32_t locCache;
 
+    csm.csp=csp;
+    setTempCaseMap(&csm, locale, pErrorCode);
     csc.p=(void *)src;
     csc.limit=srcLength;
-    locCache=0;
 
-    return _toTitle(csp,
+    return _caseMap(&csm, ucase_toFullLower,
                     dest, destCapacity,
-                    src, &csc, srcLength,
-                    titleIter, locale, &locCache, pErrorCode);
+                    src, &csc, 0, srcLength,
+                    pErrorCode);
 }
 
-#endif
-
-/* functions available in the common library (for unistr_case.cpp) */
-
 U_CFUNC int32_t
-ustr_toLower(UCaseProps *csp,
+ustr_toUpper(const UCaseProps *csp,
              UChar *dest, int32_t destCapacity,
              const UChar *src, int32_t srcLength,
              const char *locale,
              UErrorCode *pErrorCode) {
+    UCaseMap csm={ NULL };
     UCaseContext csc={ NULL };
-    int32_t locCache;
 
+    csm.csp=csp;
+    setTempCaseMap(&csm, locale, pErrorCode);
     csc.p=(void *)src;
     csc.limit=srcLength;
-    locCache=0;
 
-    return _caseMap(csp, ucase_toFullLower,
+    return _caseMap(&csm, ucase_toFullUpper,
                     dest, destCapacity,
                     src, &csc, 0, srcLength,
-                    locale, &locCache, pErrorCode);
+                    pErrorCode);
 }
 
+#if !UCONFIG_NO_BREAK_ITERATION
+
 U_CFUNC int32_t
-ustr_toUpper(UCaseProps *csp,
+ustr_toTitle(const UCaseProps *csp,
              UChar *dest, int32_t destCapacity,
              const UChar *src, int32_t srcLength,
-             const char *locale,
+             UBreakIterator *titleIter,
+             const char *locale, uint32_t options,
              UErrorCode *pErrorCode) {
+    UCaseMap csm={ NULL };
     UCaseContext csc={ NULL };
-    int32_t locCache;
+    int32_t length;
 
+    csm.csp=csp;
+    csm.iter=titleIter;
+    csm.options=options;
+    setTempCaseMap(&csm, locale, pErrorCode);
     csc.p=(void *)src;
     csc.limit=srcLength;
-    locCache=0;
 
-    return _caseMap(csp, ucase_toFullUpper,
+    length=_toTitle(&csm,
                     dest, destCapacity,
-                    src, &csc, 0, srcLength,
-                    locale, &locCache, pErrorCode);
+                    src, &csc, srcLength,
+                    pErrorCode);
+    if(titleIter==NULL && csm.iter!=NULL) {
+        ubrk_close(csm.iter);
+    }
+    return length;
 }
 
+#endif
+
 U_CFUNC int32_t
-ustr_foldCase(UCaseProps *csp,
+ustr_foldCase(const UCaseProps *csp,
               UChar *dest, int32_t destCapacity,
               const UChar *src, int32_t srcLength,
               uint32_t options,
@@ -293,14 +421,19 @@ ustr_foldCase(UCaseProps *csp,
     int32_t srcIndex, destIndex;
 
     const UChar *s;
-    UChar32 c;
+    UChar32 c, c2 = 0;
 
     /* case mapping loop */
     srcIndex=destIndex=0;
     while(srcIndex<srcLength) {
         U16_NEXT(src, srcIndex, srcLength, c);
         c=ucase_toFullFolding(csp, c, &s, options);
-        destIndex=appendResult(dest, destIndex, destCapacity, c, s);
+        if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
+            /* fast path version of appendResult() for BMP results */
+            dest[destIndex++]=(UChar)c2;
+        } else {
+            destIndex=appendResult(dest, destIndex, destCapacity, c, s);
+        }
     }
 
     if(destIndex>destCapacity) {
@@ -313,30 +446,19 @@ ustr_foldCase(UCaseProps *csp,
  * Implement argument checking and buffer handling
  * for string case mapping as a common function.
  */
-enum {
-    TO_LOWER,
-    TO_UPPER,
-    TO_TITLE,
-    FOLD_CASE
-};
 
 /* common internal function for public API functions */
 
 static int32_t
-caseMap(UChar *dest, int32_t destCapacity,
+caseMap(const UCaseMap *csm,
+        UChar *dest, int32_t destCapacity,
         const UChar *src, int32_t srcLength,
-        UBreakIterator *titleIter,
-        const char *locale,
-        uint32_t options,
         int32_t toWhichCase,
         UErrorCode *pErrorCode) {
     UChar buffer[300];
     UChar *temp;
 
-    UCaseProps *csp;
-
     int32_t destLength;
-    UBool ownTitleIter;
 
     /* check argument values */
     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
@@ -351,11 +473,6 @@ caseMap(UChar *dest, int32_t destCapacity,
         return 0;
     }
 
-    csp=ucase_getSingleton(pErrorCode);
-    if(U_FAILURE(*pErrorCode)) {
-        return 0;
-    }
-
     /* get the string length */
     if(srcLength==-1) {
         srcLength=u_strlen(src);
@@ -382,53 +499,38 @@ caseMap(UChar *dest, int32_t destCapacity,
         temp=dest;
     }
 
-    ownTitleIter=FALSE;
     destLength=0;
 
     if(toWhichCase==FOLD_CASE) {
-        destLength=ustr_foldCase(csp, temp, destCapacity, src, srcLength,
-                                 options, pErrorCode);
+        destLength=ustr_foldCase(csm->csp, temp, destCapacity, src, srcLength,
+                                 csm->options, pErrorCode);
     } else {
         UCaseContext csc={ NULL };
-        int32_t locCache;
 
         csc.p=(void *)src;
         csc.limit=srcLength;
-        locCache=0;
-
-        /* the internal functions require locale!=NULL */
-        if(locale==NULL) {
-            locale=uloc_getDefault();
-        }
 
         if(toWhichCase==TO_LOWER) {
-            destLength=_caseMap(csp, ucase_toFullLower,
+            destLength=_caseMap(csm, ucase_toFullLower,
                                 temp, destCapacity,
                                 src, &csc,
                                 0, srcLength,
-                                locale, &locCache, pErrorCode);
+                                pErrorCode);
         } else if(toWhichCase==TO_UPPER) {
-            destLength=_caseMap(csp, ucase_toFullUpper,
+            destLength=_caseMap(csm, ucase_toFullUpper,
                                 temp, destCapacity,
                                 src, &csc,
                                 0, srcLength,
-                                locale, &locCache, pErrorCode);
+                                pErrorCode);
         } else /* if(toWhichCase==TO_TITLE) */ {
-    #if UCONFIG_NO_BREAK_ITERATION
+#if UCONFIG_NO_BREAK_ITERATION
             *pErrorCode=U_UNSUPPORTED_ERROR;
-    #else
-            if(titleIter==NULL) {
-                titleIter=ubrk_open(UBRK_WORD, locale,
-                                    src, srcLength,
-                                    pErrorCode);
-                ownTitleIter=(UBool)U_SUCCESS(*pErrorCode);
-            }
-            if(U_SUCCESS(*pErrorCode)) {
-                destLength=_toTitle(csp, temp, destCapacity,
-                                    src, &csc, srcLength,
-                                    titleIter, locale, &locCache, pErrorCode);
-            }
-    #endif
+#else
+            /* UCaseMap is actually non-const in toTitle() APIs. */
+            destLength=_toTitle((UCaseMap *)csm, temp, destCapacity,
+                                src, &csc, srcLength,
+                                pErrorCode);
+#endif
         }
     }
     if(temp!=dest) {
@@ -444,12 +546,6 @@ caseMap(UChar *dest, int32_t destCapacity,
         }
     }
 
-#if !UCONFIG_NO_BREAK_ITERATION
-    if(ownTitleIter) {
-        ubrk_close(titleIter);
-    }
-#endif
-
     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
 }
 
@@ -460,9 +556,11 @@ u_strToLower(UChar *dest, int32_t destCapacity,
              const UChar *src, int32_t srcLength,
              const char *locale,
              UErrorCode *pErrorCode) {
-    return caseMap(dest, destCapacity,
+    UCaseMap csm={ NULL };
+    setTempCaseMap(&csm, locale, pErrorCode);
+    return caseMap(&csm,
+                   dest, destCapacity,
                    src, srcLength,
-                   NULL, locale, 0,
                    TO_LOWER, pErrorCode);
 }
 
@@ -471,9 +569,11 @@ u_strToUpper(UChar *dest, int32_t destCapacity,
              const UChar *src, int32_t srcLength,
              const char *locale,
              UErrorCode *pErrorCode) {
-    return caseMap(dest, destCapacity,
+    UCaseMap csm={ NULL };
+    setTempCaseMap(&csm, locale, pErrorCode);
+    return caseMap(&csm,
+                   dest, destCapacity,
                    src, srcLength,
-                   NULL, locale, 0,
                    TO_UPPER, pErrorCode);
 }
 
@@ -485,9 +585,29 @@ u_strToTitle(UChar *dest, int32_t destCapacity,
              UBreakIterator *titleIter,
              const char *locale,
              UErrorCode *pErrorCode) {
-    return caseMap(dest, destCapacity,
+    UCaseMap csm={ NULL };
+    int32_t length;
+
+    csm.iter=titleIter;
+    setTempCaseMap(&csm, locale, pErrorCode);
+    length=caseMap(&csm,
+                   dest, destCapacity,
+                   src, srcLength,
+                   TO_TITLE, pErrorCode);
+    if(titleIter==NULL && csm.iter!=NULL) {
+        ubrk_close(csm.iter);
+    }
+    return length;
+}
+
+U_CAPI int32_t U_EXPORT2
+ucasemap_toTitle(UCaseMap *csm,
+                 UChar *dest, int32_t destCapacity,
+                 const UChar *src, int32_t srcLength,
+                 UErrorCode *pErrorCode) {
+    return caseMap(csm,
+                   dest, destCapacity,
                    src, srcLength,
-                   titleIter, locale, 0,
                    TO_TITLE, pErrorCode);
 }
 
@@ -498,9 +618,12 @@ u_strFoldCase(UChar *dest, int32_t destCapacity,
               const UChar *src, int32_t srcLength,
               uint32_t options,
               UErrorCode *pErrorCode) {
-    return caseMap(dest, destCapacity,
+    UCaseMap csm={ NULL };
+    csm.csp=ucase_getSingleton();
+    csm.options=options;
+    return caseMap(&csm,
+                   dest, destCapacity,
                    src, srcLength,
-                   NULL, NULL, options,
                    FOLD_CASE, pErrorCode);
 }
 
@@ -527,7 +650,7 @@ u_strcmpFold(const UChar *s1, int32_t length1,
              const UChar *s2, int32_t length2,
              uint32_t options,
              UErrorCode *pErrorCode) {
-    UCaseProps *csp;
+    const UCaseProps *csp;
 
     /* current-level start/limit - s1/s2 as current */
     const UChar *start1, *start2, *limit1, *limit2;
@@ -554,7 +677,7 @@ u_strcmpFold(const UChar *s1, int32_t length1,
      * assume that at least the option U_COMPARE_IGNORE_CASE is set
      * otherwise this function would have to behave exactly as uprv_strCompare()
      */
-    csp=ucase_getSingleton(pErrorCode);
+    csp=ucase_getSingleton();
     if(U_FAILURE(*pErrorCode)) {
         return 0;
     }