icuSources/common/unistr_case.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 *
   6 *   Copyright (C) 1999-2014, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 *******************************************************************************
  10 *   file name:  unistr_case.cpp
  11 *   encoding:   UTF-8
  12 *   tab size:   8 (not used)
  13 *   indentation:2
  14 *
  15 *   created on: 2004aug19
  16 *   created by: Markus W. Scherer
  17 *
  18 *   Case-mapping functions moved here from unistr.cpp
  19 */
  20
  21 #include "unicode/utypes.h"
  22 #include "unicode/brkiter.h"
  23 #include "unicode/casemap.h"
  24 #include "unicode/edits.h"
  25 #include "unicode/putil.h"
  26 #include "cstring.h"
  27 #include "cmemory.h"
  28 #include "unicode/ustring.h"
  29 #include "unicode/unistr.h"
  30 #include "unicode/uchar.h"
  31 #include "uassert.h"
  32 #include "ucasemap_imp.h"
  33 #include "uelement.h"
  34
  35 U_NAMESPACE_BEGIN
  36
  37 //========================================
  38 // Read-only implementation
  39 //========================================
  40
  41 int8_t
  42 UnicodeString::doCaseCompare(int32_t start,
  43                              int32_t length,
  44                              const UChar *srcChars,
  45                              int32_t srcStart,
  46                              int32_t srcLength,
  47                              uint32_t options) const
  48 {
  49   // compare illegal string values
  50   // treat const UChar *srcChars==NULL as an empty string
  51   if(isBogus()) {
  52     return -1;
  53   }
  54
  55   // pin indices to legal values
  56   pinIndices(start, length);
  57
  58   if(srcChars == NULL) {
  59     srcStart = srcLength = 0;
  60   }
  61
  62   // get the correct pointer
  63   const UChar *chars = getArrayStart();
  64
  65   chars += start;
  66   if(srcStart!=0) {
  67     srcChars += srcStart;
  68   }
  69
  70   if(chars != srcChars) {
  71     UErrorCode errorCode=U_ZERO_ERROR;
  72     int32_t result=u_strcmpFold(chars, length, srcChars, srcLength,
  73                                 options|U_COMPARE_IGNORE_CASE, &errorCode);
  74     if(result!=0) {
  75       return (int8_t)(result >> 24 | 1);
  76     }
  77   } else {
  78     // get the srcLength if necessary
  79     if(srcLength < 0) {
  80       srcLength = u_strlen(srcChars + srcStart);
  81     }
  82     if(length != srcLength) {
  83       return (int8_t)((length - srcLength) >> 24 | 1);
  84     }
  85   }
  86   return 0;
  87 }
  88
  89 //========================================
  90 // Write implementation
  91 //========================================
  92
  93 UnicodeString &
  94 UnicodeString::caseMap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
  95                        UStringCaseMapper *stringCaseMapper) {
  96   if(isEmpty() || !isWritable()) {
  97     // nothing to do
  98     return *this;
  99   }
 100
 101   UChar oldBuffer[2 * US_STACKBUF_SIZE];
 102   UChar *oldArray;
 103   int32_t oldLength = length();
 104   int32_t newLength;
 105   UBool writable = isBufferWritable();
 106   UErrorCode errorCode = U_ZERO_ERROR;
 107
 108 #if !UCONFIG_NO_BREAK_ITERATION
 109   // Read-only alias to the original string contents for the titlecasing BreakIterator.
 110   // We cannot set the iterator simply to *this because *this is being modified.
 111   UnicodeString oldString;
 112 #endif
 113
 114   // Try to avoid heap-allocating a new character array for this string.
 115   if (writable ? oldLength <= UPRV_LENGTHOF(oldBuffer) : oldLength < US_STACKBUF_SIZE) {
 116     // Short string: Copy the contents into a temporary buffer and
 117     // case-map back into the current array, or into the stack buffer.
 118     UChar *buffer = getArrayStart();
 119     int32_t capacity;
 120     oldArray = oldBuffer;
 121     u_memcpy(oldBuffer, buffer, oldLength);
 122     if (writable) {
 123       capacity = getCapacity();
 124     } else {
 125       // Switch from the read-only alias or shared heap buffer to the stack buffer.
 126       if (!cloneArrayIfNeeded(US_STACKBUF_SIZE, US_STACKBUF_SIZE, /* doCopyArray= */ FALSE)) {
 127         return *this;
 128       }
 129       U_ASSERT(fUnion.fFields.fLengthAndFlags & kUsingStackBuffer);
 130       buffer = fUnion.fStackFields.fBuffer;
 131       capacity = US_STACKBUF_SIZE;
 132     }
 133 #if !UCONFIG_NO_BREAK_ITERATION
 134     if (iter != nullptr) {
 135       oldString.setTo(FALSE, oldArray, oldLength);
 136       iter->setText(oldString);
 137     }
 138 #endif
 139     newLength = stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
 140                                  buffer, capacity,
 141                                  oldArray, oldLength, NULL, errorCode);
 142     if (U_SUCCESS(errorCode)) {
 143       setLength(newLength);
 144       return *this;
 145     } else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
 146       // common overflow handling below
 147     } else {
 148       setToBogus();
 149       return *this;
 150     }
 151   } else {
 152     // Longer string or read-only buffer:
 153     // Collect only changes and then apply them to this string.
 154     // Case mapping often changes only small parts of a string,
 155     // and often does not change its length.
 156     oldArray = getArrayStart();
 157     Edits edits;
 158     UChar replacementChars[200];
 159 #if !UCONFIG_NO_BREAK_ITERATION
 160     if (iter != nullptr) {
 161       oldString.setTo(FALSE, oldArray, oldLength);
 162       iter->setText(oldString);
 163     }
 164 #endif
 165     stringCaseMapper(caseLocale, options | U_OMIT_UNCHANGED_TEXT, UCASEMAP_BREAK_ITERATOR
 166                      replacementChars, UPRV_LENGTHOF(replacementChars),
 167                      oldArray, oldLength, &edits, errorCode);
 168     if (U_SUCCESS(errorCode)) {
 169       // Grow the buffer at most once, not for multiple doReplace() calls.
 170       newLength = oldLength + edits.lengthDelta();
 171       if (newLength > oldLength && !cloneArrayIfNeeded(newLength, newLength)) {
 172         return *this;
 173       }
 174       for (Edits::Iterator ei = edits.getCoarseChangesIterator(); ei.next(errorCode);) {
 175         doReplace(ei.destinationIndex(), ei.oldLength(),
 176                   replacementChars, ei.replacementIndex(), ei.newLength());
 177       }
 178       if (U_FAILURE(errorCode)) {
 179         setToBogus();
 180       }
 181       return *this;
 182     } else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
 183       // common overflow handling below
 184       newLength = oldLength + edits.lengthDelta();
 185     } else {
 186       setToBogus();
 187       return *this;
 188     }
 189   }
 190
 191   // Handle buffer overflow, newLength is known.
 192   // We need to allocate a new buffer for the internal string case mapping function.
 193   // This is very similar to how doReplace() keeps the old array pointer
 194   // and deletes the old array itself after it is done.
 195   // In addition, we are forcing cloneArrayIfNeeded() to always allocate a new array.
 196   int32_t *bufferToDelete = 0;
 197   if (!cloneArrayIfNeeded(newLength, newLength, FALSE, &bufferToDelete, TRUE)) {
 198     return *this;
 199   }
 200   errorCode = U_ZERO_ERROR;
 201   // No need to iter->setText() again: The case mapper restarts via iter->first().
 202   newLength = stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
 203                                getArrayStart(), getCapacity(),
 204                                oldArray, oldLength, NULL, errorCode);
 205   if (bufferToDelete) {
 206     uprv_free(bufferToDelete);
 207   }
 208   if (U_SUCCESS(errorCode)) {
 209     setLength(newLength);
 210   } else {
 211     setToBogus();
 212   }
 213   return *this;
 214 }
 215
 216 UnicodeString &
 217 UnicodeString::foldCase(uint32_t options) {
 218   return caseMap(UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL ustrcase_internalFold);
 219 }
 220
 221 U_NAMESPACE_END
 222
 223 // Defined here to reduce dependencies on break iterator
 224 U_CAPI int32_t U_EXPORT2
 225 uhash_hashCaselessUnicodeString(const UElement key) {
 226     U_NAMESPACE_USE
 227     const UnicodeString *str = (const UnicodeString*) key.pointer;
 228     if (str == NULL) {
 229         return 0;
 230     }
 231     // Inefficient; a better way would be to have a hash function in
 232     // UnicodeString that does case folding on the fly.
 233     UnicodeString copy(*str);
 234     return copy.foldCase().hashCode();
 235 }
 236
 237 // Defined here to reduce dependencies on break iterator
 238 U_CAPI UBool U_EXPORT2
 239 uhash_compareCaselessUnicodeString(const UElement key1, const UElement key2) {
 240     U_NAMESPACE_USE
 241     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
 242     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
 243     if (str1 == str2) {
 244         return TRUE;
 245     }
 246     if (str1 == NULL || str2 == NULL) {
 247         return FALSE;
 248     }
 249     return str1->caseCompare(*str2, U_FOLD_CASE_DEFAULT) == 0;
 250 }