icuSources/common/unistr_case.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 *
   6 *   Copyright (C) 1999-2014, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 *******************************************************************************
  10 *   file name:  unistr_case.cpp
  11 *   encoding:   UTF-8
  12 *   tab size:   8 (not used)
  13 *   indentation:2
  14 *
  15 *   created on: 2004aug19
  16 *   created by: Markus W. Scherer
  17 *
  18 *   Case-mapping functions moved here from unistr.cpp
  19 */
  20
  21 #include "unicode/utypes.h"
  22 #include "unicode/casemap.h"
  23 #include "unicode/edits.h"
  24 #include "unicode/putil.h"
  25 #include "cstring.h"
  26 #include "cmemory.h"
  27 #include "unicode/ustring.h"
  28 #include "unicode/unistr.h"
  29 #include "unicode/uchar.h"
  30 #include "uassert.h"
  31 #include "ucasemap_imp.h"
  32 #include "uelement.h"
  33
  34 U_NAMESPACE_BEGIN
  35
  36 //========================================
  37 // Read-only implementation
  38 //========================================
  39
  40 int8_t
  41 UnicodeString::doCaseCompare(int32_t start,
  42                              int32_t length,
  43                              const UChar *srcChars,
  44                              int32_t srcStart,
  45                              int32_t srcLength,
  46                              uint32_t options) const
  47 {
  48   // compare illegal string values
  49   // treat const UChar *srcChars==NULL as an empty string
  50   if(isBogus()) {
  51     return -1;
  52   }
  53
  54   // pin indices to legal values
  55   pinIndices(start, length);
  56
  57   if(srcChars == NULL) {
  58     srcStart = srcLength = 0;
  59   }
  60
  61   // get the correct pointer
  62   const UChar *chars = getArrayStart();
  63
  64   chars += start;
  65   if(srcStart!=0) {
  66     srcChars += srcStart;
  67   }
  68
  69   if(chars != srcChars) {
  70     UErrorCode errorCode=U_ZERO_ERROR;
  71     int32_t result=u_strcmpFold(chars, length, srcChars, srcLength,
  72                                 options|U_COMPARE_IGNORE_CASE, &errorCode);
  73     if(result!=0) {
  74       return (int8_t)(result >> 24 | 1);
  75     }
  76   } else {
  77     // get the srcLength if necessary
  78     if(srcLength < 0) {
  79       srcLength = u_strlen(srcChars + srcStart);
  80     }
  81     if(length != srcLength) {
  82       return (int8_t)((length - srcLength) >> 24 | 1);
  83     }
  84   }
  85   return 0;
  86 }
  87
  88 //========================================
  89 // Write implementation
  90 //========================================
  91
  92 UnicodeString &
  93 UnicodeString::caseMap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
  94                        UStringCaseMapper *stringCaseMapper) {
  95   if(isEmpty() || !isWritable()) {
  96     // nothing to do
  97     return *this;
  98   }
  99
 100   UChar oldBuffer[2 * US_STACKBUF_SIZE];
 101   UChar *oldArray;
 102   int32_t oldLength = length();
 103   int32_t newLength;
 104   UBool writable = isBufferWritable();
 105   UErrorCode errorCode = U_ZERO_ERROR;
 106
 107   // Try to avoid heap-allocating a new character array for this string.
 108   if (writable ? oldLength <= UPRV_LENGTHOF(oldBuffer) : oldLength < US_STACKBUF_SIZE) {
 109     // Short string: Copy the contents into a temporary buffer and
 110     // case-map back into the current array, or into the stack buffer.
 111     UChar *buffer = getArrayStart();
 112     int32_t capacity;
 113     oldArray = oldBuffer;
 114     u_memcpy(oldBuffer, buffer, oldLength);
 115     if (writable) {
 116       capacity = getCapacity();
 117     } else {
 118       // Switch from the read-only alias or shared heap buffer to the stack buffer.
 119       if (!cloneArrayIfNeeded(US_STACKBUF_SIZE, US_STACKBUF_SIZE, /* doCopyArray= */ FALSE)) {
 120         return *this;
 121       }
 122       U_ASSERT(fUnion.fFields.fLengthAndFlags & kUsingStackBuffer);
 123       buffer = fUnion.fStackFields.fBuffer;
 124       capacity = US_STACKBUF_SIZE;
 125     }
 126     newLength = stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
 127                                  buffer, capacity,
 128                                  oldArray, oldLength, NULL, errorCode);
 129     if (U_SUCCESS(errorCode)) {
 130       setLength(newLength);
 131       return *this;
 132     } else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
 133       // common overflow handling below
 134     } else {
 135       setToBogus();
 136       return *this;
 137     }
 138   } else {
 139     // Longer string or read-only buffer:
 140     // Collect only changes and then apply them to this string.
 141     // Case mapping often changes only small parts of a string,
 142     // and often does not change its length.
 143     oldArray = getArrayStart();
 144     Edits edits;
 145     UChar replacementChars[200];
 146     stringCaseMapper(caseLocale, options | UCASEMAP_OMIT_UNCHANGED_TEXT, UCASEMAP_BREAK_ITERATOR
 147                      replacementChars, UPRV_LENGTHOF(replacementChars),
 148                      oldArray, oldLength, &edits, errorCode);
 149     if (U_SUCCESS(errorCode)) {
 150       // Grow the buffer at most once, not for multiple doReplace() calls.
 151       newLength = oldLength + edits.lengthDelta();
 152       if (newLength > oldLength && !cloneArrayIfNeeded(newLength, newLength)) {
 153         return *this;
 154       }
 155       for (Edits::Iterator ei = edits.getCoarseChangesIterator(); ei.next(errorCode);) {
 156         doReplace(ei.destinationIndex(), ei.oldLength(),
 157                   replacementChars, ei.replacementIndex(), ei.newLength());
 158       }
 159       if (U_FAILURE(errorCode)) {
 160         setToBogus();
 161       }
 162       return *this;
 163     } else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
 164       // common overflow handling below
 165       newLength = oldLength + edits.lengthDelta();
 166     } else {
 167       setToBogus();
 168       return *this;
 169     }
 170   }
 171
 172   // Handle buffer overflow, newLength is known.
 173   // We need to allocate a new buffer for the internal string case mapping function.
 174   // This is very similar to how doReplace() keeps the old array pointer
 175   // and deletes the old array itself after it is done.
 176   // In addition, we are forcing cloneArrayIfNeeded() to always allocate a new array.
 177   int32_t *bufferToDelete = 0;
 178   if (!cloneArrayIfNeeded(newLength, newLength, FALSE, &bufferToDelete, TRUE)) {
 179     return *this;
 180   }
 181   errorCode = U_ZERO_ERROR;
 182   newLength = stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
 183                                getArrayStart(), getCapacity(),
 184                                oldArray, oldLength, NULL, errorCode);
 185   if (bufferToDelete) {
 186     uprv_free(bufferToDelete);
 187   }
 188   if (U_SUCCESS(errorCode)) {
 189     setLength(newLength);
 190   } else {
 191     setToBogus();
 192   }
 193   return *this;
 194 }
 195
 196 UnicodeString &
 197 UnicodeString::foldCase(uint32_t options) {
 198   return caseMap(UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL ustrcase_internalFold);
 199 }
 200
 201 U_NAMESPACE_END
 202
 203 // Defined here to reduce dependencies on break iterator
 204 U_CAPI int32_t U_EXPORT2
 205 uhash_hashCaselessUnicodeString(const UElement key) {
 206     U_NAMESPACE_USE
 207     const UnicodeString *str = (const UnicodeString*) key.pointer;
 208     if (str == NULL) {
 209         return 0;
 210     }
 211     // Inefficient; a better way would be to have a hash function in
 212     // UnicodeString that does case folding on the fly.
 213     UnicodeString copy(*str);
 214     return copy.foldCase().hashCode();
 215 }
 216
 217 // Defined here to reduce dependencies on break iterator
 218 U_CAPI UBool U_EXPORT2
 219 uhash_compareCaselessUnicodeString(const UElement key1, const UElement key2) {
 220     U_NAMESPACE_USE
 221     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
 222     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
 223     if (str1 == str2) {
 224         return TRUE;
 225     }
 226     if (str1 == NULL || str2 == NULL) {
 227         return FALSE;
 228     }
 229     return str1->caseCompare(*str2, U_FOLD_CASE_DEFAULT) == 0;
 230 }