runtime/UString.cpp

   1 /*
   2  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
   3  *  Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
   4  *  Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
   5  *  Copyright (C) 2009 Google Inc. All rights reserved.
   6  *
   7  *  This library is free software; you can redistribute it and/or
   8  *  modify it under the terms of the GNU Library General Public
   9  *  License as published by the Free Software Foundation; either
  10  *  version 2 of the License, or (at your option) any later version.
  11  *
  12  *  This library is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  *  Library General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU Library General Public License
  18  *  along with this library; see the file COPYING.LIB.  If not, write to
  19  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  20  *  Boston, MA 02110-1301, USA.
  21  *
  22  */
  23
  24 #include "config.h"
  25 #include "UString.h"
  26
  27 #include "JSGlobalObjectFunctions.h"
  28 #include "Heap.h"
  29 #include "Identifier.h"
  30 #include "Operations.h"
  31 #include <ctype.h>
  32 #include <limits.h>
  33 #include <limits>
  34 #include <stdio.h>
  35 #include <stdlib.h>
  36 #include <wtf/ASCIICType.h>
  37 #include <wtf/Assertions.h>
  38 #include <wtf/DecimalNumber.h>
  39 #include <wtf/MathExtras.h>
  40 #include <wtf/StringExtras.h>
  41 #include <wtf/Vector.h>
  42 #include <wtf/unicode/UTF8.h>
  43
  44 #if HAVE(STRINGS_H)
  45 #include <strings.h>
  46 #endif
  47
  48 using namespace WTF;
  49 using namespace WTF::Unicode;
  50 using namespace std;
  51
  52 namespace JSC {
  53
// NaN and Inf are only declared here (extern); their definitions are not in this file.
extern const double NaN;
extern const double Inf;

// Build-time check that a UString is exactly the size of one pointer.
COMPILE_ASSERT(sizeof(UString) == sizeof(void*), UString_should_stay_small);
  58
  59 // Construct a string with UTF-16 data.
  60 UString::UString(const UChar* characters, unsigned length)
  61     : m_impl(characters ? StringImpl::create(characters, length) : 0)
  62 {
  63 }
  64
  65 // Construct a string with UTF-16 data, from a null-terminated source.
  66 UString::UString(const UChar* characters)
  67 {
  68     if (!characters)
  69         return;
  70
  71     int length = 0;
  72     while (characters[length] != UChar(0))
  73         ++length;
  74
  75     m_impl = StringImpl::create(characters, length);
  76 }
  77
  78 // Construct a string with latin1 data.
  79 UString::UString(const char* characters, unsigned length)
  80     : m_impl(characters ? StringImpl::create(characters, length) : 0)
  81 {
  82 }
  83
  84 // Construct a string with latin1 data, from a null-terminated source.
  85 UString::UString(const char* characters)
  86     : m_impl(characters ? StringImpl::create(characters) : 0)
  87 {
  88 }
  89
  90 UString UString::number(int i)
  91 {
  92     UChar buf[1 + sizeof(i) * 3];
  93     UChar* end = buf + WTF_ARRAY_LENGTH(buf);
  94     UChar* p = end;
  95
  96     if (i == 0)
  97         *--p = '0';
  98     else if (i == INT_MIN) {
  99         char minBuf[1 + sizeof(i) * 3];
 100         snprintf(minBuf, sizeof(minBuf), "%d", INT_MIN);
 101         return UString(minBuf);
 102     } else {
 103         bool negative = false;
 104         if (i < 0) {
 105             negative = true;
 106             i = -i;
 107         }
 108         while (i) {
 109             *--p = static_cast<unsigned short>((i % 10) + '0');
 110             i /= 10;
 111         }
 112         if (negative)
 113             *--p = '-';
 114     }
 115
 116     return UString(p, static_cast<unsigned>(end - p));
 117 }
 118
 119 UString UString::number(long long i)
 120 {
 121     UChar buf[1 + sizeof(i) * 3];
 122     UChar* end = buf + WTF_ARRAY_LENGTH(buf);
 123     UChar* p = end;
 124
 125     if (i == 0)
 126         *--p = '0';
 127     else if (i == std::numeric_limits<long long>::min()) {
 128         char minBuf[1 + sizeof(i) * 3];
 129 #if OS(WINDOWS)
 130         snprintf(minBuf, sizeof(minBuf), "%I64d", std::numeric_limits<long long>::min());
 131 #else
 132         snprintf(minBuf, sizeof(minBuf), "%lld", std::numeric_limits<long long>::min());
 133 #endif
 134         return UString(minBuf);
 135     } else {
 136         bool negative = false;
 137         if (i < 0) {
 138             negative = true;
 139             i = -i;
 140         }
 141         while (i) {
 142             *--p = static_cast<unsigned short>((i % 10) + '0');
 143             i /= 10;
 144         }
 145         if (negative)
 146             *--p = '-';
 147     }
 148
 149     return UString(p, static_cast<unsigned>(end - p));
 150 }
 151
 152 UString UString::number(unsigned u)
 153 {
 154     UChar buf[sizeof(u) * 3];
 155     UChar* end = buf + WTF_ARRAY_LENGTH(buf);
 156     UChar* p = end;
 157
 158     if (u == 0)
 159         *--p = '0';
 160     else {
 161         while (u) {
 162             *--p = static_cast<unsigned short>((u % 10) + '0');
 163             u /= 10;
 164         }
 165     }
 166
 167     return UString(p, static_cast<unsigned>(end - p));
 168 }
 169
 170 UString UString::number(long l)
 171 {
 172     UChar buf[1 + sizeof(l) * 3];
 173     UChar* end = buf + WTF_ARRAY_LENGTH(buf);
 174     UChar* p = end;
 175
 176     if (l == 0)
 177         *--p = '0';
 178     else if (l == LONG_MIN) {
 179         char minBuf[1 + sizeof(l) * 3];
 180         snprintf(minBuf, sizeof(minBuf), "%ld", LONG_MIN);
 181         return UString(minBuf);
 182     } else {
 183         bool negative = false;
 184         if (l < 0) {
 185             negative = true;
 186             l = -l;
 187         }
 188         while (l) {
 189             *--p = static_cast<unsigned short>((l % 10) + '0');
 190             l /= 10;
 191         }
 192         if (negative)
 193             *--p = '-';
 194     }
 195
 196     return UString(p, end - p);
 197 }
 198
 199 UString UString::number(double d)
 200 {
 201     NumberToStringBuffer buffer;
 202     unsigned length = numberToString(d, buffer);
 203     return UString(buffer, length);
 204 }
 205
 206 UString UString::substringSharingImpl(unsigned offset, unsigned length) const
 207 {
 208     // FIXME: We used to check against a limit of Heap::minExtraCost / sizeof(UChar).
 209
 210     unsigned stringLength = this->length();
 211     offset = min(offset, stringLength);
 212     length = min(length, stringLength - offset);
 213
 214     if (!offset && length == stringLength)
 215         return *this;
 216     return UString(StringImpl::create(m_impl, offset, length));
 217 }
 218
 219 bool operator==(const UString& s1, const char *s2)
 220 {
 221     if (s2 == 0)
 222         return s1.isEmpty();
 223
 224     const UChar* u = s1.characters();
 225     const UChar* uend = u + s1.length();
 226     while (u != uend && *s2) {
 227         if (u[0] != (unsigned char)*s2)
 228             return false;
 229         s2++;
 230         u++;
 231     }
 232
 233     return u == uend && *s2 == 0;
 234 }
 235
 236 bool operator<(const UString& s1, const UString& s2)
 237 {
 238     const unsigned l1 = s1.length();
 239     const unsigned l2 = s2.length();
 240     const unsigned lmin = l1 < l2 ? l1 : l2;
 241     const UChar* c1 = s1.characters();
 242     const UChar* c2 = s2.characters();
 243     unsigned l = 0;
 244     while (l < lmin && *c1 == *c2) {
 245         c1++;
 246         c2++;
 247         l++;
 248     }
 249     if (l < lmin)
 250         return (c1[0] < c2[0]);
 251
 252     return (l1 < l2);
 253 }
 254
 255 bool operator>(const UString& s1, const UString& s2)
 256 {
 257     const unsigned l1 = s1.length();
 258     const unsigned l2 = s2.length();
 259     const unsigned lmin = l1 < l2 ? l1 : l2;
 260     const UChar* c1 = s1.characters();
 261     const UChar* c2 = s2.characters();
 262     unsigned l = 0;
 263     while (l < lmin && *c1 == *c2) {
 264         c1++;
 265         c2++;
 266         l++;
 267     }
 268     if (l < lmin)
 269         return (c1[0] > c2[0]);
 270
 271     return (l1 > l2);
 272 }
 273
 274 CString UString::ascii() const
 275 {
 276     // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
 277     // preserved, characters outside of this range are converted to '?'.
 278
 279     unsigned length = this->length();
 280     const UChar* characters = this->characters();
 281
 282     char* characterBuffer;
 283     CString result = CString::newUninitialized(length, characterBuffer);
 284
 285     for (unsigned i = 0; i < length; ++i) {
 286         UChar ch = characters[i];
 287         characterBuffer[i] = ch && (ch < 0x20 || ch >= 0x7f) ? '?' : ch;
 288     }
 289
 290     return result;
 291 }
 292
 293 CString UString::latin1() const
 294 {
 295     // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
 296     // preserved, characters outside of this range are converted to '?'.
 297
 298     unsigned length = this->length();
 299     const UChar* characters = this->characters();
 300
 301     char* characterBuffer;
 302     CString result = CString::newUninitialized(length, characterBuffer);
 303
 304     for (unsigned i = 0; i < length; ++i) {
 305         UChar ch = characters[i];
 306         characterBuffer[i] = ch > 0xff ? '?' : ch;
 307     }
 308
 309     return result;
 310 }
 311
 312 // Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available.
 313 static inline void putUTF8Triple(char*& buffer, UChar ch)
 314 {
 315     ASSERT(ch >= 0x0800);
 316     *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
 317     *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
 318     *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
 319 }
 320
 321 CString UString::utf8(bool strict) const
 322 {
 323     unsigned length = this->length();
 324     const UChar* characters = this->characters();
 325
 326     // Allocate a buffer big enough to hold all the characters
 327     // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
 328     // Optimization ideas, if we find this function is hot:
 329     //  * We could speculatively create a CStringBuffer to contain 'length'
 330     //    characters, and resize if necessary (i.e. if the buffer contains
 331     //    non-ascii characters). (Alternatively, scan the buffer first for
 332     //    ascii characters, so we know this will be sufficient).
 333     //  * We could allocate a CStringBuffer with an appropriate size to
 334     //    have a good chance of being able to write the string into the
 335     //    buffer without reallocing (say, 1.5 x length).
 336     if (length > numeric_limits<unsigned>::max() / 3)
 337         return CString();
 338     Vector<char, 1024> bufferVector(length * 3);
 339
 340     char* buffer = bufferVector.data();
 341     ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
 342     ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
 343
 344     // Only produced from strict conversion.
 345     if (result == sourceIllegal)
 346         return CString();
 347
 348     // Check for an unconverted high surrogate.
 349     if (result == sourceExhausted) {
 350         if (strict)
 351             return CString();
 352         // This should be one unpaired high surrogate. Treat it the same
 353         // was as an unpaired high surrogate would have been handled in
 354         // the middle of a string with non-strict conversion - which is
 355         // to say, simply encode it to UTF-8.
 356         ASSERT((characters + 1) == (this->characters() + length));
 357         ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
 358         // There should be room left, since one UChar hasn't been converted.
 359         ASSERT((buffer + 3) <= (buffer + bufferVector.size()));
 360         putUTF8Triple(buffer, *characters);
 361     }
 362
 363     return CString(bufferVector.data(), buffer - bufferVector.data());
 364 }
 365
 366 } // namespace JSC