runtime/UString.cpp

   1 /*
   2  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
   3  *  Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
   4  *  Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
   5  *  Copyright (C) 2009 Google Inc. All rights reserved.
   6  *
   7  *  This library is free software; you can redistribute it and/or
   8  *  modify it under the terms of the GNU Library General Public
   9  *  License as published by the Free Software Foundation; either
  10  *  version 2 of the License, or (at your option) any later version.
  11  *
  12  *  This library is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  *  Library General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU Library General Public License
  18  *  along with this library; see the file COPYING.LIB.  If not, write to
  19  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  20  *  Boston, MA 02110-1301, USA.
  21  *
  22  */
  23
  24 #include "config.h"
  25 #include "UString.h"
  26
  27 #include "JSGlobalObjectFunctions.h"
  28 #include "Heap.h"
  29 #include "Identifier.h"
  30 #include "Operations.h"
  31 #include <ctype.h>
  32 #include <limits.h>
  33 #include <limits>
  34 #include <stdio.h>
  35 #include <stdlib.h>
  36 #include <wtf/ASCIICType.h>
  37 #include <wtf/Assertions.h>
  38 #include <wtf/MathExtras.h>
  39 #include <wtf/StringExtras.h>
  40 #include <wtf/Vector.h>
  41 #include <wtf/dtoa.h>
  42 #include <wtf/unicode/UTF8.h>
  43
  44 #if HAVE(STRINGS_H)
  45 #include <strings.h>
  46 #endif
  47
  48 using namespace WTF;
  49 using namespace WTF::Unicode;
  50 using namespace std;
  51
  52 namespace JSC {
  53
// Compile-time guard: fails the build if a UString is not exactly the size of
// one pointer.
COMPILE_ASSERT(sizeof(UString) == sizeof(void*), UString_should_stay_small);
  55
  56 // Construct a string with UTF-16 data.
  57 UString::UString(const UChar* characters, unsigned length)
  58     : m_impl(characters ? StringImpl::create(characters, length) : 0)
  59 {
  60 }
  61
  62 // Construct a string with UTF-16 data, from a null-terminated source.
  63 UString::UString(const UChar* characters)
  64 {
  65     if (!characters)
  66         return;
  67
  68     int length = 0;
  69     while (characters[length] != UChar(0))
  70         ++length;
  71
  72     m_impl = StringImpl::create(characters, length);
  73 }
  74
  75 // Construct a string with latin1 data.
  76 UString::UString(const LChar* characters, unsigned length)
  77     : m_impl(characters ? StringImpl::create(characters, length) : 0)
  78 {
  79 }
  80
  81 UString::UString(const char* characters, unsigned length)
  82     : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters), length) : 0)
  83 {
  84 }
  85
  86 // Construct a string with latin1 data, from a null-terminated source.
  87 UString::UString(const LChar* characters)
  88     : m_impl(characters ? StringImpl::create(characters) : 0)
  89 {
  90 }
  91
  92 UString::UString(const char* characters)
  93     : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters)) : 0)
  94 {
  95 }
  96
  97 UString UString::number(int i)
  98 {
  99     LChar buf[1 + sizeof(i) * 3];
 100     LChar* end = buf + WTF_ARRAY_LENGTH(buf);
 101     LChar* p = end;
 102
 103     if (i == 0)
 104         *--p = '0';
 105     else if (i == INT_MIN) {
 106         char minBuf[1 + sizeof(i) * 3];
 107         snprintf(minBuf, sizeof(minBuf), "%d", INT_MIN);
 108         return UString(minBuf);
 109     } else {
 110         bool negative = false;
 111         if (i < 0) {
 112             negative = true;
 113             i = -i;
 114         }
 115         while (i) {
 116             *--p = static_cast<unsigned short>((i % 10) + '0');
 117             i /= 10;
 118         }
 119         if (negative)
 120             *--p = '-';
 121     }
 122
 123     return UString(p, static_cast<unsigned>(end - p));
 124 }
 125
 126 UString UString::number(long long i)
 127 {
 128     LChar buf[1 + sizeof(i) * 3];
 129     LChar* end = buf + WTF_ARRAY_LENGTH(buf);
 130     LChar* p = end;
 131
 132     if (i == 0)
 133         *--p = '0';
 134     else if (i == std::numeric_limits<long long>::min()) {
 135         char minBuf[1 + sizeof(i) * 3];
 136 #if OS(WINDOWS)
 137         snprintf(minBuf, sizeof(minBuf), "%I64d", std::numeric_limits<long long>::min());
 138 #else
 139         snprintf(minBuf, sizeof(minBuf), "%lld", std::numeric_limits<long long>::min());
 140 #endif
 141         return UString(minBuf);
 142     } else {
 143         bool negative = false;
 144         if (i < 0) {
 145             negative = true;
 146             i = -i;
 147         }
 148         while (i) {
 149             *--p = static_cast<unsigned short>((i % 10) + '0');
 150             i /= 10;
 151         }
 152         if (negative)
 153             *--p = '-';
 154     }
 155
 156     return UString(p, static_cast<unsigned>(end - p));
 157 }
 158
 159 UString UString::number(unsigned u)
 160 {
 161     LChar buf[sizeof(u) * 3];
 162     LChar* end = buf + WTF_ARRAY_LENGTH(buf);
 163     LChar* p = end;
 164
 165     if (u == 0)
 166         *--p = '0';
 167     else {
 168         while (u) {
 169             *--p = static_cast<unsigned short>((u % 10) + '0');
 170             u /= 10;
 171         }
 172     }
 173
 174     return UString(p, static_cast<unsigned>(end - p));
 175 }
 176
 177 UString UString::number(long l)
 178 {
 179     LChar buf[1 + sizeof(l) * 3];
 180     LChar* end = buf + WTF_ARRAY_LENGTH(buf);
 181     LChar* p = end;
 182
 183     if (l == 0)
 184         *--p = '0';
 185     else if (l == LONG_MIN) {
 186         char minBuf[1 + sizeof(l) * 3];
 187         snprintf(minBuf, sizeof(minBuf), "%ld", LONG_MIN);
 188         return UString(minBuf);
 189     } else {
 190         bool negative = false;
 191         if (l < 0) {
 192             negative = true;
 193             l = -l;
 194         }
 195         while (l) {
 196             *--p = static_cast<unsigned short>((l % 10) + '0');
 197             l /= 10;
 198         }
 199         if (negative)
 200             *--p = '-';
 201     }
 202
 203     return UString(p, end - p);
 204 }
 205
 206 UString UString::number(double d)
 207 {
 208     NumberToStringBuffer buffer;
 209     return UString(numberToString(d, buffer));
 210 }
 211
 212 UString UString::substringSharingImpl(unsigned offset, unsigned length) const
 213 {
 214     // FIXME: We used to check against a limit of Heap::minExtraCost / sizeof(UChar).
 215
 216     unsigned stringLength = this->length();
 217     offset = min(offset, stringLength);
 218     length = min(length, stringLength - offset);
 219
 220     if (!offset && length == stringLength)
 221         return *this;
 222     return UString(StringImpl::create(m_impl, offset, length));
 223 }
 224
 225 bool operator==(const UString& s1, const char *s2)
 226 {
 227     if (s1.isEmpty())
 228         return !s2;
 229
 230     return equal(s1.impl(), s2);
 231 }
 232
 233 // This method assumes that all simple checks have been performed by
 234 // the inlined operator==() in the header file.
 235 bool equalSlowCase(const UString& s1, const UString& s2)
 236 {
 237     StringImpl* rep1 = s1.impl();
 238     StringImpl* rep2 = s2.impl();
 239     unsigned size1 = rep1->length();
 240
 241     // At this point we know
 242     //   (a) that the strings are the same length and
 243     //   (b) that they are greater than zero length.
 244     bool s1Is8Bit = rep1->is8Bit();
 245     bool s2Is8Bit = rep2->is8Bit();
 246
 247     if (s1Is8Bit) {
 248         const LChar* d1 = rep1->characters8();
 249         if (s2Is8Bit) {
 250             const LChar* d2 = rep2->characters8();
 251
 252             if (d1 == d2) // Check to see if the data pointers are the same.
 253                 return true;
 254
 255             // Do quick checks for sizes 1 and 2.
 256             switch (size1) {
 257             case 1:
 258                 return d1[0] == d2[0];
 259             case 2:
 260                 return (d1[0] == d2[0]) & (d1[1] == d2[1]);
 261             default:
 262                 return (!memcmp(d1, d2, size1 * sizeof(LChar)));
 263             }
 264         }
 265
 266         const UChar* d2 = rep2->characters16();
 267
 268         for (unsigned i = 0; i < size1; i++) {
 269             if (d1[i] != d2[i])
 270                 return false;
 271         }
 272         return true;
 273     }
 274
 275     if (s2Is8Bit) {
 276         const UChar* d1 = rep1->characters16();
 277         const LChar* d2 = rep2->characters8();
 278
 279         for (unsigned i = 0; i < size1; i++) {
 280             if (d1[i] != d2[i])
 281                 return false;
 282         }
 283         return true;
 284
 285     }
 286
 287     const UChar* d1 = rep1->characters16();
 288     const UChar* d2 = rep2->characters16();
 289
 290     if (d1 == d2) // Check to see if the data pointers are the same.
 291         return true;
 292
 293     // Do quick checks for sizes 1 and 2.
 294     switch (size1) {
 295     case 1:
 296         return d1[0] == d2[0];
 297     case 2:
 298         return (d1[0] == d2[0]) & (d1[1] == d2[1]);
 299     default:
 300         return (!memcmp(d1, d2, size1 * sizeof(UChar)));
 301     }
 302 }
 303
 304 bool operator<(const UString& s1, const UString& s2)
 305 {
 306     const unsigned l1 = s1.length();
 307     const unsigned l2 = s2.length();
 308     const unsigned lmin = l1 < l2 ? l1 : l2;
 309     if (s1.is8Bit() && s2.is8Bit()) {
 310         const LChar* c1 = s1.characters8();
 311         const LChar* c2 = s2.characters8();
 312         unsigned length = 0;
 313         while (length < lmin && *c1 == *c2) {
 314             c1++;
 315             c2++;
 316             length++;
 317         }
 318         if (length < lmin)
 319             return (c1[0] < c2[0]);
 320
 321         return (l1 < l2);
 322     }
 323     const UChar* c1 = s1.characters();
 324     const UChar* c2 = s2.characters();
 325     unsigned length = 0;
 326     while (length < lmin && *c1 == *c2) {
 327         c1++;
 328         c2++;
 329         length++;
 330     }
 331     if (length < lmin)
 332         return (c1[0] < c2[0]);
 333
 334     return (l1 < l2);
 335 }
 336
 337 bool operator>(const UString& s1, const UString& s2)
 338 {
 339     const unsigned l1 = s1.length();
 340     const unsigned l2 = s2.length();
 341     const unsigned lmin = l1 < l2 ? l1 : l2;
 342     const UChar* c1 = s1.characters();
 343     const UChar* c2 = s2.characters();
 344     unsigned l = 0;
 345     while (l < lmin && *c1 == *c2) {
 346         c1++;
 347         c2++;
 348         l++;
 349     }
 350     if (l < lmin)
 351         return (c1[0] > c2[0]);
 352
 353     return (l1 > l2);
 354 }
 355
 356 CString UString::ascii() const
 357 {
 358     // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
 359     // preserved, characters outside of this range are converted to '?'.
 360
 361     unsigned length = this->length();
 362
 363     if (this->is8Bit()) {
 364         const LChar* characters = this->characters8();
 365
 366         char* characterBuffer;
 367         CString result = CString::newUninitialized(length, characterBuffer);
 368
 369         for (unsigned i = 0; i < length; ++i) {
 370             LChar ch = characters[i];
 371             characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch;
 372         }
 373
 374         return result;
 375     }
 376
 377     const UChar* characters = this->characters16();
 378
 379     char* characterBuffer;
 380     CString result = CString::newUninitialized(length, characterBuffer);
 381
 382     for (unsigned i = 0; i < length; ++i) {
 383         UChar ch = characters[i];
 384         characterBuffer[i] = ch && (ch < 0x20 || ch >= 0x7f) ? '?' : ch;
 385     }
 386
 387     return result;
 388 }
 389
 390 CString UString::latin1() const
 391 {
 392     // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
 393     // preserved, characters outside of this range are converted to '?'.
 394
 395     unsigned length = this->length();
 396     const UChar* characters = this->characters();
 397
 398     char* characterBuffer;
 399     CString result = CString::newUninitialized(length, characterBuffer);
 400
 401     for (unsigned i = 0; i < length; ++i) {
 402         UChar ch = characters[i];
 403         characterBuffer[i] = ch > 0xff ? '?' : ch;
 404     }
 405
 406     return result;
 407 }
 408
 409 // Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available.
 410 static inline void putUTF8Triple(char*& buffer, UChar ch)
 411 {
 412     ASSERT(ch >= 0x0800);
 413     *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
 414     *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
 415     *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
 416 }
 417
 418 CString UString::utf8(bool strict) const
 419 {
 420     unsigned length = this->length();
 421
 422     if (!length)
 423         return CString("", 0);
 424
 425     // Allocate a buffer big enough to hold all the characters
 426     // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
 427     // Optimization ideas, if we find this function is hot:
 428     //  * We could speculatively create a CStringBuffer to contain 'length'
 429     //    characters, and resize if necessary (i.e. if the buffer contains
 430     //    non-ascii characters). (Alternatively, scan the buffer first for
 431     //    ascii characters, so we know this will be sufficient).
 432     //  * We could allocate a CStringBuffer with an appropriate size to
 433     //    have a good chance of being able to write the string into the
 434     //    buffer without reallocing (say, 1.5 x length).
 435     if (length > numeric_limits<unsigned>::max() / 3)
 436         return CString();
 437
 438     Vector<char, 1024> bufferVector(length * 3);
 439     char* buffer = bufferVector.data();
 440
 441     if (is8Bit()) {
 442         const LChar* characters = this->characters8();
 443
 444         ConversionResult result = convertLatin1ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size());
 445         ASSERT_UNUSED(result, result != targetExhausted); // (length * 3) should be sufficient for any conversion
 446     } else {
 447         const UChar* characters = this->characters16();
 448
 449         ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
 450         ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
 451
 452         // Only produced from strict conversion.
 453         if (result == sourceIllegal)
 454             return CString();
 455
 456         // Check for an unconverted high surrogate.
 457         if (result == sourceExhausted) {
 458             if (strict)
 459                 return CString();
 460             // This should be one unpaired high surrogate. Treat it the same
 461             // was as an unpaired high surrogate would have been handled in
 462             // the middle of a string with non-strict conversion - which is
 463             // to say, simply encode it to UTF-8.
 464             ASSERT((characters + 1) == (this->characters() + length));
 465             ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
 466             // There should be room left, since one UChar hasn't been converted.
 467             ASSERT((buffer + 3) <= (buffer + bufferVector.size()));
 468             putUTF8Triple(buffer, *characters);
 469         }
 470     }
 471
 472     return CString(bufferVector.data(), buffer - bufferVector.data());
 473 }
 474
 475 } // namespace JSC