2  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org) 
   3  *  Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. 
   4  *  Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca) 
   5  *  Copyright (C) 2009 Google Inc. All rights reserved. 
   7  *  This library is free software; you can redistribute it and/or 
   8  *  modify it under the terms of the GNU Library General Public 
   9  *  License as published by the Free Software Foundation; either 
  10  *  version 2 of the License, or (at your option) any later version. 
  12  *  This library is distributed in the hope that it will be useful, 
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of 
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU 
  15  *  Library General Public License for more details. 
  17  *  You should have received a copy of the GNU Library General Public License 
  18  *  along with this library; see the file COPYING.LIB.  If not, write to 
  19  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 
  20  *  Boston, MA 02110-1301, USA. 
  27 #include "JSGlobalObjectFunctions.h" 
  29 #include "Identifier.h" 
  30 #include "Operations.h" 
  36 #include <wtf/ASCIICType.h> 
  37 #include <wtf/Assertions.h> 
  38 #include <wtf/MathExtras.h> 
  39 #include <wtf/StringExtras.h> 
  40 #include <wtf/Vector.h> 
  42 #include <wtf/unicode/UTF8.h> 
  49 using namespace WTF::Unicode
; 
  // Compile-time guard: a UString must occupy exactly as much storage as one pointer.
  54 COMPILE_ASSERT(sizeof(UString
) == sizeof(void*), UString_should_stay_small
); 
  56 // Construct a string with UTF-16 data: `length` code units starting at `characters`. 
  // A null `characters` pointer stores 0 in m_impl instead of calling StringImpl::create().
  57 UString::UString(const UChar
* characters
, unsigned length
) 
  58     : m_impl(characters 
? StringImpl::create(characters
, length
) : 0) 
  62 // Construct a string with UTF-16 data, from a null-terminated source. 
  // The visible loop advances while characters[length] is not UChar(0); the resulting
  // `length` is then handed to StringImpl::create() together with `characters`.
  // NOTE(review): the declaration of `length`, the loop body and any null-pointer guard
  // are not visible in this extract -- confirm against the full file.
  63 UString::UString(const UChar
* characters
) 
  69     while (characters
[length
] != UChar(0)) 
  72     m_impl 
= StringImpl::create(characters
, length
); 
  75 // Construct a string with latin1 data: `length` 8-bit characters starting at `characters`. 
  // A null `characters` pointer stores 0 in m_impl instead of calling StringImpl::create().
  76 UString::UString(const LChar
* characters
, unsigned length
) 
  77     : m_impl(characters 
? StringImpl::create(characters
, length
) : 0) 
  // Construct a string from `length` plain `char` values. The pointer is reinterpreted as
  // const LChar* (no conversion of the bytes) before being passed to StringImpl::create().
  // A null `characters` pointer stores 0 in m_impl.
  81 UString::UString(const char* characters
, unsigned length
) 
  82     : m_impl(characters 
? StringImpl::create(reinterpret_cast<const LChar
*>(characters
), length
) : 0) 
  86 // Construct a string with latin1 data, from a null-terminated source. 
  // No length is computed here; the single-argument StringImpl::create() overload is used.
  // A null `characters` pointer stores 0 in m_impl.
  87 UString::UString(const LChar
* characters
) 
  88     : m_impl(characters 
? StringImpl::create(characters
) : 0) 
  // Construct a string from a plain `char` source without an explicit length. The pointer is
  // reinterpreted as const LChar* and passed to the single-argument StringImpl::create().
  // A null `characters` pointer stores 0 in m_impl.
  92 UString::UString(const char* characters
) 
  93     : m_impl(characters 
? StringImpl::create(reinterpret_cast<const LChar
*>(characters
)) : 0) 
  // Returns the decimal text of the signed integer `i` as a UString.
  // NOTE(review): several lines of this function (declaration of `p`, the zero case, sign
  // handling, the digit loop header) are not visible in this extract.
  97 UString 
UString::number(int i
) 
  // Scratch buffer: three characters per byte of `i` plus one extra
  // (presumably room for a leading '-' -- TODO confirm).
  99     LChar buf
[1 + sizeof(i
) * 3]; 
  // `end` is one past the last element of buf. Digits are stored through `*--p`, i.e. from
  // the back of the buffer towards the front, and the result is the range [p, end).
 100     LChar
* end 
= buf 
+ WTF_ARRAY_LENGTH(buf
); 
  // INT_MIN is special-cased: it is formatted with snprintf("%d") into a char buffer
  // (presumably because it cannot be negated like other negative values -- TODO confirm).
 105     else if (i 
== INT_MIN
) { 
 106         char minBuf
[1 + sizeof(i
) * 3]; 
 107         snprintf(minBuf
, sizeof(minBuf
), "%d", INT_MIN
); 
 108         return UString(minBuf
); 
 110         bool negative 
= false; 
  // Store the lowest decimal digit of `i` as its ASCII character.
  // NOTE(review): the value is cast to unsigned short although buf holds LChar elements;
  // if `p` is an LChar* the cast is narrowed again on assignment -- confirm the type of `p`.
 116             *--p 
= static_cast<unsigned short>((i 
% 10) + '0'); 
  // Build the result from the characters written so far: `end - p` of them, starting at `p`.
 123     return UString(p
, static_cast<unsigned>(end 
- p
)); 
 // Returns the decimal text of the signed 64-bit integer `i` as a UString.
 // NOTE(review): several lines of this function (declaration of `p`, the zero case, sign
 // handling, the digit loop header) are not visible in this extract.
 126 UString 
UString::number(long long i
) 
 // Scratch buffer: three characters per byte of `i` plus one extra
 // (presumably room for a leading '-' -- TODO confirm).
 128     LChar buf
[1 + sizeof(i
) * 3]; 
 // `end` is one past the last element of buf; digits are written backwards through `*--p`.
 129     LChar
* end 
= buf 
+ WTF_ARRAY_LENGTH(buf
); 
 // The minimum long long value is special-cased and formatted with snprintf().
 134     else if (i 
== std::numeric_limits
<long long>::min()) { 
 135         char minBuf
[1 + sizeof(i
) * 3]; 
 // NOTE(review): two snprintf calls with different format strings ("%I64d" and "%lld")
 // appear back to back. They are presumably alternatives selected by a preprocessor
 // conditional whose directives are not visible in this extract -- confirm.
 137         snprintf(minBuf
, sizeof(minBuf
), "%I64d", std::numeric_limits
<long long>::min()); 
 139         snprintf(minBuf
, sizeof(minBuf
), "%lld", std::numeric_limits
<long long>::min()); 
 141         return UString(minBuf
); 
 143         bool negative 
= false; 
 // Store the lowest decimal digit of `i` as its ASCII character.
 // NOTE(review): cast to unsigned short although buf holds LChar elements -- confirm the type of `p`.
 149             *--p 
= static_cast<unsigned short>((i 
% 10) + '0'); 
 // Build the result from the characters written so far: `end - p` of them, starting at `p`.
 156     return UString(p
, static_cast<unsigned>(end 
- p
)); 
 // Returns the decimal text of the unsigned integer `u` as a UString.
 // NOTE(review): the declaration of `p`, the zero case and the digit loop header are not
 // visible in this extract.
 159 UString 
UString::number(unsigned u
) 
 // Scratch buffer: three characters per byte of `u`. Unlike the signed overloads there is
 // no "+ 1" in the size.
 161     LChar buf
[sizeof(u
) * 3]; 
 // `end` is one past the last element of buf; digits are written backwards through `*--p`.
 162     LChar
* end 
= buf 
+ WTF_ARRAY_LENGTH(buf
); 
 // Store the lowest decimal digit of `u` as its ASCII character.
 // NOTE(review): cast to unsigned short although buf holds LChar elements -- confirm the type of `p`.
 169             *--p 
= static_cast<unsigned short>((u 
% 10) + '0'); 
 // Build the result from the characters written so far: `end - p` of them, starting at `p`.
 174     return UString(p
, static_cast<unsigned>(end 
- p
)); 
 // Returns the decimal text of the signed long `l` as a UString.
 // NOTE(review): several lines of this function (declaration of `p`, the zero case, sign
 // handling, the digit loop header) are not visible in this extract.
 177 UString 
UString::number(long l
) 
 // Scratch buffer: three characters per byte of `l` plus one extra
 // (presumably room for a leading '-' -- TODO confirm).
 179     LChar buf
[1 + sizeof(l
) * 3]; 
 // `end` is one past the last element of buf; digits are written backwards through `*--p`.
 180     LChar
* end 
= buf 
+ WTF_ARRAY_LENGTH(buf
); 
 // LONG_MIN is special-cased: it is formatted with snprintf("%ld") into a char buffer
 // (presumably because it cannot be negated like other negative values -- TODO confirm).
 185     else if (l 
== LONG_MIN
) { 
 186         char minBuf
[1 + sizeof(l
) * 3]; 
 187         snprintf(minBuf
, sizeof(minBuf
), "%ld", LONG_MIN
); 
 188         return UString(minBuf
); 
 190         bool negative 
= false; 
 // Store the lowest decimal digit of `l` as its ASCII character.
 // NOTE(review): cast to unsigned short although buf holds LChar elements -- confirm the type of `p`.
 196             *--p 
= static_cast<unsigned short>((l 
% 10) + '0'); 
 // Build the result from the characters written so far, starting at `p`.
 // NOTE(review): the int, long long and unsigned overloads wrap `end - p` in
 // static_cast<unsigned>; this one passes the pointer difference directly, relying on an
 // implicit conversion. Consider making it consistent with the siblings.
 203     return UString(p
, end 
- p
); 
 // Returns the text of the double `d` as a UString. The formatting is delegated to
 // numberToString(), which is given a local NumberToStringBuffer as scratch storage;
 // its return value is passed straight to a UString constructor.
 206 UString 
UString::number(double d
) 
 208     NumberToStringBuffer buffer
; 
 209     return UString(numberToString(d
, buffer
)); 
 // Returns the substring that starts at `offset` and spans `length` characters, with both
 // values clamped to the bounds of this string. The result is built with
 // StringImpl::create(m_impl, offset, length) (presumably sharing the original character
 // storage, as the function name suggests -- TODO confirm).
 212 UString 
UString::substringSharingImpl(unsigned offset
, unsigned length
) const 
 214     // FIXME: We used to check against a limit of Heap::minExtraCost / sizeof(UChar). 
 216     unsigned stringLength 
= this->length(); 
 // Clamp offset first so that `stringLength - offset` below cannot wrap around.
 217     offset 
= min(offset
, stringLength
); 
 218     length 
= min(length
, stringLength 
- offset
); 
 // The requested range is the whole string.
 // NOTE(review): the statement controlled by this `if` is not visible in this extract.
 220     if (!offset 
&& length 
== stringLength
) 
 222     return UString(StringImpl::create(m_impl
, offset
, length
)); 
 // Compares a UString with a C string by delegating to equal() on the string's impl.
 // NOTE(review): the lines between the signature and this return are not visible in this
 // extract, so any special-casing done before the call cannot be documented here.
 225 bool operator==(const UString
& s1
, const char *s2
) 
 230     return equal(s1
.impl(), s2
); 
 233 // This method assumes that all simple checks have been performed by 
 234 // the inlined operator==() in the header file. 
 // The visible code distinguishes the storage width of each operand (is8Bit()) and reads
 // the characters through characters8() / characters16() accordingly: 8-bit vs 8-bit,
 // 8-bit vs 16-bit, 16-bit vs 8-bit and 16-bit vs 16-bit.
 // NOTE(review): the branch conditions and the bodies of the mixed-width loops are not
 // visible in this extract.
 235 bool equalSlowCase(const UString
& s1
, const UString
& s2
) 
 237     StringImpl
* rep1 
= s1
.impl(); 
 238     StringImpl
* rep2 
= s2
.impl(); 
 // Only the first string's length is read; per the comment below both lengths are equal.
 239     unsigned size1 
= rep1
->length(); 
 241     // At this point we know  
 242     //   (a) that the strings are the same length and 
 243     //   (b) that they are greater than zero length. 
 244     bool s1Is8Bit 
= rep1
->is8Bit(); 
 245     bool s2Is8Bit 
= rep2
->is8Bit(); 
 // First operand stored as 8-bit characters.
 248         const LChar
* d1 
= rep1
->characters8(); 
 // Both operands 8-bit: compare the LChar data directly.
 250             const LChar
* d2 
= rep2
->characters8(); 
 252             if (d1 
== d2
) // Check to see if the data pointers are the same. 
 255             // Do quick checks for sizes 1 and 2. 
 258                 return d1
[0] == d2
[0]; 
 // Bitwise & (not &&): both element comparisons are always evaluated.
 260                 return (d1
[0] == d2
[0]) & (d1
[1] == d2
[1]); 
 // General case: byte-wise comparison of size1 LChar elements.
 262                 return (!memcmp(d1
, d2
, size1 
* sizeof(LChar
))); 
 // First operand 8-bit, second 16-bit: element-by-element loop over size1 characters.
 266         const UChar
* d2 
= rep2
->characters16(); 
 268         for (unsigned i 
= 0; i 
< size1
; i
++) { 
 // First operand 16-bit, second 8-bit: element-by-element loop over size1 characters.
 276         const UChar
* d1 
= rep1
->characters16(); 
 277         const LChar
* d2 
= rep2
->characters8(); 
 279         for (unsigned i 
= 0; i 
< size1
; i
++) { 
 // Both operands 16-bit: same strategy as the 8-bit/8-bit case, on UChar data.
 287     const UChar
* d1 
= rep1
->characters16(); 
 288     const UChar
* d2 
= rep2
->characters16(); 
 290     if (d1 
== d2
) // Check to see if the data pointers are the same. 
 293     // Do quick checks for sizes 1 and 2. 
 296         return d1
[0] == d2
[0]; 
 // Bitwise & (not &&): both element comparisons are always evaluated.
 298         return (d1
[0] == d2
[0]) & (d1
[1] == d2
[1]); 
 // General case: byte-wise comparison of size1 UChar elements.
 300         return (!memcmp(d1
, d2
, size1 
* sizeof(UChar
))); 
 // Less-than comparison of two UStrings by character value. The visible code skips the
 // common prefix (up to the shorter length, `lmin`) and then compares the first pair of
 // characters that differ.
 // NOTE(review): the declaration of the `length` counter, the loop bodies, the conditions
 // guarding the `return (c1[0] < c2[0])` statements and the handling of the case where one
 // string is a prefix of the other are not visible in this extract.
 304 bool operator<(const UString
& s1
, const UString
& s2
) 
 306     const unsigned l1 
= s1
.length(); 
 307     const unsigned l2 
= s2
.length(); 
 // lmin: number of characters that can be compared pairwise.
 308     const unsigned lmin 
= l1 
< l2 
? l1 
: l2
; 
 // Path taken when both strings report 8-bit storage: compare LChar data.
 309     if (s1
.is8Bit() && s2
.is8Bit()) { 
 310         const LChar
* c1 
= s1
.characters8(); 
 311         const LChar
* c2 
= s2
.characters8(); 
 313         while (length 
< lmin 
&& *c1 
== *c2
) { 
 319             return (c1
[0] < c2
[0]); 
 // Otherwise both strings are read as UChar through characters().
 323     const UChar
* c1 
= s1
.characters(); 
 324     const UChar
* c2 
= s2
.characters(); 
 326     while (length 
< lmin 
&& *c1 
== *c2
) { 
 332         return (c1
[0] < c2
[0]); 
 // Greater-than comparison of two UStrings by character value. The visible code skips the
 // common prefix (up to the shorter length, `lmin`) and then compares the first pair of
 // characters that differ.
 // Unlike operator<, no 8-bit path is visible here: both strings are always read as UChar
 // through characters().
 // NOTE(review): the declaration of the counter `l`, the loop body, the condition guarding
 // the `return (c1[0] > c2[0])` statement and the handling of the case where one string is
 // a prefix of the other are not visible in this extract.
 337 bool operator>(const UString
& s1
, const UString
& s2
) 
 339     const unsigned l1 
= s1
.length(); 
 340     const unsigned l2 
= s2
.length(); 
 // lmin: number of characters that can be compared pairwise.
 341     const unsigned lmin 
= l1 
< l2 
? l1 
: l2
; 
 342     const UChar
* c1 
= s1
.characters(); 
 343     const UChar
* c2 
= s2
.characters(); 
 345     while (l 
< lmin 
&& *c1 
== *c2
) { 
 351         return (c1
[0] > c2
[0]); 
 // Returns a CString with one byte per character of this string, replacing characters
 // outside the accepted ASCII range with '?'.
 356 CString 
UString::ascii() const 
 358     // ASCII encoding - the null character and characters from 0x20 up to the 
 359     // per-branch upper bound below are preserved; all other characters are converted to '?'. 
 361     unsigned length 
= this->length(); 
 // 8-bit storage: read LChar data directly.
 363     if (this->is8Bit()) { 
 364         const LChar
* characters 
= this->characters8(); 
 // newUninitialized() hands back the writable buffer of the new CString via characterBuffer.
 366         char* characterBuffer
; 
 367         CString result 
= CString::newUninitialized(length
, characterBuffer
); 
 369         for (unsigned i 
= 0; i 
< length
; ++i
) { 
 370             LChar ch 
= characters
[i
]; 
 // Keeps 0 and 0x20..0x7f (inclusive); everything else becomes '?'.
 371             characterBuffer
[i
] = ch 
&& (ch 
< 0x20 || ch 
> 0x7f) ? '?' : ch
; 
 // 16-bit storage: same conversion on UChar data.
 377     const UChar
* characters 
= this->characters16(); 
 379     char* characterBuffer
; 
 380     CString result 
= CString::newUninitialized(length
, characterBuffer
); 
 382     for (unsigned i 
= 0; i 
< length
; ++i
) { 
 383         UChar ch 
= characters
[i
]; 
 // Keeps 0 and 0x20..0x7e; everything else becomes '?'.
 // NOTE(review): this branch tests `>= 0x7f` while the 8-bit branch tests `> 0x7f`, so
 // U+007F is preserved for 8-bit strings but replaced for 16-bit strings. The same text
 // should not convert differently depending on its storage -- confirm the intended bound
 // and make both branches agree.
 384         characterBuffer
[i
] = ch 
&& (ch 
< 0x20 || ch 
>= 0x7f) ? '?' : ch
; 
 // Returns a CString with one byte per character of this string, replacing characters
 // above 0xff with '?'.
 390 CString 
UString::latin1() const 
 392     // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are 
 393     // preserved, characters outside of this range are converted to '?'. 
 395     unsigned length 
= this->length(); 
 // The characters are read as UChar through characters(); no separate 8-bit path is visible.
 396     const UChar
* characters 
= this->characters(); 
 // newUninitialized() hands back the writable buffer of the new CString via characterBuffer.
 398     char* characterBuffer
; 
 399     CString result 
= CString::newUninitialized(length
, characterBuffer
); 
 401     for (unsigned i 
= 0; i 
< length
; ++i
) { 
 402         UChar ch 
= characters
[i
]; 
 403         characterBuffer
[i
] = ch 
> 0xff ? '?' : ch
; 
 409 // Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available. 
 // `buffer` is taken by reference and is advanced by three bytes as a side effect.
 410 static inline void putUTF8Triple(char*& buffer
, UChar ch
) 
 // Values below 0x0800 are not expected here (they fit in shorter UTF-8 sequences).
 412     ASSERT(ch 
>= 0x0800); 
 // Lead byte: 1110xxxx carrying bits 12-15 of ch.
 413     *buffer
++ = static_cast<char>(((ch 
>> 12) & 0x0F) | 0xE0); 
 // First continuation byte: 10xxxxxx carrying bits 6-11 of ch.
 414     *buffer
++ = static_cast<char>(((ch 
>> 6) & 0x3F) | 0x80); 
 // Second continuation byte: 10xxxxxx carrying bits 0-5 of ch.
 415     *buffer
++ = static_cast<char>((ch 
& 0x3F) | 0x80); 
 // Returns this string encoded as UTF-8 in a CString. `strict` is forwarded to
 // convertUTF16ToUTF8() for 16-bit strings; it is not used by the visible 8-bit path.
 // NOTE(review): several control-flow lines (the empty-string test, the statements run
 // when the length guard or an error result triggers, and the 8-bit/16-bit branch itself)
 // are not visible in this extract.
 418 CString 
UString::utf8(bool strict
) const 
 420     unsigned length 
= this->length(); 
 // Result returned for the (not visible) early-out: an empty, non-null CString.
 423         return CString("", 0); 
 425     // Allocate a buffer big enough to hold all the characters 
 426     // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). 
 427     // Optimization ideas, if we find this function is hot: 
 428     //  * We could speculatively create a CStringBuffer to contain 'length'  
 429     //    characters, and resize if necessary (i.e. if the buffer contains 
 430     //    non-ascii characters). (Alternatively, scan the buffer first for 
 431     //    ascii characters, so we know this will be sufficient). 
 432     //  * We could allocate a CStringBuffer with an appropriate size to 
 433     //    have a good chance of being able to write the string into the 
 434     //    buffer without reallocing (say, 1.5 x length). 
 // Guard against `length * 3` overflowing unsigned in the allocation below.
 435     if (length 
> numeric_limits
<unsigned>::max() / 3) 
 // Work buffer with 1024 bytes of inline capacity, sized for the worst case.
 438     Vector
<char, 1024> bufferVector(length 
* 3); 
 // `buffer` is the write cursor; the conversion routines advance it through the pointer
 // passed as &buffer.
 439     char* buffer 
= bufferVector
.data(); 
 // 8-bit storage: convert Latin-1 data.
 442         const LChar
* characters 
= this->characters8(); 
 444         ConversionResult result 
= convertLatin1ToUTF8(&characters
, characters 
+ length
, &buffer
, buffer 
+ bufferVector
.size()); 
 445         ASSERT_UNUSED(result
, result 
!= targetExhausted
); // (length * 3) should be sufficient for any conversion 
 // 16-bit storage: convert UTF-16 data, honouring `strict`.
 447         const UChar
* characters 
= this->characters16(); 
 449         ConversionResult result 
= convertUTF16ToUTF8(&characters
, characters 
+ length
, &buffer
, buffer 
+ bufferVector
.size(), strict
); 
 450         ASSERT(result 
!= targetExhausted
); // (length * 3) should be sufficient for any conversion 
 452         // Only produced from strict conversion. 
 453         if (result 
== sourceIllegal
) 
 456         // Check for an unconverted high surrogate. 
 457         if (result 
== sourceExhausted
) { 
 460             // This should be one unpaired high surrogate. Treat it the same 
 461             // way as an unpaired high surrogate would have been handled in 
 462             // the middle of a string with non-strict conversion - which is 
 463             // to say, simply encode it to UTF-8. 
 // Exactly one UChar (the last one) must remain unconverted.
 464             ASSERT((characters 
+ 1) == (this->characters() + length
)); 
 // ...and it must lie in the high-surrogate range 0xD800..0xDBFF.
 465             ASSERT((*characters 
>= 0xD800) && (*characters 
<= 0xDBFF)); 
 466             // There should be room left, since one UChar hasn't been converted. 
 // NOTE(review): `buffer` appears on both sides of this comparison, so it reduces to
 // 3 <= bufferVector.size() and does not check the room remaining after the cursor.
 // Comparing against bufferVector.data() + bufferVector.size() would test what the
 // comment above describes.
 467             ASSERT((buffer 
+ 3) <= (buffer 
+ bufferVector
.size())); 
 468             putUTF8Triple(buffer
, *characters
); 
 // The result holds the bytes written so far: from the start of bufferVector up to `buffer`.
 472     return CString(bufferVector
.data(), buffer 
- bufferVector
.data());