2 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
4 * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
5 * Copyright (C) 2009 Google Inc. All rights reserved.
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Library General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Library General Public License for more details.
17 * You should have received a copy of the GNU Library General Public License
18 * along with this library; see the file COPYING.LIB. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 * Boston, MA 02110-1301, USA.
27 #include "JSGlobalObjectFunctions.h"
29 #include "Identifier.h"
30 #include "Operations.h"
36 #include <wtf/ASCIICType.h>
37 #include <wtf/Assertions.h>
38 #include <wtf/MathExtras.h>
39 #include <wtf/StringExtras.h>
40 #include <wtf/Vector.h>
42 #include <wtf/unicode/UTF8.h>
49 using namespace WTF::Unicode
;
// Compile-time check that a UString occupies exactly one pointer
// (sizeof(UString) == sizeof(void*)); the second argument is the name
// reported if the assertion fails.
54 COMPILE_ASSERT(sizeof(UString
) == sizeof(void*), UString_should_stay_small
);
56 // Construct a string with UTF-16 data.
// `characters`/`length` are handed to StringImpl::create(). A null
// `characters` pointer skips the create call and initializes m_impl with 0.
57 UString::UString(const UChar
* characters
, unsigned length
)
58 : m_impl(characters
? StringImpl::create(characters
, length
) : 0)
62 // Construct a string with UTF-16 data, from a null-terminated source.
// The loop condition below tests characters[length] against UChar(0), and the
// impl is then created from (characters, length).
// NOTE(review): the declaration of `length`, the loop body and any null-pointer
// handling are not visible in this view - confirm against the full file.
63 UString::UString(const UChar
* characters
)
69 while (characters
[length
] != UChar(0))
72 m_impl
= StringImpl::create(characters
, length
);
75 // Construct a string with latin1 data.
// `characters`/`length` are handed to StringImpl::create(). A null
// `characters` pointer skips the create call and initializes m_impl with 0.
76 UString::UString(const LChar
* characters
, unsigned length
)
77 : m_impl(characters
? StringImpl::create(characters
, length
) : 0)
// Construct a string from `length` chars. The char pointer is reinterpreted as
// const LChar* before being passed to StringImpl::create(), so the bytes are
// treated the same way as in the LChar overload. A null `characters` pointer
// initializes m_impl with 0.
81 UString::UString(const char* characters
, unsigned length
)
82 : m_impl(characters
? StringImpl::create(reinterpret_cast<const LChar
*>(characters
), length
) : 0)
86 // Construct a string with latin1 data, from a null-terminated source.
// No length is passed: the single-argument StringImpl::create() overload is
// used. A null `characters` pointer initializes m_impl with 0.
87 UString::UString(const LChar
* characters
)
88 : m_impl(characters
? StringImpl::create(characters
) : 0)
// Construct a string from a char pointer without an explicit length. The
// pointer is reinterpreted as const LChar* and passed to the single-argument
// StringImpl::create() overload. A null `characters` pointer initializes
// m_impl with 0.
92 UString::UString(const char* characters
)
93 : m_impl(characters
? StringImpl::create(reinterpret_cast<const LChar
*>(characters
)) : 0)
// Returns the decimal representation of `i` as a UString.
// Digits are stored backwards from the end of a stack buffer (`*--p`) and the
// result is built from the range [p, end).
// NOTE(review): several lines of this function (declaration of `p`, the zero
// case, the loop and sign handling) are not visible in this view.
97 UString
UString::number(int i
)
// Three characters per byte of the integer plus one more - presumably for the
// sign; confirm.
99 LChar buf
[1 + sizeof(i
) * 3];
100 LChar
* end
= buf
+ WTF_ARRAY_LENGTH(buf
);
// INT_MIN is formatted with snprintf("%d") into a separate char buffer and
// returned directly instead of going through the digit code below.
// NOTE(review): presumably because it cannot be negated without overflow - confirm.
105 else if (i
== INT_MIN
) {
106 char minBuf
[1 + sizeof(i
) * 3];
107 snprintf(minBuf
, sizeof(minBuf
), "%d", INT_MIN
);
108 return UString(minBuf
);
110 bool negative
= false;
// Store the lowest decimal digit as a character.
// NOTE(review): the value is cast to unsigned short, but `end` (and the pointer
// it is subtracted from) is LChar-based; '0' + (0..9) fits either way, yet the
// cast type does not match the buffer element type.
116 *--p
= static_cast<unsigned short>((i
% 10) + '0');
// Number of characters written is the distance from p to the end of buf.
123 return UString(p
, static_cast<unsigned>(end
- p
));
// Returns the decimal representation of `i` as a UString.
// Digits are stored backwards from the end of a stack buffer (`*--p`) and the
// result is built from the range [p, end).
// NOTE(review): several lines of this function (declaration of `p`, the zero
// case, the loop and sign handling) are not visible in this view.
126 UString
UString::number(long long i
)
// Three characters per byte of the integer plus one more - presumably for the
// sign; confirm.
128 LChar buf
[1 + sizeof(i
) * 3];
129 LChar
* end
= buf
+ WTF_ARRAY_LENGTH(buf
);
// The minimum long long value is formatted with snprintf into a separate char
// buffer and returned directly instead of going through the digit code below.
134 else if (i
== std::numeric_limits
<long long>::min()) {
135 char minBuf
[1 + sizeof(i
) * 3];
// NOTE(review): the two snprintf calls differ only in the format string
// ("%I64d" vs "%lld"); presumably they are the two arms of a preprocessor
// conditional whose directives are not visible here - confirm.
137 snprintf(minBuf
, sizeof(minBuf
), "%I64d", std::numeric_limits
<long long>::min());
139 snprintf(minBuf
, sizeof(minBuf
), "%lld", std::numeric_limits
<long long>::min());
141 return UString(minBuf
);
143 bool negative
= false;
// Store the lowest decimal digit as a character.
// NOTE(review): cast to unsigned short although the buffer is LChar-based; the
// value fits, but the cast type does not match the element type.
149 *--p
= static_cast<unsigned short>((i
% 10) + '0');
// Number of characters written is the distance from p to the end of buf.
156 return UString(p
, static_cast<unsigned>(end
- p
));
// Returns the decimal representation of `u` as a UString.
// Digits are stored backwards from the end of a stack buffer (`*--p`) and the
// result is built from the range [p, end).
// NOTE(review): the declaration of `p`, the zero case and the loop are not
// visible in this view.
159 UString
UString::number(unsigned u
)
// Three characters per byte of the integer; unlike the signed overloads there
// is no extra "1 +" slot in the size.
161 LChar buf
[sizeof(u
) * 3];
162 LChar
* end
= buf
+ WTF_ARRAY_LENGTH(buf
);
// Store the lowest decimal digit as a character.
// NOTE(review): cast to unsigned short although the buffer is LChar-based; the
// value fits, but the cast type does not match the element type.
169 *--p
= static_cast<unsigned short>((u
% 10) + '0');
// Number of characters written is the distance from p to the end of buf.
174 return UString(p
, static_cast<unsigned>(end
- p
));
// Returns the decimal representation of `l` as a UString.
// Digits are stored backwards from the end of a stack buffer (`*--p`) and the
// result is built from the range [p, end).
// NOTE(review): several lines of this function (declaration of `p`, the zero
// case, the loop and sign handling) are not visible in this view.
177 UString
UString::number(long l
)
// Three characters per byte of the integer plus one more - presumably for the
// sign; confirm.
179 LChar buf
[1 + sizeof(l
) * 3];
180 LChar
* end
= buf
+ WTF_ARRAY_LENGTH(buf
);
// LONG_MIN is formatted with snprintf("%ld") into a separate char buffer and
// returned directly instead of going through the digit code below.
// NOTE(review): presumably because it cannot be negated without overflow - confirm.
185 else if (l
== LONG_MIN
) {
186 char minBuf
[1 + sizeof(l
) * 3];
187 snprintf(minBuf
, sizeof(minBuf
), "%ld", LONG_MIN
);
188 return UString(minBuf
);
190 bool negative
= false;
// Store the lowest decimal digit as a character.
// NOTE(review): cast to unsigned short although the buffer is LChar-based; the
// value fits, but the cast type does not match the element type.
196 *--p
= static_cast<unsigned short>((l
% 10) + '0');
// NOTE(review): the int / long long / unsigned overloads wrap this pointer
// difference in static_cast<unsigned>(...); here `end - p` is passed as-is and
// relies on an implicit conversion to the constructor's length parameter.
203 return UString(p
, end
- p
);
// Returns a UString for the double `d`. The conversion is delegated to
// numberToString(), which is given a local NumberToStringBuffer as scratch
// storage; its return value is passed straight to the UString constructor.
206 UString
UString::number(double d
)
208 NumberToStringBuffer buffer
;
209 return UString(numberToString(d
, buffer
));
// Returns the substring starting at `offset` with at most `length` characters.
// Both arguments are clamped to the bounds of this string, so out-of-range
// values do not index past the end.
212 UString
UString::substringSharingImpl(unsigned offset
, unsigned length
) const
214 // FIXME: We used to check against a limit of Heap::minExtraCost / sizeof(UChar).
216 unsigned stringLength
= this->length();
// Clamp offset to [0, stringLength], then clamp length to what remains after it.
217 offset
= min(offset
, stringLength
);
218 length
= min(length
, stringLength
- offset
);
// The requested range covers the whole string.
// NOTE(review): the statement guarded by this condition is not visible in this
// view - presumably it returns this string unchanged; confirm.
220 if (!offset
&& length
== stringLength
)
// Otherwise build a new impl from the existing m_impl plus the clamped range.
222 return UString(StringImpl::create(m_impl
, offset
, length
));
// Compares a UString with a C string by delegating to equal() on the
// UString's impl pointer.
// NOTE(review): the lines between the signature and this return are not
// visible in this view, so any special-casing (e.g. of a null `s2`) cannot be
// confirmed here.
225 bool operator==(const UString
& s1
, const char *s2
)
230 return equal(s1
.impl(), s2
);
233 // This method assumes that all simple checks have been performed by
234 // the inlined operator==() in the header file.
// It compares the character data of the two impls, handling each combination
// of 8-bit (LChar) and 16-bit (UChar) storage separately.
// NOTE(review): the `if` conditions that select between the four sections
// below, and the bodies of the two mixed-width loops, are not visible in this
// view; the section notes are inferred from the declared pointer types.
235 bool equalSlowCase(const UString
& s1
, const UString
& s2
)
237 StringImpl
* rep1
= s1
.impl();
238 StringImpl
* rep2
= s2
.impl();
// Only rep1's length is read; per the comment below both lengths are already
// known to be equal.
239 unsigned size1
= rep1
->length();
241 // At this point we know
242 // (a) that the strings are the same length and
243 // (b) that they are greater than zero length.
244 bool s1Is8Bit
= rep1
->is8Bit();
245 bool s2Is8Bit
= rep2
->is8Bit();
// Section: s1 is read as 8-bit data.
248 const LChar
* d1
= rep1
->characters8();
// Sub-section: s2 is also read as 8-bit data.
250 const LChar
* d2
= rep2
->characters8();
252 if (d1
== d2
) // Check to see if the data pointers are the same.
255 // Do quick checks for sizes 1 and 2.
258 return d1
[0] == d2
[0];
// Bitwise & on the two bool results: both comparisons are always evaluated
// (no short-circuit).
260 return (d1
[0] == d2
[0]) & (d1
[1] == d2
[1]);
// General case: byte-wise comparison of size1 LChars.
262 return (!memcmp(d1
, d2
, size1
* sizeof(LChar
)));
// Sub-section: s2 is read as 16-bit data while d1 is 8-bit, so memcmp cannot
// be used and the characters are visited one index at a time.
266 const UChar
* d2
= rep2
->characters16();
268 for (unsigned i
= 0; i
< size1
; i
++) {
// Section: s1 is read as 16-bit data, s2 as 8-bit data; again an index loop.
276 const UChar
* d1
= rep1
->characters16();
277 const LChar
* d2
= rep2
->characters8();
279 for (unsigned i
= 0; i
< size1
; i
++) {
// Section: both strings are read as 16-bit data; same structure as the
// 8-bit/8-bit case above.
287 const UChar
* d1
= rep1
->characters16();
288 const UChar
* d2
= rep2
->characters16();
290 if (d1
== d2
) // Check to see if the data pointers are the same.
293 // Do quick checks for sizes 1 and 2.
296 return d1
[0] == d2
[0];
// Bitwise & on the two bool results: both comparisons are always evaluated.
298 return (d1
[0] == d2
[0]) & (d1
[1] == d2
[1]);
// General case: byte-wise comparison of size1 UChars.
300 return (!memcmp(d1
, d2
, size1
* sizeof(UChar
)));
// Less-than comparison of two UStrings by code unit value.
// Walks both strings over their common prefix length (lmin) while the
// characters are equal, then compares the characters at the stopping position.
// NOTE(review): the declaration/increment of `length`, the condition guarding
// each `return (c1[0] < c2[0])`, and the fallback return used when one string
// is a prefix of the other are not visible in this view.
304 bool operator<(const UString
& s1
, const UString
& s2
)
306 const unsigned l1
= s1
.length();
307 const unsigned l2
= s2
.length();
// lmin is the shorter of the two lengths: the number of positions that exist
// in both strings.
308 const unsigned lmin
= l1
< l2
? l1
: l2
;
// Path used when both strings are 8-bit: compare LChar data directly.
309 if (s1
.is8Bit() && s2
.is8Bit()) {
310 const LChar
* c1
= s1
.characters8();
311 const LChar
* c2
= s2
.characters8();
313 while (length
< lmin
&& *c1
== *c2
) {
319 return (c1
[0] < c2
[0]);
// Otherwise compare through characters(), which yields UChar data for both
// strings.
323 const UChar
* c1
= s1
.characters();
324 const UChar
* c2
= s2
.characters();
326 while (length
< lmin
&& *c1
== *c2
) {
332 return (c1
[0] < c2
[0]);
// Greater-than comparison of two UStrings by code unit value.
// Walks both strings over their common prefix length (lmin) while the
// characters are equal, then compares the characters at the stopping position.
// NOTE(review): unlike operator<, no is8Bit()/characters8() path is visible
// here; both strings are always read through characters() as UChar data.
// NOTE(review): the declaration/increment of `l`, the condition guarding the
// return below, and the fallback return are not visible in this view.
337 bool operator>(const UString
& s1
, const UString
& s2
)
339 const unsigned l1
= s1
.length();
340 const unsigned l2
= s2
.length();
// lmin is the shorter of the two lengths.
341 const unsigned lmin
= l1
< l2
? l1
: l2
;
342 const UChar
* c1
= s1
.characters();
343 const UChar
* c2
= s2
.characters();
345 while (l
< lmin
&& *c1
== *c2
) {
351 return (c1
[0] > c2
[0]);
// Returns a CString of the same length as this string in which characters
// outside the accepted ASCII range are replaced by '?'.
356 CString
UString::ascii() const
358 // ASCII conversion: zero and characters from 0x20 upwards within the ASCII
359 // range are copied through; other characters are converted to '?'.
361 unsigned length
= this->length();
// 8-bit path: read LChar data directly.
363 if (this->is8Bit()) {
364 const LChar
* characters
= this->characters8();
// newUninitialized() hands back the writable buffer through characterBuffer.
366 char* characterBuffer
;
367 CString result
= CString::newUninitialized(length
, characterBuffer
);
369 for (unsigned i
= 0; i
< length
; ++i
) {
370 LChar ch
= characters
[i
];
// Parsed as (ch && (ch < 0x20 || ch > 0x7f)) ? '?' : ch - a zero character is
// kept, and 0x7f is kept on this path.
371 characterBuffer
[i
] = ch
&& (ch
< 0x20 || ch
> 0x7f) ? '?' : ch
;
// 16-bit path: same conversion over UChar data.
377 const UChar
* characters
= this->characters16();
379 char* characterBuffer
;
380 CString result
= CString::newUninitialized(length
, characterBuffer
);
382 for (unsigned i
= 0; i
< length
; ++i
) {
383 UChar ch
= characters
[i
];
// NOTE(review): this path uses `>= 0x7f`, so 0x7f becomes '?' here while the
// 8-bit path above (`> 0x7f`) preserves it - the two paths disagree on 0x7f.
384 characterBuffer
[i
] = ch
&& (ch
< 0x20 || ch
>= 0x7f) ? '?' : ch
;
// Returns a CString of the same length as this string holding its Latin-1
// representation; characters that do not fit in one byte become '?'.
390 CString
UString::latin1() const
392 // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
393 // preserved, characters outside of this range are converted to '?'.
395 unsigned length
= this->length();
// The data is always read through characters() as UChar; there is no separate
// is8Bit() path in this function.
396 const UChar
* characters
= this->characters();
// newUninitialized() hands back the writable buffer through characterBuffer.
398 char* characterBuffer
;
399 CString result
= CString::newUninitialized(length
, characterBuffer
);
401 for (unsigned i
= 0; i
< length
; ++i
) {
402 UChar ch
= characters
[i
];
// Anything above 0xff cannot be represented in a single Latin-1 byte.
403 characterBuffer
[i
] = ch
> 0xff ? '?' : ch
;
409 // Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available.
// `buffer` is taken by reference and is advanced by three bytes.
410 static inline void putUTF8Triple(char*& buffer
, UChar ch
)
// Values below 0x0800 do not belong in this helper.
412 ASSERT(ch
>= 0x0800);
// Lead byte: 0xE0 marker plus the top four bits of ch.
413 *buffer
++ = static_cast<char>(((ch
>> 12) & 0x0F) | 0xE0);
// First continuation byte: 0x80 marker plus the middle six bits.
414 *buffer
++ = static_cast<char>(((ch
>> 6) & 0x3F) | 0x80);
// Second continuation byte: 0x80 marker plus the low six bits.
415 *buffer
++ = static_cast<char>((ch
& 0x3F) | 0x80);
// Returns the UTF-8 encoding of this string as a CString.
// `strict` is forwarded to convertUTF16ToUTF8() on the 16-bit path.
// NOTE(review): several conditions and early returns of this function are not
// visible in this view; notes below that depend on them are marked.
418 CString
UString::utf8(bool strict
) const
420 unsigned length
= this->length();
// Returns an empty (zero-length) CString.
// NOTE(review): the guarding condition is not visible - presumably the
// zero-length case; confirm.
423 return CString("", 0);
425 // Allocate a buffer big enough to hold all the characters
426 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
427 // Optimization ideas, if we find this function is hot:
428 // * We could speculatively create a CStringBuffer to contain 'length'
429 // characters, and resize if necessary (i.e. if the buffer contains
430 // non-ascii characters). (Alternatively, scan the buffer first for
431 // ascii characters, so we know this will be sufficient).
432 // * We could allocate a CStringBuffer with an appropriate size to
433 // have a good chance of being able to write the string into the
434 // buffer without reallocing (say, 1.5 x length).
// Guard against unsigned overflow in the `length * 3` computed just below.
// NOTE(review): the statement executed when the guard triggers is not visible.
435 if (length
> numeric_limits
<unsigned>::max() / 3)
// Vector with an inline capacity of 1024 chars, sized to the worst case.
438 Vector
<char, 1024> bufferVector(length
* 3);
// `buffer` is the write cursor; the converters advance it through the pointer
// passed as &buffer.
439 char* buffer
= bufferVector
.data();
// 8-bit source data: convert from Latin-1.
442 const LChar
* characters
= this->characters8();
444 ConversionResult result
= convertLatin1ToUTF8(&characters
, characters
+ length
, &buffer
, buffer
+ bufferVector
.size());
445 ASSERT_UNUSED(result
, result
!= targetExhausted
); // (length * 3) should be sufficient for any conversion
// 16-bit source data: convert from UTF-16, honouring `strict`.
447 const UChar
* characters
= this->characters16();
449 ConversionResult result
= convertUTF16ToUTF8(&characters
, characters
+ length
, &buffer
, buffer
+ bufferVector
.size(), strict
);
450 ASSERT(result
!= targetExhausted
); // (length * 3) should be sufficient for any conversion
452 // Only produced from strict conversion.
453 if (result
== sourceIllegal
)
456 // Check for an unconverted high surrogate.
457 if (result
== sourceExhausted
) {
460 // This should be one unpaired high surrogate. Treat it the same
461 // way as an unpaired high surrogate would have been handled in
462 // the middle of a string with non-strict conversion - which is
463 // to say, simply encode it to UTF-8.
// Exactly one source character must remain unconverted...
464 ASSERT((characters
+ 1) == (this->characters() + length
));
// ...and it must lie in the high-surrogate range 0xD800..0xDBFF.
465 ASSERT((*characters
>= 0xD800) && (*characters
<= 0xDBFF));
466 // There should be room left, since one UChar hasn't been converted.
// NOTE(review): both sides of this comparison start from `buffer`, so it only
// checks 3 <= bufferVector.size(); it does not measure the room remaining
// between the cursor and the end of the vector (bufferVector.data() + size()).
467 ASSERT((buffer
+ 3) <= (buffer
+ bufferVector
.size()));
468 putUTF8Triple(buffer
, *characters
);
// The output length is the distance the cursor advanced from the start of the
// vector's storage.
472 return CString(bufferVector
.data(), buffer
- bufferVector
.data());