]> git.saurik.com Git - apple/javascriptcore.git/blob - runtime/UString.cpp
5b1e9a0e0b12a0a7aefc627192562ab43377d4dc
[apple/javascriptcore.git] / runtime / UString.cpp
1 /*
2 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
4 * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
5 * Copyright (C) 2009 Google Inc. All rights reserved.
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Library General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Library General Public License for more details.
16 *
17 * You should have received a copy of the GNU Library General Public License
18 * along with this library; see the file COPYING.LIB. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 * Boston, MA 02110-1301, USA.
21 *
22 */
23
24 #include "config.h"
25 #include "UString.h"
26
27 #include "JSGlobalObjectFunctions.h"
28 #include "Heap.h"
29 #include "Identifier.h"
30 #include "Operations.h"
31 #include <ctype.h>
32 #include <limits.h>
33 #include <limits>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <wtf/ASCIICType.h>
37 #include <wtf/Assertions.h>
38 #include <wtf/MathExtras.h>
39 #include <wtf/StringExtras.h>
40 #include <wtf/Vector.h>
41 #include <wtf/dtoa.h>
42 #include <wtf/unicode/UTF8.h>
43
44 #if HAVE(STRINGS_H)
45 #include <strings.h>
46 #endif
47
48 using namespace WTF;
49 using namespace WTF::Unicode;
50 using namespace std;
51
52 namespace JSC {
53
54 COMPILE_ASSERT(sizeof(UString) == sizeof(void*), UString_should_stay_small);
55
56 // Construct a string with UTF-16 data.
57 UString::UString(const UChar* characters, unsigned length)
58 : m_impl(characters ? StringImpl::create(characters, length) : 0)
59 {
60 }
61
62 // Construct a string with UTF-16 data, from a null-terminated source.
63 UString::UString(const UChar* characters)
64 {
65 if (!characters)
66 return;
67
68 int length = 0;
69 while (characters[length] != UChar(0))
70 ++length;
71
72 m_impl = StringImpl::create(characters, length);
73 }
74
75 // Construct a string with latin1 data.
76 UString::UString(const LChar* characters, unsigned length)
77 : m_impl(characters ? StringImpl::create(characters, length) : 0)
78 {
79 }
80
81 UString::UString(const char* characters, unsigned length)
82 : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters), length) : 0)
83 {
84 }
85
86 // Construct a string with latin1 data, from a null-terminated source.
87 UString::UString(const LChar* characters)
88 : m_impl(characters ? StringImpl::create(characters) : 0)
89 {
90 }
91
92 UString::UString(const char* characters)
93 : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters)) : 0)
94 {
95 }
96
97 UString UString::number(int i)
98 {
99 LChar buf[1 + sizeof(i) * 3];
100 LChar* end = buf + WTF_ARRAY_LENGTH(buf);
101 LChar* p = end;
102
103 if (i == 0)
104 *--p = '0';
105 else if (i == INT_MIN) {
106 char minBuf[1 + sizeof(i) * 3];
107 snprintf(minBuf, sizeof(minBuf), "%d", INT_MIN);
108 return UString(minBuf);
109 } else {
110 bool negative = false;
111 if (i < 0) {
112 negative = true;
113 i = -i;
114 }
115 while (i) {
116 *--p = static_cast<unsigned short>((i % 10) + '0');
117 i /= 10;
118 }
119 if (negative)
120 *--p = '-';
121 }
122
123 return UString(p, static_cast<unsigned>(end - p));
124 }
125
126 UString UString::number(long long i)
127 {
128 LChar buf[1 + sizeof(i) * 3];
129 LChar* end = buf + WTF_ARRAY_LENGTH(buf);
130 LChar* p = end;
131
132 if (i == 0)
133 *--p = '0';
134 else if (i == std::numeric_limits<long long>::min()) {
135 char minBuf[1 + sizeof(i) * 3];
136 #if OS(WINDOWS)
137 snprintf(minBuf, sizeof(minBuf), "%I64d", std::numeric_limits<long long>::min());
138 #else
139 snprintf(minBuf, sizeof(minBuf), "%lld", std::numeric_limits<long long>::min());
140 #endif
141 return UString(minBuf);
142 } else {
143 bool negative = false;
144 if (i < 0) {
145 negative = true;
146 i = -i;
147 }
148 while (i) {
149 *--p = static_cast<unsigned short>((i % 10) + '0');
150 i /= 10;
151 }
152 if (negative)
153 *--p = '-';
154 }
155
156 return UString(p, static_cast<unsigned>(end - p));
157 }
158
159 UString UString::number(unsigned u)
160 {
161 LChar buf[sizeof(u) * 3];
162 LChar* end = buf + WTF_ARRAY_LENGTH(buf);
163 LChar* p = end;
164
165 if (u == 0)
166 *--p = '0';
167 else {
168 while (u) {
169 *--p = static_cast<unsigned short>((u % 10) + '0');
170 u /= 10;
171 }
172 }
173
174 return UString(p, static_cast<unsigned>(end - p));
175 }
176
177 UString UString::number(long l)
178 {
179 LChar buf[1 + sizeof(l) * 3];
180 LChar* end = buf + WTF_ARRAY_LENGTH(buf);
181 LChar* p = end;
182
183 if (l == 0)
184 *--p = '0';
185 else if (l == LONG_MIN) {
186 char minBuf[1 + sizeof(l) * 3];
187 snprintf(minBuf, sizeof(minBuf), "%ld", LONG_MIN);
188 return UString(minBuf);
189 } else {
190 bool negative = false;
191 if (l < 0) {
192 negative = true;
193 l = -l;
194 }
195 while (l) {
196 *--p = static_cast<unsigned short>((l % 10) + '0');
197 l /= 10;
198 }
199 if (negative)
200 *--p = '-';
201 }
202
203 return UString(p, end - p);
204 }
205
206 UString UString::number(double d)
207 {
208 NumberToStringBuffer buffer;
209 return UString(numberToString(d, buffer));
210 }
211
212 UString UString::substringSharingImpl(unsigned offset, unsigned length) const
213 {
214 // FIXME: We used to check against a limit of Heap::minExtraCost / sizeof(UChar).
215
216 unsigned stringLength = this->length();
217 offset = min(offset, stringLength);
218 length = min(length, stringLength - offset);
219
220 if (!offset && length == stringLength)
221 return *this;
222 return UString(StringImpl::create(m_impl, offset, length));
223 }
224
225 bool operator==(const UString& s1, const char *s2)
226 {
227 if (s1.isEmpty())
228 return !s2;
229
230 return equal(s1.impl(), s2);
231 }
232
233 // This method assumes that all simple checks have been performed by
234 // the inlined operator==() in the header file.
235 bool equalSlowCase(const UString& s1, const UString& s2)
236 {
237 StringImpl* rep1 = s1.impl();
238 StringImpl* rep2 = s2.impl();
239 unsigned size1 = rep1->length();
240
241 // At this point we know
242 // (a) that the strings are the same length and
243 // (b) that they are greater than zero length.
244 bool s1Is8Bit = rep1->is8Bit();
245 bool s2Is8Bit = rep2->is8Bit();
246
247 if (s1Is8Bit) {
248 const LChar* d1 = rep1->characters8();
249 if (s2Is8Bit) {
250 const LChar* d2 = rep2->characters8();
251
252 if (d1 == d2) // Check to see if the data pointers are the same.
253 return true;
254
255 // Do quick checks for sizes 1 and 2.
256 switch (size1) {
257 case 1:
258 return d1[0] == d2[0];
259 case 2:
260 return (d1[0] == d2[0]) & (d1[1] == d2[1]);
261 default:
262 return (!memcmp(d1, d2, size1 * sizeof(LChar)));
263 }
264 }
265
266 const UChar* d2 = rep2->characters16();
267
268 for (unsigned i = 0; i < size1; i++) {
269 if (d1[i] != d2[i])
270 return false;
271 }
272 return true;
273 }
274
275 if (s2Is8Bit) {
276 const UChar* d1 = rep1->characters16();
277 const LChar* d2 = rep2->characters8();
278
279 for (unsigned i = 0; i < size1; i++) {
280 if (d1[i] != d2[i])
281 return false;
282 }
283 return true;
284
285 }
286
287 const UChar* d1 = rep1->characters16();
288 const UChar* d2 = rep2->characters16();
289
290 if (d1 == d2) // Check to see if the data pointers are the same.
291 return true;
292
293 // Do quick checks for sizes 1 and 2.
294 switch (size1) {
295 case 1:
296 return d1[0] == d2[0];
297 case 2:
298 return (d1[0] == d2[0]) & (d1[1] == d2[1]);
299 default:
300 return (!memcmp(d1, d2, size1 * sizeof(UChar)));
301 }
302 }
303
304 bool operator<(const UString& s1, const UString& s2)
305 {
306 const unsigned l1 = s1.length();
307 const unsigned l2 = s2.length();
308 const unsigned lmin = l1 < l2 ? l1 : l2;
309 if (s1.is8Bit() && s2.is8Bit()) {
310 const LChar* c1 = s1.characters8();
311 const LChar* c2 = s2.characters8();
312 unsigned length = 0;
313 while (length < lmin && *c1 == *c2) {
314 c1++;
315 c2++;
316 length++;
317 }
318 if (length < lmin)
319 return (c1[0] < c2[0]);
320
321 return (l1 < l2);
322 }
323 const UChar* c1 = s1.characters();
324 const UChar* c2 = s2.characters();
325 unsigned length = 0;
326 while (length < lmin && *c1 == *c2) {
327 c1++;
328 c2++;
329 length++;
330 }
331 if (length < lmin)
332 return (c1[0] < c2[0]);
333
334 return (l1 < l2);
335 }
336
337 bool operator>(const UString& s1, const UString& s2)
338 {
339 const unsigned l1 = s1.length();
340 const unsigned l2 = s2.length();
341 const unsigned lmin = l1 < l2 ? l1 : l2;
342 const UChar* c1 = s1.characters();
343 const UChar* c2 = s2.characters();
344 unsigned l = 0;
345 while (l < lmin && *c1 == *c2) {
346 c1++;
347 c2++;
348 l++;
349 }
350 if (l < lmin)
351 return (c1[0] > c2[0]);
352
353 return (l1 > l2);
354 }
355
356 CString UString::ascii() const
357 {
358 // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
359 // preserved, characters outside of this range are converted to '?'.
360
361 unsigned length = this->length();
362
363 if (this->is8Bit()) {
364 const LChar* characters = this->characters8();
365
366 char* characterBuffer;
367 CString result = CString::newUninitialized(length, characterBuffer);
368
369 for (unsigned i = 0; i < length; ++i) {
370 LChar ch = characters[i];
371 characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch;
372 }
373
374 return result;
375 }
376
377 const UChar* characters = this->characters16();
378
379 char* characterBuffer;
380 CString result = CString::newUninitialized(length, characterBuffer);
381
382 for (unsigned i = 0; i < length; ++i) {
383 UChar ch = characters[i];
384 characterBuffer[i] = ch && (ch < 0x20 || ch >= 0x7f) ? '?' : ch;
385 }
386
387 return result;
388 }
389
390 CString UString::latin1() const
391 {
392 // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
393 // preserved, characters outside of this range are converted to '?'.
394
395 unsigned length = this->length();
396 const UChar* characters = this->characters();
397
398 char* characterBuffer;
399 CString result = CString::newUninitialized(length, characterBuffer);
400
401 for (unsigned i = 0; i < length; ++i) {
402 UChar ch = characters[i];
403 characterBuffer[i] = ch > 0xff ? '?' : ch;
404 }
405
406 return result;
407 }
408
409 // Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available.
410 static inline void putUTF8Triple(char*& buffer, UChar ch)
411 {
412 ASSERT(ch >= 0x0800);
413 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
414 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
415 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
416 }
417
418 CString UString::utf8(bool strict) const
419 {
420 unsigned length = this->length();
421
422 if (!length)
423 return CString("", 0);
424
425 // Allocate a buffer big enough to hold all the characters
426 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
427 // Optimization ideas, if we find this function is hot:
428 // * We could speculatively create a CStringBuffer to contain 'length'
429 // characters, and resize if necessary (i.e. if the buffer contains
430 // non-ascii characters). (Alternatively, scan the buffer first for
431 // ascii characters, so we know this will be sufficient).
432 // * We could allocate a CStringBuffer with an appropriate size to
433 // have a good chance of being able to write the string into the
434 // buffer without reallocing (say, 1.5 x length).
435 if (length > numeric_limits<unsigned>::max() / 3)
436 return CString();
437
438 Vector<char, 1024> bufferVector(length * 3);
439 char* buffer = bufferVector.data();
440
441 if (is8Bit()) {
442 const LChar* characters = this->characters8();
443
444 ConversionResult result = convertLatin1ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size());
445 ASSERT_UNUSED(result, result != targetExhausted); // (length * 3) should be sufficient for any conversion
446 } else {
447 const UChar* characters = this->characters16();
448
449 ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
450 ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
451
452 // Only produced from strict conversion.
453 if (result == sourceIllegal)
454 return CString();
455
456 // Check for an unconverted high surrogate.
457 if (result == sourceExhausted) {
458 if (strict)
459 return CString();
460 // This should be one unpaired high surrogate. Treat it the same
461 // was as an unpaired high surrogate would have been handled in
462 // the middle of a string with non-strict conversion - which is
463 // to say, simply encode it to UTF-8.
464 ASSERT((characters + 1) == (this->characters() + length));
465 ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
466 // There should be room left, since one UChar hasn't been converted.
467 ASSERT((buffer + 3) <= (buffer + bufferVector.size()));
468 putUTF8Triple(buffer, *characters);
469 }
470 }
471
472 return CString(bufferVector.data(), buffer - bufferVector.data());
473 }
474
475 } // namespace JSC