]>
Commit | Line | Data |
---|---|---|
9dae56ea A |
1 | /* |
2 | * Copyright (C) 1999-2000 Harri Porten (porten@kde.org) | |
ba379fdc | 3 | * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. |
9dae56ea | 4 | * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca) |
ba379fdc | 5 | * Copyright (C) 2009 Google Inc. All rights reserved. |
9dae56ea A |
6 | * |
7 | * This library is free software; you can redistribute it and/or | |
8 | * modify it under the terms of the GNU Library General Public | |
9 | * License as published by the Free Software Foundation; either | |
10 | * version 2 of the License, or (at your option) any later version. | |
11 | * | |
12 | * This library is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * Library General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU Library General Public License | |
18 | * along with this library; see the file COPYING.LIB. If not, write to | |
19 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | |
20 | * Boston, MA 02110-1301, USA. | |
21 | * | |
22 | */ | |
23 | ||
24 | #include "config.h" | |
25 | #include "UString.h" | |
26 | ||
27 | #include "JSGlobalObjectFunctions.h" | |
14957cd0 | 28 | #include "Heap.h" |
9dae56ea A |
29 | #include "Identifier.h" |
30 | #include "Operations.h" | |
31 | #include <ctype.h> | |
9dae56ea | 32 | #include <limits.h> |
f9bf01c6 | 33 | #include <limits> |
9dae56ea A |
34 | #include <stdio.h> |
35 | #include <stdlib.h> | |
36 | #include <wtf/ASCIICType.h> | |
37 | #include <wtf/Assertions.h> | |
38 | #include <wtf/MathExtras.h> | |
f9bf01c6 | 39 | #include <wtf/StringExtras.h> |
9dae56ea | 40 | #include <wtf/Vector.h> |
6fe7ccc8 | 41 | #include <wtf/dtoa.h> |
9dae56ea A |
42 | #include <wtf/unicode/UTF8.h> |
43 | ||
9dae56ea A |
44 | #if HAVE(STRINGS_H) |
45 | #include <strings.h> | |
46 | #endif | |
47 | ||
48 | using namespace WTF; | |
49 | using namespace WTF::Unicode; | |
50 | using namespace std; | |
51 | ||
9dae56ea | 52 | namespace JSC { |
4e4e5a6f | 53 | |
14957cd0 | 54 | COMPILE_ASSERT(sizeof(UString) == sizeof(void*), UString_should_stay_small); |
9dae56ea | 55 | |
14957cd0 A |
56 | // Construct a string with UTF-16 data. |
57 | UString::UString(const UChar* characters, unsigned length) | |
58 | : m_impl(characters ? StringImpl::create(characters, length) : 0) | |
9dae56ea | 59 | { |
9dae56ea A |
60 | } |
61 | ||
14957cd0 A |
62 | // Construct a string with UTF-16 data, from a null-terminated source. |
63 | UString::UString(const UChar* characters) | |
9dae56ea | 64 | { |
14957cd0 A |
65 | if (!characters) |
66 | return; | |
67 | ||
68 | int length = 0; | |
69 | while (characters[length] != UChar(0)) | |
70 | ++length; | |
71 | ||
72 | m_impl = StringImpl::create(characters, length); | |
9dae56ea A |
73 | } |
74 | ||
14957cd0 | 75 | // Construct a string with latin1 data. |
6fe7ccc8 | 76 | UString::UString(const LChar* characters, unsigned length) |
14957cd0 | 77 | : m_impl(characters ? StringImpl::create(characters, length) : 0) |
9dae56ea | 78 | { |
9dae56ea A |
79 | } |
80 | ||
6fe7ccc8 A |
81 | UString::UString(const char* characters, unsigned length) |
82 | : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters), length) : 0) | |
83 | { | |
84 | } | |
85 | ||
14957cd0 | 86 | // Construct a string with latin1 data, from a null-terminated source. |
6fe7ccc8 | 87 | UString::UString(const LChar* characters) |
14957cd0 | 88 | : m_impl(characters ? StringImpl::create(characters) : 0) |
9dae56ea | 89 | { |
9dae56ea A |
90 | } |
91 | ||
6fe7ccc8 A |
92 | UString::UString(const char* characters) |
93 | : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters)) : 0) | |
94 | { | |
95 | } | |
96 | ||
14957cd0 | 97 | UString UString::number(int i) |
9dae56ea | 98 | { |
6fe7ccc8 A |
99 | LChar buf[1 + sizeof(i) * 3]; |
100 | LChar* end = buf + WTF_ARRAY_LENGTH(buf); | |
101 | LChar* p = end; | |
4e4e5a6f | 102 | |
9dae56ea A |
103 | if (i == 0) |
104 | *--p = '0'; | |
105 | else if (i == INT_MIN) { | |
106 | char minBuf[1 + sizeof(i) * 3]; | |
fb8617cd | 107 | snprintf(minBuf, sizeof(minBuf), "%d", INT_MIN); |
f9bf01c6 | 108 | return UString(minBuf); |
9dae56ea A |
109 | } else { |
110 | bool negative = false; | |
111 | if (i < 0) { | |
112 | negative = true; | |
113 | i = -i; | |
114 | } | |
115 | while (i) { | |
116 | *--p = static_cast<unsigned short>((i % 10) + '0'); | |
117 | i /= 10; | |
118 | } | |
119 | if (negative) | |
120 | *--p = '-'; | |
121 | } | |
122 | ||
4e4e5a6f | 123 | return UString(p, static_cast<unsigned>(end - p)); |
9dae56ea A |
124 | } |
125 | ||
14957cd0 | 126 | UString UString::number(long long i) |
9dae56ea | 127 | { |
6fe7ccc8 A |
128 | LChar buf[1 + sizeof(i) * 3]; |
129 | LChar* end = buf + WTF_ARRAY_LENGTH(buf); | |
130 | LChar* p = end; | |
f9bf01c6 | 131 | |
9dae56ea A |
132 | if (i == 0) |
133 | *--p = '0'; | |
f9bf01c6 | 134 | else if (i == std::numeric_limits<long long>::min()) { |
9dae56ea | 135 | char minBuf[1 + sizeof(i) * 3]; |
14957cd0 A |
136 | #if OS(WINDOWS) |
137 | snprintf(minBuf, sizeof(minBuf), "%I64d", std::numeric_limits<long long>::min()); | |
138 | #else | |
f9bf01c6 | 139 | snprintf(minBuf, sizeof(minBuf), "%lld", std::numeric_limits<long long>::min()); |
14957cd0 | 140 | #endif |
9dae56ea A |
141 | return UString(minBuf); |
142 | } else { | |
143 | bool negative = false; | |
144 | if (i < 0) { | |
145 | negative = true; | |
146 | i = -i; | |
147 | } | |
148 | while (i) { | |
149 | *--p = static_cast<unsigned short>((i % 10) + '0'); | |
150 | i /= 10; | |
151 | } | |
152 | if (negative) | |
153 | *--p = '-'; | |
154 | } | |
155 | ||
4e4e5a6f | 156 | return UString(p, static_cast<unsigned>(end - p)); |
9dae56ea A |
157 | } |
158 | ||
14957cd0 | 159 | UString UString::number(unsigned u) |
9dae56ea | 160 | { |
6fe7ccc8 A |
161 | LChar buf[sizeof(u) * 3]; |
162 | LChar* end = buf + WTF_ARRAY_LENGTH(buf); | |
163 | LChar* p = end; | |
4e4e5a6f | 164 | |
9dae56ea A |
165 | if (u == 0) |
166 | *--p = '0'; | |
167 | else { | |
168 | while (u) { | |
169 | *--p = static_cast<unsigned short>((u % 10) + '0'); | |
170 | u /= 10; | |
171 | } | |
172 | } | |
4e4e5a6f A |
173 | |
174 | return UString(p, static_cast<unsigned>(end - p)); | |
9dae56ea A |
175 | } |
176 | ||
14957cd0 | 177 | UString UString::number(long l) |
9dae56ea | 178 | { |
6fe7ccc8 A |
179 | LChar buf[1 + sizeof(l) * 3]; |
180 | LChar* end = buf + WTF_ARRAY_LENGTH(buf); | |
181 | LChar* p = end; | |
9dae56ea A |
182 | |
183 | if (l == 0) | |
184 | *--p = '0'; | |
185 | else if (l == LONG_MIN) { | |
186 | char minBuf[1 + sizeof(l) * 3]; | |
fb8617cd | 187 | snprintf(minBuf, sizeof(minBuf), "%ld", LONG_MIN); |
9dae56ea A |
188 | return UString(minBuf); |
189 | } else { | |
190 | bool negative = false; | |
191 | if (l < 0) { | |
192 | negative = true; | |
193 | l = -l; | |
194 | } | |
195 | while (l) { | |
196 | *--p = static_cast<unsigned short>((l % 10) + '0'); | |
197 | l /= 10; | |
198 | } | |
199 | if (negative) | |
200 | *--p = '-'; | |
201 | } | |
202 | ||
4e4e5a6f | 203 | return UString(p, end - p); |
9dae56ea A |
204 | } |
205 | ||
14957cd0 | 206 | UString UString::number(double d) |
9dae56ea | 207 | { |
14957cd0 | 208 | NumberToStringBuffer buffer; |
6fe7ccc8 | 209 | return UString(numberToString(d, buffer)); |
9dae56ea A |
210 | } |
211 | ||
14957cd0 | 212 | UString UString::substringSharingImpl(unsigned offset, unsigned length) const |
9dae56ea | 213 | { |
14957cd0 | 214 | // FIXME: We used to check against a limit of Heap::minExtraCost / sizeof(UChar). |
9dae56ea | 215 | |
14957cd0 A |
216 | unsigned stringLength = this->length(); |
217 | offset = min(offset, stringLength); | |
218 | length = min(length, stringLength - offset); | |
9dae56ea | 219 | |
14957cd0 | 220 | if (!offset && length == stringLength) |
9dae56ea | 221 | return *this; |
14957cd0 | 222 | return UString(StringImpl::create(m_impl, offset, length)); |
9dae56ea A |
223 | } |
224 | ||
9dae56ea A |
225 | bool operator==(const UString& s1, const char *s2) |
226 | { | |
6fe7ccc8 A |
227 | if (s1.isEmpty()) |
228 | return !s2; | |
9dae56ea | 229 | |
6fe7ccc8 A |
230 | return equal(s1.impl(), s2); |
231 | } | |
232 | ||
233 | // This method assumes that all simple checks have been performed by | |
234 | // the inlined operator==() in the header file. | |
235 | bool equalSlowCase(const UString& s1, const UString& s2) | |
236 | { | |
237 | StringImpl* rep1 = s1.impl(); | |
238 | StringImpl* rep2 = s2.impl(); | |
239 | unsigned size1 = rep1->length(); | |
240 | ||
241 | // At this point we know | |
242 | // (a) that the strings are the same length and | |
243 | // (b) that they are greater than zero length. | |
244 | bool s1Is8Bit = rep1->is8Bit(); | |
245 | bool s2Is8Bit = rep2->is8Bit(); | |
246 | ||
247 | if (s1Is8Bit) { | |
248 | const LChar* d1 = rep1->characters8(); | |
249 | if (s2Is8Bit) { | |
250 | const LChar* d2 = rep2->characters8(); | |
251 | ||
252 | if (d1 == d2) // Check to see if the data pointers are the same. | |
253 | return true; | |
254 | ||
255 | // Do quick checks for sizes 1 and 2. | |
256 | switch (size1) { | |
257 | case 1: | |
258 | return d1[0] == d2[0]; | |
259 | case 2: | |
260 | return (d1[0] == d2[0]) & (d1[1] == d2[1]); | |
261 | default: | |
262 | return (!memcmp(d1, d2, size1 * sizeof(LChar))); | |
263 | } | |
264 | } | |
265 | ||
266 | const UChar* d2 = rep2->characters16(); | |
267 | ||
268 | for (unsigned i = 0; i < size1; i++) { | |
269 | if (d1[i] != d2[i]) | |
270 | return false; | |
271 | } | |
272 | return true; | |
273 | } | |
274 | ||
275 | if (s2Is8Bit) { | |
276 | const UChar* d1 = rep1->characters16(); | |
277 | const LChar* d2 = rep2->characters8(); | |
278 | ||
279 | for (unsigned i = 0; i < size1; i++) { | |
280 | if (d1[i] != d2[i]) | |
281 | return false; | |
282 | } | |
283 | return true; | |
284 | ||
285 | } | |
286 | ||
287 | const UChar* d1 = rep1->characters16(); | |
288 | const UChar* d2 = rep2->characters16(); | |
289 | ||
290 | if (d1 == d2) // Check to see if the data pointers are the same. | |
291 | return true; | |
292 | ||
293 | // Do quick checks for sizes 1 and 2. | |
294 | switch (size1) { | |
295 | case 1: | |
296 | return d1[0] == d2[0]; | |
297 | case 2: | |
298 | return (d1[0] == d2[0]) & (d1[1] == d2[1]); | |
299 | default: | |
300 | return (!memcmp(d1, d2, size1 * sizeof(UChar))); | |
301 | } | |
9dae56ea A |
302 | } |
303 | ||
304 | bool operator<(const UString& s1, const UString& s2) | |
305 | { | |
14957cd0 A |
306 | const unsigned l1 = s1.length(); |
307 | const unsigned l2 = s2.length(); | |
4e4e5a6f | 308 | const unsigned lmin = l1 < l2 ? l1 : l2; |
6fe7ccc8 A |
309 | if (s1.is8Bit() && s2.is8Bit()) { |
310 | const LChar* c1 = s1.characters8(); | |
311 | const LChar* c2 = s2.characters8(); | |
312 | unsigned length = 0; | |
313 | while (length < lmin && *c1 == *c2) { | |
314 | c1++; | |
315 | c2++; | |
316 | length++; | |
317 | } | |
318 | if (length < lmin) | |
319 | return (c1[0] < c2[0]); | |
320 | ||
321 | return (l1 < l2); | |
322 | } | |
14957cd0 A |
323 | const UChar* c1 = s1.characters(); |
324 | const UChar* c2 = s2.characters(); | |
6fe7ccc8 A |
325 | unsigned length = 0; |
326 | while (length < lmin && *c1 == *c2) { | |
9dae56ea A |
327 | c1++; |
328 | c2++; | |
6fe7ccc8 | 329 | length++; |
9dae56ea | 330 | } |
6fe7ccc8 | 331 | if (length < lmin) |
9dae56ea A |
332 | return (c1[0] < c2[0]); |
333 | ||
334 | return (l1 < l2); | |
335 | } | |
336 | ||
337 | bool operator>(const UString& s1, const UString& s2) | |
338 | { | |
14957cd0 A |
339 | const unsigned l1 = s1.length(); |
340 | const unsigned l2 = s2.length(); | |
4e4e5a6f | 341 | const unsigned lmin = l1 < l2 ? l1 : l2; |
14957cd0 A |
342 | const UChar* c1 = s1.characters(); |
343 | const UChar* c2 = s2.characters(); | |
4e4e5a6f | 344 | unsigned l = 0; |
9dae56ea A |
345 | while (l < lmin && *c1 == *c2) { |
346 | c1++; | |
347 | c2++; | |
348 | l++; | |
349 | } | |
350 | if (l < lmin) | |
351 | return (c1[0] > c2[0]); | |
352 | ||
353 | return (l1 > l2); | |
354 | } | |
355 | ||
14957cd0 | 356 | CString UString::ascii() const |
9dae56ea | 357 | { |
14957cd0 A |
358 | // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are |
359 | // preserved, characters outside of this range are converted to '?'. | |
360 | ||
361 | unsigned length = this->length(); | |
6fe7ccc8 A |
362 | |
363 | if (this->is8Bit()) { | |
364 | const LChar* characters = this->characters8(); | |
365 | ||
366 | char* characterBuffer; | |
367 | CString result = CString::newUninitialized(length, characterBuffer); | |
368 | ||
369 | for (unsigned i = 0; i < length; ++i) { | |
370 | LChar ch = characters[i]; | |
371 | characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch; | |
372 | } | |
373 | ||
374 | return result; | |
375 | } | |
376 | ||
377 | const UChar* characters = this->characters16(); | |
14957cd0 A |
378 | |
379 | char* characterBuffer; | |
380 | CString result = CString::newUninitialized(length, characterBuffer); | |
381 | ||
382 | for (unsigned i = 0; i < length; ++i) { | |
383 | UChar ch = characters[i]; | |
384 | characterBuffer[i] = ch && (ch < 0x20 || ch >= 0x7f) ? '?' : ch; | |
9dae56ea A |
385 | } |
386 | ||
14957cd0 A |
387 | return result; |
388 | } | |
9dae56ea | 389 | |
14957cd0 A |
390 | CString UString::latin1() const |
391 | { | |
392 | // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are | |
393 | // preserved, characters outside of this range are converted to '?'. | |
394 | ||
395 | unsigned length = this->length(); | |
396 | const UChar* characters = this->characters(); | |
9dae56ea | 397 | |
14957cd0 A |
398 | char* characterBuffer; |
399 | CString result = CString::newUninitialized(length, characterBuffer); | |
400 | ||
401 | for (unsigned i = 0; i < length; ++i) { | |
402 | UChar ch = characters[i]; | |
403 | characterBuffer[i] = ch > 0xff ? '?' : ch; | |
404 | } | |
405 | ||
406 | return result; | |
9dae56ea A |
407 | } |
408 | ||
14957cd0 A |
409 | // Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available. |
410 | static inline void putUTF8Triple(char*& buffer, UChar ch) | |
9dae56ea | 411 | { |
14957cd0 A |
412 | ASSERT(ch >= 0x0800); |
413 | *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); | |
414 | *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); | |
415 | *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); | |
416 | } | |
417 | ||
418 | CString UString::utf8(bool strict) const | |
419 | { | |
420 | unsigned length = this->length(); | |
6fe7ccc8 A |
421 | |
422 | if (!length) | |
423 | return CString("", 0); | |
14957cd0 A |
424 | |
425 | // Allocate a buffer big enough to hold all the characters | |
426 | // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). | |
427 | // Optimization ideas, if we find this function is hot: | |
428 | // * We could speculatively create a CStringBuffer to contain 'length' | |
429 | // characters, and resize if necessary (i.e. if the buffer contains | |
430 | // non-ascii characters). (Alternatively, scan the buffer first for | |
431 | // ascii characters, so we know this will be sufficient). | |
432 | // * We could allocate a CStringBuffer with an appropriate size to | |
433 | // have a good chance of being able to write the string into the | |
434 | // buffer without reallocing (say, 1.5 x length). | |
b80e6193 | 435 | if (length > numeric_limits<unsigned>::max() / 3) |
9dae56ea | 436 | return CString(); |
14957cd0 | 437 | |
6fe7ccc8 | 438 | Vector<char, 1024> bufferVector(length * 3); |
14957cd0 | 439 | char* buffer = bufferVector.data(); |
14957cd0 | 440 | |
6fe7ccc8 A |
441 | if (is8Bit()) { |
442 | const LChar* characters = this->characters8(); | |
14957cd0 | 443 | |
6fe7ccc8 A |
444 | ConversionResult result = convertLatin1ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size()); |
445 | ASSERT_UNUSED(result, result != targetExhausted); // (length * 3) should be sufficient for any conversion | |
446 | } else { | |
447 | const UChar* characters = this->characters16(); | |
448 | ||
449 | ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict); | |
450 | ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion | |
451 | ||
452 | // Only produced from strict conversion. | |
453 | if (result == sourceIllegal) | |
14957cd0 | 454 | return CString(); |
6fe7ccc8 A |
455 | |
456 | // Check for an unconverted high surrogate. | |
457 | if (result == sourceExhausted) { | |
458 | if (strict) | |
459 | return CString(); | |
460 | // This should be one unpaired high surrogate. Treat it the same | |
461 | // was as an unpaired high surrogate would have been handled in | |
462 | // the middle of a string with non-strict conversion - which is | |
463 | // to say, simply encode it to UTF-8. | |
464 | ASSERT((characters + 1) == (this->characters() + length)); | |
465 | ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF)); | |
466 | // There should be room left, since one UChar hasn't been converted. | |
467 | ASSERT((buffer + 3) <= (buffer + bufferVector.size())); | |
468 | putUTF8Triple(buffer, *characters); | |
469 | } | |
14957cd0 | 470 | } |
9dae56ea | 471 | |
14957cd0 | 472 | return CString(bufferVector.data(), buffer - bufferVector.data()); |
9dae56ea A |
473 | } |
474 | ||
9dae56ea | 475 | } // namespace JSC |