2 * (C) 1999 Lars Knoll (knoll@kde.org)
3 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010 Apple Inc. All rights reserved.
4 * Copyright (C) 2007-2009 Torch Mobile, Inc.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
16 * You should have received a copy of the GNU Library General Public License
17 * along with this library; see the file COPYING.LIB. If not, write to
18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
23 #include "WTFString.h"
26 #include <wtf/ASCIICType.h>
27 #include <wtf/text/CString.h>
28 #include <wtf/StringExtras.h>
29 #include <wtf/Vector.h>
31 #include <wtf/unicode/UTF8.h>
32 #include <wtf/unicode/Unicode.h>
38 using namespace Unicode
;
41 // Construct a string with UTF-16 data.
42 String::String(const UChar
* characters
, unsigned length
)
43 : m_impl(characters
? StringImpl::create(characters
, length
) : 0)
47 // Construct a string with UTF-16 data, from a null-terminated source.
48 String::String(const UChar
* str
)
54 while (str
[len
] != UChar(0))
57 if (len
> numeric_limits
<unsigned>::max())
60 m_impl
= StringImpl::create(str
, len
);
63 // Construct a string with latin1 data.
64 String::String(const char* characters
, unsigned length
)
65 : m_impl(characters
? StringImpl::create(characters
, length
) : 0)
69 // Construct a string with latin1 data, from a null-terminated source.
70 String::String(const char* characters
)
71 : m_impl(characters
? StringImpl::create(characters
) : 0)
75 void String::append(const String
& str
)
80 // FIXME: This is extremely inefficient. So much so that we might want to take this
81 // out of String's API. We can make it better by optimizing the case where exactly
82 // one String is pointing at this StringImpl, but even then it's going to require a
83 // call to fastMalloc every single time.
87 if (str
.length() > numeric_limits
<unsigned>::max() - m_impl
->length())
89 RefPtr
<StringImpl
> newImpl
= StringImpl::createUninitialized(m_impl
->length() + str
.length(), data
);
90 memcpy(data
, m_impl
->characters(), m_impl
->length() * sizeof(UChar
));
91 memcpy(data
+ m_impl
->length(), str
.characters(), str
.length() * sizeof(UChar
));
92 m_impl
= newImpl
.release();
98 void String::append(char c
)
100 // FIXME: This is extremely inefficient. So much so that we might want to take this
101 // out of String's API. We can make it better by optimizing the case where exactly
102 // one String is pointing at this StringImpl, but even then it's going to require a
103 // call to fastMalloc every single time.
106 if (m_impl
->length() >= numeric_limits
<unsigned>::max())
108 RefPtr
<StringImpl
> newImpl
= StringImpl::createUninitialized(m_impl
->length() + 1, data
);
109 memcpy(data
, m_impl
->characters(), m_impl
->length() * sizeof(UChar
));
110 data
[m_impl
->length()] = c
;
111 m_impl
= newImpl
.release();
113 m_impl
= StringImpl::create(&c
, 1);
116 void String::append(UChar c
)
118 // FIXME: This is extremely inefficient. So much so that we might want to take this
119 // out of String's API. We can make it better by optimizing the case where exactly
120 // one String is pointing at this StringImpl, but even then it's going to require a
121 // call to fastMalloc every single time.
124 if (m_impl
->length() >= numeric_limits
<unsigned>::max())
126 RefPtr
<StringImpl
> newImpl
= StringImpl::createUninitialized(m_impl
->length() + 1, data
);
127 memcpy(data
, m_impl
->characters(), m_impl
->length() * sizeof(UChar
));
128 data
[m_impl
->length()] = c
;
129 m_impl
= newImpl
.release();
131 m_impl
= StringImpl::create(&c
, 1);
134 int codePointCompare(const String
& a
, const String
& b
)
136 return codePointCompare(a
.impl(), b
.impl());
139 void String::insert(const String
& str
, unsigned pos
)
148 insert(str
.characters(), str
.length(), pos
);
151 void String::append(const UChar
* charactersToAppend
, unsigned lengthToAppend
)
154 if (!charactersToAppend
)
156 m_impl
= StringImpl::create(charactersToAppend
, lengthToAppend
);
163 ASSERT(charactersToAppend
);
165 if (lengthToAppend
> numeric_limits
<unsigned>::max() - length())
167 RefPtr
<StringImpl
> newImpl
= StringImpl::createUninitialized(length() + lengthToAppend
, data
);
168 memcpy(data
, characters(), length() * sizeof(UChar
));
169 memcpy(data
+ length(), charactersToAppend
, lengthToAppend
* sizeof(UChar
));
170 m_impl
= newImpl
.release();
173 void String::insert(const UChar
* charactersToInsert
, unsigned lengthToInsert
, unsigned position
)
175 if (position
>= length()) {
176 append(charactersToInsert
, lengthToInsert
);
185 ASSERT(charactersToInsert
);
187 if (lengthToInsert
> numeric_limits
<unsigned>::max() - length())
189 RefPtr
<StringImpl
> newImpl
= StringImpl::createUninitialized(length() + lengthToInsert
, data
);
190 memcpy(data
, characters(), position
* sizeof(UChar
));
191 memcpy(data
+ position
, charactersToInsert
, lengthToInsert
* sizeof(UChar
));
192 memcpy(data
+ position
+ lengthToInsert
, characters() + position
, (length() - position
) * sizeof(UChar
));
193 m_impl
= newImpl
.release();
196 UChar32
String::characterStartingAt(unsigned i
) const
198 if (!m_impl
|| i
>= m_impl
->length())
200 return m_impl
->characterStartingAt(i
);
203 void String::truncate(unsigned position
)
205 if (position
>= length())
208 RefPtr
<StringImpl
> newImpl
= StringImpl::createUninitialized(position
, data
);
209 memcpy(data
, characters(), position
* sizeof(UChar
));
210 m_impl
= newImpl
.release();
213 void String::remove(unsigned position
, int lengthToRemove
)
215 if (lengthToRemove
<= 0)
217 if (position
>= length())
219 if (static_cast<unsigned>(lengthToRemove
) > length() - position
)
220 lengthToRemove
= length() - position
;
222 RefPtr
<StringImpl
> newImpl
= StringImpl::createUninitialized(length() - lengthToRemove
, data
);
223 memcpy(data
, characters(), position
* sizeof(UChar
));
224 memcpy(data
+ position
, characters() + position
+ lengthToRemove
,
225 (length() - lengthToRemove
- position
) * sizeof(UChar
));
226 m_impl
= newImpl
.release();
229 String
String::substring(unsigned pos
, unsigned len
) const
233 return m_impl
->substring(pos
, len
);
236 String
String::substringSharingImpl(unsigned offset
, unsigned length
) const
238 // FIXME: We used to check against a limit of Heap::minExtraCost / sizeof(UChar).
240 unsigned stringLength
= this->length();
241 offset
= min(offset
, stringLength
);
242 length
= min(length
, stringLength
- offset
);
244 if (!offset
&& length
== stringLength
)
246 return String(StringImpl::create(m_impl
, offset
, length
));
249 String
String::lower() const
253 return m_impl
->lower();
256 String
String::upper() const
260 return m_impl
->upper();
263 String
String::stripWhiteSpace() const
267 return m_impl
->stripWhiteSpace();
270 String
String::simplifyWhiteSpace() const
274 return m_impl
->simplifyWhiteSpace();
277 String
String::removeCharacters(CharacterMatchFunctionPtr findMatch
) const
281 return m_impl
->removeCharacters(findMatch
);
284 String
String::foldCase() const
288 return m_impl
->foldCase();
291 bool String::percentage(int& result
) const
293 if (!m_impl
|| !m_impl
->length())
296 if ((*m_impl
)[m_impl
->length() - 1] != '%')
299 result
= charactersToIntStrict(m_impl
->characters(), m_impl
->length() - 1);
303 const UChar
* String::charactersWithNullTermination()
307 if (m_impl
->hasTerminatingNullCharacter())
308 return m_impl
->characters();
309 m_impl
= StringImpl::createWithTerminatingNullCharacter(*m_impl
);
310 return m_impl
->characters();
313 String
String::format(const char *format
, ...)
316 // Use QString::vsprintf to avoid the locale dependent formatting of vsnprintf.
317 // https://bugs.webkit.org/show_bug.cgi?id=18994
319 va_start(args
, format
);
322 buffer
.vsprintf(format
, args
);
326 QByteArray ba
= buffer
.toUtf8();
327 return StringImpl::create(ba
.constData(), ba
.length());
331 va_start(args
, format
);
333 Vector
<char, 256> buffer
;
335 int bufferSize
= 256;
336 buffer
.resize(bufferSize
);
338 int written
= vsnprintf(buffer
.data(), bufferSize
, format
, args
);
344 return StringImpl::create(buffer
.data(), written
);
347 buffer
.resize(bufferSize
);
348 va_start(args
, format
);
353 va_start(args
, format
);
355 Vector
<char, 256> buffer
;
357 // Do the format once to get the length.
359 int result
= _vscprintf(format
, args
);
362 int result
= vsnprintf(&ch
, 1, format
, args
);
363 // We need to call va_end() and then va_start() again here, as the
364 // contents of args is undefined after the call to vsnprintf
365 // according to http://man.cx/snprintf(3)
367 // Not calling va_end/va_start here happens to work on lots of
368 // systems, but fails e.g. on 64bit Linux.
370 va_start(args
, format
);
377 unsigned len
= result
;
378 buffer
.grow(len
+ 1);
380 // Now do the formatting again, guaranteed to fit.
381 vsnprintf(buffer
.data(), buffer
.size(), format
, args
);
385 return StringImpl::create(buffer
.data(), len
);
389 String
String::number(short n
)
391 return String::format("%hd", n
);
394 String
String::number(unsigned short n
)
396 return String::format("%hu", n
);
399 String
String::number(int n
)
401 return String::format("%d", n
);
404 String
String::number(unsigned n
)
406 return String::format("%u", n
);
409 String
String::number(long n
)
411 return String::format("%ld", n
);
414 String
String::number(unsigned long n
)
416 return String::format("%lu", n
);
419 String
String::number(long long n
)
421 #if OS(WINDOWS) && !PLATFORM(QT)
422 return String::format("%I64i", n
);
424 return String::format("%lli", n
);
428 String
String::number(unsigned long long n
)
430 #if OS(WINDOWS) && !PLATFORM(QT)
431 return String::format("%I64u", n
);
433 return String::format("%llu", n
);
437 String
String::number(double n
)
439 return String::format("%.6lg", n
);
442 int String::toIntStrict(bool* ok
, int base
) const
449 return m_impl
->toIntStrict(ok
, base
);
452 unsigned String::toUIntStrict(bool* ok
, int base
) const
459 return m_impl
->toUIntStrict(ok
, base
);
462 int64_t String::toInt64Strict(bool* ok
, int base
) const
469 return m_impl
->toInt64Strict(ok
, base
);
472 uint64_t String::toUInt64Strict(bool* ok
, int base
) const
479 return m_impl
->toUInt64Strict(ok
, base
);
482 intptr_t String::toIntPtrStrict(bool* ok
, int base
) const
489 return m_impl
->toIntPtrStrict(ok
, base
);
493 int String::toInt(bool* ok
) const
500 return m_impl
->toInt(ok
);
503 unsigned String::toUInt(bool* ok
) const
510 return m_impl
->toUInt(ok
);
513 int64_t String::toInt64(bool* ok
) const
520 return m_impl
->toInt64(ok
);
523 uint64_t String::toUInt64(bool* ok
) const
530 return m_impl
->toUInt64(ok
);
533 intptr_t String::toIntPtr(bool* ok
) const
540 return m_impl
->toIntPtr(ok
);
543 double String::toDouble(bool* ok
, bool* didReadNumber
) const
549 *didReadNumber
= false;
552 return m_impl
->toDouble(ok
, didReadNumber
);
555 float String::toFloat(bool* ok
, bool* didReadNumber
) const
561 *didReadNumber
= false;
564 return m_impl
->toFloat(ok
, didReadNumber
);
567 String
String::threadsafeCopy() const
571 return m_impl
->threadsafeCopy();
574 String
String::crossThreadString() const
578 return m_impl
->crossThreadString();
581 void String::split(const String
& separator
, bool allowEmptyEntries
, Vector
<String
>& result
) const
585 unsigned startPos
= 0;
587 while ((endPos
= find(separator
, startPos
)) != notFound
) {
588 if (allowEmptyEntries
|| startPos
!= endPos
)
589 result
.append(substring(startPos
, endPos
- startPos
));
590 startPos
= endPos
+ separator
.length();
592 if (allowEmptyEntries
|| startPos
!= length())
593 result
.append(substring(startPos
));
596 void String::split(const String
& separator
, Vector
<String
>& result
) const
598 split(separator
, false, result
);
601 void String::split(UChar separator
, bool allowEmptyEntries
, Vector
<String
>& result
) const
605 unsigned startPos
= 0;
607 while ((endPos
= find(separator
, startPos
)) != notFound
) {
608 if (allowEmptyEntries
|| startPos
!= endPos
)
609 result
.append(substring(startPos
, endPos
- startPos
));
610 startPos
= endPos
+ 1;
612 if (allowEmptyEntries
|| startPos
!= length())
613 result
.append(substring(startPos
));
616 void String::split(UChar separator
, Vector
<String
>& result
) const
618 split(String(&separator
, 1), false, result
);
621 CString
String::ascii() const
623 // Printable ASCII characters 32..127 and the null character are
624 // preserved, characters outside of this range are converted to '?'.
626 unsigned length
= this->length();
627 const UChar
* characters
= this->characters();
629 char* characterBuffer
;
630 CString result
= CString::newUninitialized(length
, characterBuffer
);
632 for (unsigned i
= 0; i
< length
; ++i
) {
633 UChar ch
= characters
[i
];
634 characterBuffer
[i
] = ch
&& (ch
< 0x20 || ch
> 0x7f) ? '?' : ch
;
640 CString
String::latin1() const
642 // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
643 // preserved, characters outside of this range are converted to '?'.
645 unsigned length
= this->length();
646 const UChar
* characters
= this->characters();
648 char* characterBuffer
;
649 CString result
= CString::newUninitialized(length
, characterBuffer
);
651 for (unsigned i
= 0; i
< length
; ++i
) {
652 UChar ch
= characters
[i
];
653 characterBuffer
[i
] = ch
> 0xff ? '?' : ch
;
659 // Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available.
660 static inline void putUTF8Triple(char*& buffer
, UChar ch
)
662 ASSERT(ch
>= 0x0800);
663 *buffer
++ = static_cast<char>(((ch
>> 12) & 0x0F) | 0xE0);
664 *buffer
++ = static_cast<char>(((ch
>> 6) & 0x3F) | 0x80);
665 *buffer
++ = static_cast<char>((ch
& 0x3F) | 0x80);
668 CString
String::utf8(bool strict
) const
670 unsigned length
= this->length();
671 const UChar
* characters
= this->characters();
673 // Allocate a buffer big enough to hold all the characters
674 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
675 // Optimization ideas, if we find this function is hot:
676 // * We could speculatively create a CStringBuffer to contain 'length'
677 // characters, and resize if necessary (i.e. if the buffer contains
678 // non-ascii characters). (Alternatively, scan the buffer first for
679 // ascii characters, so we know this will be sufficient).
680 // * We could allocate a CStringBuffer with an appropriate size to
681 // have a good chance of being able to write the string into the
682 // buffer without reallocing (say, 1.5 x length).
683 if (length
> numeric_limits
<unsigned>::max() / 3)
685 Vector
<char, 1024> bufferVector(length
* 3);
687 char* buffer
= bufferVector
.data();
688 ConversionResult result
= convertUTF16ToUTF8(&characters
, characters
+ length
, &buffer
, buffer
+ bufferVector
.size(), strict
);
689 ASSERT(result
!= targetExhausted
); // (length * 3) should be sufficient for any conversion
691 // Only produced from strict conversion.
692 if (result
== sourceIllegal
)
695 // Check for an unconverted high surrogate.
696 if (result
== sourceExhausted
) {
699 // This should be one unpaired high surrogate. Treat it the same
700 // was as an unpaired high surrogate would have been handled in
701 // the middle of a string with non-strict conversion - which is
702 // to say, simply encode it to UTF-8.
703 ASSERT((characters
+ 1) == (this->characters() + length
));
704 ASSERT((*characters
>= 0xD800) && (*characters
<= 0xDBFF));
705 // There should be room left, since one UChar hasn't been converted.
706 ASSERT((buffer
+ 3) <= (buffer
+ bufferVector
.size()));
707 putUTF8Triple(buffer
, *characters
);
710 return CString(bufferVector
.data(), buffer
- bufferVector
.data());
713 String
String::fromUTF8(const char* stringStart
, size_t length
)
715 if (length
> numeric_limits
<unsigned>::max())
721 // We'll use a StringImpl as a buffer; if the source string only contains ascii this should be
722 // the right length, if there are any multi-byte sequences this buffer will be too large.
724 String
stringBuffer(StringImpl::createUninitialized(length
, buffer
));
725 UChar
* bufferEnd
= buffer
+ length
;
727 // Try converting into the buffer.
728 const char* stringCurrent
= stringStart
;
729 if (convertUTF8ToUTF16(&stringCurrent
, stringStart
+ length
, &buffer
, bufferEnd
) != conversionOK
)
732 // stringBuffer is full (the input must have been all ascii) so just return it!
733 if (buffer
== bufferEnd
)
736 // stringBuffer served its purpose as a buffer, copy the contents out into a new string.
737 unsigned utf16Length
= buffer
- stringBuffer
.characters();
738 ASSERT(utf16Length
< length
);
739 return String(stringBuffer
.characters(), utf16Length
);
742 String
String::fromUTF8(const char* string
)
746 return fromUTF8(string
, strlen(string
));
749 String
String::fromUTF8WithLatin1Fallback(const char* string
, size_t size
)
751 String utf8
= fromUTF8(string
, size
);
753 return String(string
, size
);
759 static bool isCharacterAllowedInBase(UChar c
, int base
)
764 return c
- '0' < base
;
765 if (isASCIIAlpha(c
)) {
768 return (c
>= 'a' && c
< 'a' + base
- 10)
769 || (c
>= 'A' && c
< 'A' + base
- 10);
774 template <typename IntegralType
>
775 static inline IntegralType
toIntegralType(const UChar
* data
, size_t length
, bool* ok
, int base
)
777 static const IntegralType integralMax
= numeric_limits
<IntegralType
>::max();
778 static const bool isSigned
= numeric_limits
<IntegralType
>::is_signed
;
779 const IntegralType maxMultiplier
= integralMax
/ base
;
781 IntegralType value
= 0;
783 bool isNegative
= false;
788 // skip leading whitespace
789 while (length
&& isSpaceOrNewline(*data
)) {
794 if (isSigned
&& length
&& *data
== '-') {
798 } else if (length
&& *data
== '+') {
803 if (!length
|| !isCharacterAllowedInBase(*data
, base
))
806 while (length
&& isCharacterAllowedInBase(*data
, base
)) {
808 IntegralType digitValue
;
811 digitValue
= c
- '0';
813 digitValue
= c
- 'a' + 10;
815 digitValue
= c
- 'A' + 10;
817 if (value
> maxMultiplier
|| (value
== maxMultiplier
&& digitValue
> (integralMax
% base
) + isNegative
))
820 value
= base
* value
+ digitValue
;
825 #pragma warning(push, 0)
826 #pragma warning(disable:4146)
836 // skip trailing space
837 while (length
&& isSpaceOrNewline(*data
)) {
847 return isOk
? value
: 0;
850 static unsigned lengthOfCharactersAsInteger(const UChar
* data
, size_t length
)
854 // Allow leading spaces.
855 for (; i
!= length
; ++i
) {
856 if (!isSpaceOrNewline(data
[i
]))
861 if (i
!= length
&& (data
[i
] == '+' || data
[i
] == '-'))
865 for (; i
!= length
; ++i
) {
866 if (!isASCIIDigit(data
[i
]))
873 int charactersToIntStrict(const UChar
* data
, size_t length
, bool* ok
, int base
)
875 return toIntegralType
<int>(data
, length
, ok
, base
);
878 unsigned charactersToUIntStrict(const UChar
* data
, size_t length
, bool* ok
, int base
)
880 return toIntegralType
<unsigned>(data
, length
, ok
, base
);
883 int64_t charactersToInt64Strict(const UChar
* data
, size_t length
, bool* ok
, int base
)
885 return toIntegralType
<int64_t>(data
, length
, ok
, base
);
888 uint64_t charactersToUInt64Strict(const UChar
* data
, size_t length
, bool* ok
, int base
)
890 return toIntegralType
<uint64_t>(data
, length
, ok
, base
);
893 intptr_t charactersToIntPtrStrict(const UChar
* data
, size_t length
, bool* ok
, int base
)
895 return toIntegralType
<intptr_t>(data
, length
, ok
, base
);
898 int charactersToInt(const UChar
* data
, size_t length
, bool* ok
)
900 return toIntegralType
<int>(data
, lengthOfCharactersAsInteger(data
, length
), ok
, 10);
903 unsigned charactersToUInt(const UChar
* data
, size_t length
, bool* ok
)
905 return toIntegralType
<unsigned>(data
, lengthOfCharactersAsInteger(data
, length
), ok
, 10);
908 int64_t charactersToInt64(const UChar
* data
, size_t length
, bool* ok
)
910 return toIntegralType
<int64_t>(data
, lengthOfCharactersAsInteger(data
, length
), ok
, 10);
913 uint64_t charactersToUInt64(const UChar
* data
, size_t length
, bool* ok
)
915 return toIntegralType
<uint64_t>(data
, lengthOfCharactersAsInteger(data
, length
), ok
, 10);
918 intptr_t charactersToIntPtr(const UChar
* data
, size_t length
, bool* ok
)
920 return toIntegralType
<intptr_t>(data
, lengthOfCharactersAsInteger(data
, length
), ok
, 10);
923 double charactersToDouble(const UChar
* data
, size_t length
, bool* ok
, bool* didReadNumber
)
929 *didReadNumber
= false;
933 Vector
<char, 256> bytes(length
+ 1);
934 for (unsigned i
= 0; i
< length
; ++i
)
935 bytes
[i
] = data
[i
] < 0x7F ? data
[i
] : '?';
936 bytes
[length
] = '\0';
937 char* start
= bytes
.data();
939 double val
= WTF::strtod(start
, &end
);
941 *ok
= (end
== 0 || *end
== '\0');
943 *didReadNumber
= end
- start
;
947 float charactersToFloat(const UChar
* data
, size_t length
, bool* ok
, bool* didReadNumber
)
949 // FIXME: This will return ok even when the string fits into a double but not a float.
950 return static_cast<float>(charactersToDouble(data
, length
, ok
, didReadNumber
));
953 const String
& emptyString()
955 DEFINE_STATIC_LOCAL(String
, emptyString
, (StringImpl::empty()));
962 // For use in the debugger
963 String
* string(const char*);
964 Vector
<char> asciiDebug(StringImpl
* impl
);
965 Vector
<char> asciiDebug(String
& string
);
967 String
* string(const char* s
)
970 return new String(s
);
973 Vector
<char> asciiDebug(StringImpl
* impl
)
976 return asciiDebug(String("[null]").impl());
979 unsigned length
= impl
->length();
980 const UChar
* characters
= impl
->characters();
982 buffer
.resize(length
+ 1);
983 for (unsigned i
= 0; i
< length
; ++i
) {
984 UChar ch
= characters
[i
];
985 buffer
[i
] = ch
&& (ch
< 0x20 || ch
> 0x7f) ? '?' : ch
;
987 buffer
[length
] = '\0';
992 Vector
<char> asciiDebug(String
& string
)
994 return asciiDebug(string
.impl());