]>
Commit | Line | Data |
---|---|---|
b37bf2e1 A |
1 | // -*- c-basic-offset: 2 -*- |
2 | /* | |
3 | * Copyright (C) 1999-2000 Harri Porten (porten@kde.org) | |
4 | * Copyright (C) 2004, 2005, 2006, 2007 Apple Inc. All rights reserved. | |
5 | * | |
6 | * This library is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU Library General Public | |
8 | * License as published by the Free Software Foundation; either | |
9 | * version 2 of the License, or (at your option) any later version. | |
10 | * | |
11 | * This library is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | * Library General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU Library General Public License | |
17 | * along with this library; see the file COPYING.LIB. If not, write to | |
18 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | |
19 | * Boston, MA 02110-1301, USA. | |
20 | * | |
21 | */ | |
22 | ||
23 | #ifndef _KJS_USTRING_H_ | |
24 | #define _KJS_USTRING_H_ | |
25 | ||
26 | #include "JSLock.h" | |
27 | #include "collector.h" | |
28 | #include <stdint.h> | |
29 | #include <wtf/Assertions.h> | |
30 | #include <wtf/FastMalloc.h> | |
31 | #include <wtf/PassRefPtr.h> | |
32 | #include <wtf/RefPtr.h> | |
33 | #include <wtf/Vector.h> | |
34 | ||
35 | /* On some ARM platforms GCC won't pack structures by default, so sizeof(UChar) | |
36 | ends up being != 2, which causes crashes because the code depends on that size. */ | |
37 | #if COMPILER(GCC) && PLATFORM(FORCE_PACK) | |
38 | #define PACK_STRUCT __attribute__((packed)) | |
39 | #else | |
40 | #define PACK_STRUCT | |
41 | #endif | |
42 | ||
43 | /** | |
44 | * @internal Forward declarations. DOM::DOMString and DOM::AtomicString appear in UString's constructors; KJScript is not referenced elsewhere in this header. | |
45 | */ | |
46 | namespace DOM { | |
47 | class DOMString; | |
48 | class AtomicString; | |
49 | } | |
50 | class KJScript; | |
51 | ||
52 | namespace KJS { | |
53 | ||
54 | using WTF::PlacementNewAdoptType; | |
55 | using WTF::PlacementNewAdopt; | |
56 | ||
57 | class UString; | |
58 | ||
59 | /** | |
60 | * @short Unicode character. | |
61 | * | |
62 | * UChar represents a 16-bit Unicode character. Its internal data | |
63 | * representation is compatible with XChar2b and QChar. It is therefore | |
64 | * possible to exchange data with X and Qt with shallow copies. | |
65 | */ | |
66 | struct UChar { | |
67 | /** | |
68 | * Construct a character with uninitialized value. | |
69 | */ | |
70 | UChar(); | |
71 | /** | |
72 | * Construct a character with the value denoted by the arguments. | |
73 | * @param h higher byte | |
74 | * @param l lower byte | |
75 | */ | |
76 | UChar(unsigned char h , unsigned char l); | |
77 | /** | |
78 | * Construct a character with the given value. | |
79 | * @param u character value: 8 bit for the char overloads, 16 bit Unicode for unsigned short | |
80 | */ | |
81 | UChar(char u); | |
82 | UChar(unsigned char u); | |
83 | UChar(unsigned short u); | |
84 | /** | |
85 | * @return The higher byte of the character. | |
86 | */ | |
87 | unsigned char high() const { return static_cast<unsigned char>(uc >> 8); } | |
88 | /** | |
89 | * @return The lower byte of the character. | |
90 | */ | |
91 | unsigned char low() const { return static_cast<unsigned char>(uc); } | |
92 | /** | |
93 | * @return the 16 bit Unicode value of the character | |
94 | */ | |
95 | unsigned short unicode() const { return uc; } | |
96 | ||
97 | unsigned short uc; | |
98 | } PACK_STRUCT; | |
99 | ||
100 | inline UChar::UChar() { } /* uc is left uninitialized */ | |
101 | inline UChar::UChar(unsigned char h , unsigned char l) : uc(h << 8 | l) { } /* h becomes the high byte, l the low byte */ | |
102 | inline UChar::UChar(char u) : uc((unsigned char)u) { } /* converted through unsigned char first, so there is no sign extension into the high byte */ | |
103 | inline UChar::UChar(unsigned char u) : uc(u) { } | |
104 | inline UChar::UChar(unsigned short u) : uc(u) { } | |
105 | ||
106 | /** | |
107 | * @short 8-bit char based string class holding a char buffer and its length. | |
108 | */ | |
109 | class CString { | |
110 | public: | |
111 | CString() : data(0), length(0) { } /* default: null data pointer, zero length */ | |
112 | CString(const char *c); | |
113 | CString(const char *c, size_t len); | |
114 | CString(const CString &); | |
115 | ||
116 | ~CString(); | |
117 | ||
118 | CString &append(const CString &); | |
119 | CString &operator=(const char *c); | |
120 | CString &operator=(const CString &); | |
121 | CString &operator+=(const CString &c) { return append(c); } | |
122 | ||
123 | size_t size() const { return length; } | |
124 | const char *c_str() const { return data; } /* returns data as-is; a null pointer after default construction */ | |
125 | private: | |
126 | char *data; | |
127 | size_t length; | |
128 | }; | |
129 | ||
130 | /** | |
131 | * @short Unicode string class. Copies share the underlying Rep (see m_rep). | |
132 | */ | |
133 | class UString { | |
134 | friend bool operator==(const UString&, const UString&); | |
135 | ||
136 | public: | |
137 | /** | |
138 | * @internal Reference-counted representation; data() resolves through baseString's buffer using preCapacity and offset. | |
139 | */ | |
140 | struct Rep { | |
141 | ||
142 | static PassRefPtr<Rep> create(UChar *d, int l); | |
143 | static PassRefPtr<Rep> createCopying(const UChar *d, int l); | |
144 | static PassRefPtr<Rep> create(PassRefPtr<Rep> base, int offset, int length); | |
145 | ||
146 | void destroy(); | |
147 | ||
148 | bool baseIsSelf() const { return baseString == this; } | |
149 | UChar* data() const { return baseString->buf + baseString->preCapacity + offset; } | |
150 | int size() const { return len; } | |
151 | ||
152 | unsigned hash() const { if (_hash == 0) _hash = computeHash(data(), len); return _hash; } /* computed on first use; a stored 0 means "not computed yet" */ | |
153 | unsigned computedHash() const { ASSERT(_hash); return _hash; } // fast path for Identifiers | |
154 | ||
155 | static unsigned computeHash(const UChar *, int length); | |
156 | static unsigned computeHash(const char *); | |
157 | ||
158 | Rep* ref() { ASSERT(JSLock::lockCount() > 0); ++rc; return this; } | |
159 | ALWAYS_INLINE void deref() { ASSERT(JSLock::lockCount() > 0); if (--rc == 0) destroy(); } | |
160 | ||
161 | // unshared data | |
162 | int offset; | |
163 | int len; | |
164 | int rc; | |
165 | mutable unsigned _hash; | |
166 | bool isIdentifier; | |
167 | UString::Rep* baseString; | |
168 | size_t reportedCost; | |
169 | ||
170 | // potentially shared data | |
171 | UChar *buf; | |
172 | int usedCapacity; | |
173 | int capacity; | |
174 | int usedPreCapacity; | |
175 | int preCapacity; | |
176 | ||
177 | static Rep null; | |
178 | static Rep empty; | |
179 | }; | |
180 | ||
181 | public: | |
182 | ||
183 | /** | |
184 | * Constructs a null string. | |
185 | */ | |
186 | UString(); | |
187 | /** | |
188 | * Constructs a string from a classical zero-terminated char string. | |
189 | */ | |
190 | UString(const char *c); | |
191 | /** | |
192 | * Constructs a string from an array of Unicode characters of the specified | |
193 | * length. | |
194 | */ | |
195 | UString(const UChar *c, int length); | |
196 | /** | |
197 | * If copy is false, the string data will be adopted. | |
198 | * That means that the data will NOT be copied and the pointer will | |
199 | * be deleted when the UString object is modified or destroyed. | |
200 | * If copy is true, a deep copy is made. | |
201 | */ | |
202 | UString(UChar *c, int length, bool copy); | |
203 | /** | |
204 | * Copy constructor. Makes a shallow copy only. | |
205 | */ | |
206 | UString(const UString &s) : m_rep(s.m_rep) {} | |
207 | ||
208 | UString(const Vector<UChar>& buffer); | |
209 | ||
210 | /** | |
211 | * Convenience declaration only! You are on your own to write the | |
212 | * implementation for construction from DOM::DOMString. | |
213 | * | |
214 | * Note: feel free to contact me if you want to see a dummy header for | |
215 | * your favorite FooString class here! | |
216 | */ | |
217 | UString(const DOM::DOMString&); | |
218 | /** | |
219 | * Convenience declaration only! See UString(const DOM::DOMString&). | |
220 | */ | |
221 | UString(const DOM::AtomicString&); | |
222 | ||
223 | /** | |
224 | * Concatenation constructor. Makes operator+ more efficient. | |
225 | */ | |
226 | UString(const UString &, const UString &); | |
227 | /** | |
228 | * Destructor. | |
229 | */ | |
230 | ~UString() {} | |
231 | ||
232 | // Special constructor for cases where we overwrite an object in place. | |
233 | UString(PlacementNewAdoptType) : m_rep(PlacementNewAdopt) { } | |
234 | ||
235 | /** | |
236 | * Constructs a string from an int. | |
237 | */ | |
238 | static UString from(int i); | |
239 | /** | |
240 | * Constructs a string from an unsigned int. | |
241 | */ | |
242 | static UString from(unsigned int u); | |
243 | /** | |
244 | * Constructs a string from a long int. | |
245 | */ | |
246 | static UString from(long u); | |
247 | /** | |
248 | * Constructs a string from a double. | |
249 | */ | |
250 | static UString from(double d); | |
251 | ||
252 | struct Range { | |
253 | public: | |
254 | Range(int pos, int len) : position(pos), length(len) {} | |
255 | Range() {} | |
256 | int position; | |
257 | int length; | |
258 | }; | |
259 | ||
260 | UString spliceSubstringsWithSeparators(const Range *substringRanges, int rangeCount, const UString *separators, int separatorCount) const; | |
261 | ||
262 | /** | |
263 | * Append another string. | |
264 | */ | |
265 | UString &append(const UString &); | |
266 | UString &append(const char *); | |
267 | UString &append(unsigned short); | |
268 | UString &append(char c) { return append(static_cast<unsigned short>(static_cast<unsigned char>(c))); } | |
269 | UString &append(UChar c) { return append(c.uc); } | |
270 | ||
271 | /** | |
272 | * @return The string converted to the 8-bit string type CString(). | |
273 | * This method is not Unicode safe and shouldn't be used unless the string | |
274 | * is known to be ASCII. | |
275 | */ | |
276 | CString cstring() const; | |
277 | /** | |
278 | * Convert the Unicode string to plain ASCII chars, chopping off any higher | |
279 | * bytes. This method should only be used for *debugging* purposes as it | |
280 | * is neither Unicode safe nor free from side effects. In order not to | |
281 | * waste any memory the char buffer is static and *shared* by all UString | |
282 | * instances. | |
283 | */ | |
284 | char *ascii() const; | |
285 | ||
286 | /** | |
287 | * Convert the string to UTF-8, assuming it is UTF-16 encoded. | |
288 | * In non-strict mode, this function is tolerant of badly formed UTF-16: it | |
289 | * can create UTF-8 strings that are invalid because they have characters in | |
290 | * the range U+D800-U+DFFF, U+FFFE, or U+FFFF, but the UTF-8 string is | |
291 | * guaranteed to be otherwise valid. | |
292 | * In strict mode, an error is reported by returning a null CString. | |
293 | */ | |
294 | CString UTF8String(bool strict = false) const; | |
295 | ||
296 | /** | |
297 | * @see UString(const DOM::DOMString&). | |
298 | */ | |
299 | DOM::DOMString domString() const; | |
300 | ||
301 | /** | |
302 | * Assignment operator. | |
303 | */ | |
304 | UString &operator=(const char *c); | |
305 | /** | |
306 | * Appends the specified string. | |
307 | */ | |
308 | UString &operator+=(const UString &s) { return append(s); } | |
309 | UString &operator+=(const char *s) { return append(s); } | |
310 | ||
311 | /** | |
312 | * @return A pointer to the internal Unicode data. | |
313 | */ | |
314 | const UChar* data() const { return m_rep->data(); } | |
315 | /** | |
316 | * @return True if null. | |
317 | */ | |
318 | bool isNull() const { return (m_rep == &Rep::null); } | |
319 | /** | |
320 | * @return True if null or zero length. | |
321 | */ | |
322 | bool isEmpty() const { return (!m_rep->len); } | |
323 | /** | |
324 | * Use this if you want to make sure that this string is a plain ASCII | |
325 | * string. For example, if you don't want to lose any information when | |
326 | * using cstring() or ascii(). | |
327 | * | |
328 | * @return True if the string doesn't contain any non-ASCII characters. | |
329 | */ | |
330 | bool is8Bit() const; | |
331 | /** | |
332 | * @return The length of the string. | |
333 | */ | |
334 | int size() const { return m_rep->size(); } | |
335 | /** | |
336 | * Const character at specified position. | |
337 | */ | |
338 | const UChar operator[](int pos) const; | |
339 | ||
340 | /** | |
341 | * Attempts a conversion to a number. Apart from floating point numbers, | |
342 | * the algorithm will recognize hexadecimal representations (as | |
343 | * indicated by a 0x or 0X prefix) and +/- Infinity. | |
344 | * Returns NaN if the conversion failed. | |
345 | * @param tolerateTrailingJunk if true, toDouble can tolerate garbage after the number. | |
346 | * @param tolerateEmptyString if false, toDouble will turn an empty string into NaN rather than 0. | |
347 | */ | |
348 | double toDouble(bool tolerateTrailingJunk, bool tolerateEmptyString) const; | |
349 | double toDouble(bool tolerateTrailingJunk) const; | |
350 | double toDouble() const; | |
351 | ||
352 | /** | |
353 | * Attempts a conversion to a 32-bit integer. ok will be set | |
354 | * according to the success. | |
355 | * @param tolerateEmptyString if false, toUInt32 will return false for *ok for an empty string. | |
356 | */ | |
357 | uint32_t toUInt32(bool *ok = 0) const; | |
358 | uint32_t toUInt32(bool *ok, bool tolerateEmptyString) const; | |
359 | uint32_t toStrictUInt32(bool *ok = 0) const; | |
360 | ||
361 | /** | |
362 | * Attempts a conversion to an array index. The "ok" boolean will be set | |
363 | * to true if it is a valid array index according to the rule from | |
364 | * ECMA-262 15.4 about what an array index is. It must exactly match the string | |
365 | * form of an unsigned integer, and be less than 2^32 - 1. | |
366 | */ | |
367 | unsigned toArrayIndex(bool *ok = 0) const; | |
368 | ||
369 | /** | |
370 | * @return Position of first occurrence of f starting at position pos. | |
371 | * -1 if the search was not successful. | |
372 | */ | |
373 | int find(const UString &f, int pos = 0) const; | |
374 | int find(UChar, int pos = 0) const; | |
375 | /** | |
376 | * @return Position of first occurrence of f searching backwards from | |
377 | * position pos. | |
378 | * -1 if the search was not successful. | |
379 | */ | |
380 | int rfind(const UString &f, int pos) const; | |
381 | int rfind(UChar, int pos) const; | |
382 | /** | |
383 | * @return The substring starting at position pos with length len. | |
384 | */ | |
385 | UString substr(int pos = 0, int len = -1) const; | |
386 | /** | |
387 | * Static instance of a null string. | |
388 | */ | |
389 | static const UString &null(); | |
390 | ||
391 | Rep* rep() const { return m_rep.get(); } | |
392 | UString(PassRefPtr<Rep> r) : m_rep(r) { ASSERT(m_rep); } | |
393 | ||
394 | size_t cost() const; | |
395 | ||
396 | private: | |
397 | size_t expandedSize(size_t size, size_t otherSize) const; | |
398 | int usedCapacity() const; | |
399 | int usedPreCapacity() const; | |
400 | void expandCapacity(int requiredLength); | |
401 | void expandPreCapacity(int requiredPreCap); | |
402 | ||
403 | RefPtr<Rep> m_rep; | |
404 | }; | |
405 | ||
406 | inline bool operator==(const UChar &c1, const UChar &c2) { | |
407 | return (c1.uc == c2.uc); | |
408 | } | |
409 | bool operator==(const UString& s1, const UString& s2); | |
410 | inline bool operator!=(const UString& s1, const UString& s2) { | |
411 | return !KJS::operator==(s1, s2); | |
412 | } | |
413 | bool operator<(const UString& s1, const UString& s2); | |
414 | bool operator==(const UString& s1, const char *s2); | |
415 | inline bool operator!=(const UString& s1, const char *s2) { | |
416 | return !KJS::operator==(s1, s2); | |
417 | } | |
418 | inline bool operator==(const char *s1, const UString& s2) { /* forwards to operator==(const UString&, const char*) with the arguments swapped */ | |
419 | return operator==(s2, s1); | |
420 | } | |
421 | inline bool operator!=(const char *s1, const UString& s2) { | |
422 | return !KJS::operator==(s1, s2); | |
423 | } | |
424 | bool operator==(const CString& s1, const CString& s2); | |
425 | inline UString operator+(const UString& s1, const UString& s2) { /* uses the concatenation constructor UString(const UString&, const UString&) */ | |
426 | return UString(s1, s2); | |
427 | } | |
428 | ||
429 | int compare(const UString &, const UString &); | |
430 | ||
431 | inline UString::UString() | |
432 | : m_rep(&Rep::null) /* a default-constructed string points at the static null Rep, so isNull() is true */ | |
433 | { | |
434 | } | |
435 | ||
436 | // Rule from ECMA-262 15.4 about what an array index is. | |
437 | // Must exactly match string form of an unsigned integer, and be less than 2^32 - 1. | |
438 | inline unsigned UString::toArrayIndex(bool *ok) const | |
439 | { | |
440 | unsigned i = toStrictUInt32(ok); | |
441 | if (ok && i >= 0xFFFFFFFFU) /* 2^32 - 1 itself is rejected; ok may be a null pointer */ | |
442 | *ok = false; | |
443 | return i; | |
444 | } | |
445 | ||
446 | // We'd rather not do shared substring append for small strings, since | |
447 | // this runs too much risk of a tiny initial string holding down a | |
448 | // huge buffer. The threshold is Collector::minExtraCostSize divided by sizeof(UChar). | |
449 | // FIXME: this should be size_t but that would cause warnings until we | |
450 | // fix UString sizes to be size_t instead of int | |
451 | static const int minShareSize = Collector::minExtraCostSize / sizeof(UChar); | |
452 | ||
453 | inline size_t UString::cost() const /* returns how far the base string's capacity exceeds its recorded reportedCost, or 0 when that is below the threshold */ | |
454 | { | |
455 | size_t capacity = (m_rep->baseString->capacity + m_rep->baseString->preCapacity) * sizeof(UChar); /* capacity plus preCapacity of the base string, scaled by sizeof(UChar) */ | |
456 | size_t reportedCost = m_rep->baseString->reportedCost; | |
457 | ASSERT(capacity >= reportedCost); | |
458 | ||
459 | size_t capacityDelta = capacity - reportedCost; /* portion not yet covered by reportedCost */ | |
460 | ||
461 | if (capacityDelta < static_cast<size_t>(minShareSize)) /* NOTE(review): capacityDelta is scaled by sizeof(UChar) while minShareSize is divided by it - confirm the units are meant to match */ | |
462 | return 0; | |
463 | ||
464 | m_rep->baseString->reportedCost = capacity; /* record the new total so a later call only returns further growth */ | |
465 | return capacityDelta; | |
466 | } | |
467 | ||
468 | } // namespace | |
469 | ||
470 | #endif |