]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ****************************************************************************** | |
3 | * Copyright (C) 1999-2003, International Business Machines Corporation and * | |
4 | * others. All Rights Reserved. * | |
5 | ****************************************************************************** | |
6 | * | |
7 | * File unistr.cpp | |
8 | * | |
9 | * Modification History: | |
10 | * | |
11 | * Date Name Description | |
12 | * 09/25/98 stephen Creation. | |
13 | * 04/20/99 stephen Overhauled per 4/16 code review. | |
14 | * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX | |
15 | * 11/18/99 aliu Added handleReplaceBetween() to make inherit from | |
16 | * Replaceable. | |
17 | * 06/25/01 grhoten Removed the dependency on iostream | |
18 | ****************************************************************************** | |
19 | */ | |
20 | ||
21 | #include "unicode/utypes.h" | |
22 | #include "unicode/putil.h" | |
23 | #include "unicode/locid.h" | |
24 | #include "cstring.h" | |
25 | #include "cmemory.h" | |
26 | #include "unicode/ustring.h" | |
27 | #include "unicode/unistr.h" | |
28 | #include "unicode/uchar.h" | |
29 | #include "unicode/ucnv.h" | |
30 | #include "unicode/ubrk.h" | |
31 | #include "uhash.h" | |
32 | #include "ustr_imp.h" | |
33 | #include "unormimp.h" | |
34 | #include "umutex.h" | |
35 | ||
36 | #if 0 | |
37 | ||
38 | #if U_IOSTREAM_SOURCE >= 199711 | |
39 | #include <iostream> | |
40 | using namespace std; | |
41 | #elif U_IOSTREAM_SOURCE >= 198506 | |
42 | #include <iostream.h> | |
43 | #endif | |
44 | ||
45 | //DEBUGGING | |
46 | void | |
47 | print(const UnicodeString& s, | |
48 | const char *name) | |
49 | { | |
50 | UChar c; | |
51 | cout << name << ":|"; | |
52 | for(int i = 0; i < s.length(); ++i) { | |
53 | c = s[i]; | |
54 | if(c>= 0x007E || c < 0x0020) | |
55 | cout << "[0x" << hex << s[i] << "]"; | |
56 | else | |
57 | cout << (char) s[i]; | |
58 | } | |
59 | cout << '|' << endl; | |
60 | } | |
61 | ||
62 | void | |
63 | print(const UChar *s, | |
64 | int32_t len, | |
65 | const char *name) | |
66 | { | |
67 | UChar c; | |
68 | cout << name << ":|"; | |
69 | for(int i = 0; i < len; ++i) { | |
70 | c = s[i]; | |
71 | if(c>= 0x007E || c < 0x0020) | |
72 | cout << "[0x" << hex << s[i] << "]"; | |
73 | else | |
74 | cout << (char) s[i]; | |
75 | } | |
76 | cout << '|' << endl; | |
77 | } | |
78 | // END DEBUGGING | |
79 | #endif | |
80 | ||
81 | // Local function definitions for now | |
82 | ||
83 | // need to copy areas that may overlap | |
84 | static | |
85 | inline void | |
86 | us_arrayCopy(const UChar *src, int32_t srcStart, | |
87 | UChar *dst, int32_t dstStart, int32_t count) | |
88 | { | |
89 | if(count>0) { | |
90 | uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src))); | |
91 | } | |
92 | } | |
93 | ||
94 | // u_unescapeAt() callback to get a UChar from a UnicodeString | |
95 | U_CDECL_BEGIN | |
96 | static UChar U_CALLCONV | |
97 | UnicodeString_charAt(int32_t offset, void *context) { | |
98 | return ((UnicodeString*) context)->charAt(offset); | |
99 | } | |
100 | U_CDECL_END | |
101 | ||
102 | U_NAMESPACE_BEGIN | |
103 | ||
// Definition of the static class member fgClassID.
// NOTE(review): presumably its address serves as the class ID - confirm in unistr.h.
const char UnicodeString::fgClassID=0;
105 | ||
106 | //======================================== | |
107 | // Reference Counting functions, put at top of file so that optimizing compilers | |
108 | // have a chance to automatically inline. | |
109 | //======================================== | |
110 | ||
111 | void | |
112 | UnicodeString::addRef() | |
113 | { umtx_atomic_inc((int32_t *)fArray - 1);} | |
114 | ||
115 | int32_t | |
116 | UnicodeString::removeRef() | |
117 | { return umtx_atomic_dec((int32_t *)fArray - 1);} | |
118 | ||
119 | int32_t | |
120 | UnicodeString::refCount() const | |
121 | { | |
122 | umtx_lock(NULL); | |
123 | // Note: without the lock to force a memory barrier, we might see a very | |
124 | // stale value on some multi-processor systems. | |
125 | int32_t count = *((int32_t *)fArray - 1); | |
126 | umtx_unlock(NULL); | |
127 | return count; | |
128 | } | |
129 | ||
130 | void | |
131 | UnicodeString::releaseArray() { | |
132 | if((fFlags & kRefCounted) && removeRef() == 0) { | |
133 | uprv_free((int32_t *)fArray - 1); | |
134 | } | |
135 | } | |
136 | ||
137 | ||
138 | ||
139 | //======================================== | |
140 | // Constructors | |
141 | //======================================== | |
142 | UnicodeString::UnicodeString() | |
143 | : fLength(0), | |
144 | fCapacity(US_STACKBUF_SIZE), | |
145 | fArray(fStackBuffer), | |
146 | fFlags(kShortString) | |
147 | {} | |
148 | ||
149 | UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) | |
150 | : fLength(0), | |
151 | fCapacity(US_STACKBUF_SIZE), | |
152 | fArray(0), | |
153 | fFlags(0) | |
154 | { | |
155 | if(count <= 0 || (uint32_t)c > 0x10ffff) { | |
156 | // just allocate and do not do anything else | |
157 | allocate(capacity); | |
158 | } else { | |
159 | // count > 0, allocate and fill the new string with count c's | |
160 | int32_t unitCount = UTF_CHAR_LENGTH(c), length = count * unitCount; | |
161 | if(capacity < length) { | |
162 | capacity = length; | |
163 | } | |
164 | if(allocate(capacity)) { | |
165 | int32_t i = 0; | |
166 | ||
167 | // fill the new string with c | |
168 | if(unitCount == 1) { | |
169 | // fill with length UChars | |
170 | while(i < length) { | |
171 | fArray[i++] = (UChar)c; | |
172 | } | |
173 | } else { | |
174 | // get the code units for c | |
175 | UChar units[UTF_MAX_CHAR_LENGTH]; | |
176 | UTF_APPEND_CHAR_UNSAFE(units, i, c); | |
177 | ||
178 | // now it must be i==unitCount | |
179 | i = 0; | |
180 | ||
181 | // for Unicode, unitCount can only be 1, 2, 3, or 4 | |
182 | // 1 is handled above | |
183 | while(i < length) { | |
184 | int32_t unitIdx = 0; | |
185 | while(unitIdx < unitCount) { | |
186 | fArray[i++]=units[unitIdx++]; | |
187 | } | |
188 | } | |
189 | } | |
190 | } | |
191 | fLength = length; | |
192 | } | |
193 | } | |
194 | ||
195 | UnicodeString::UnicodeString(UChar ch) | |
196 | : fLength(1), | |
197 | fCapacity(US_STACKBUF_SIZE), | |
198 | fArray(fStackBuffer), | |
199 | fFlags(kShortString) | |
200 | { | |
201 | fStackBuffer[0] = ch; | |
202 | } | |
203 | ||
204 | UnicodeString::UnicodeString(UChar32 ch) | |
205 | : fLength(1), | |
206 | fCapacity(US_STACKBUF_SIZE), | |
207 | fArray(fStackBuffer), | |
208 | fFlags(kShortString) | |
209 | { | |
210 | int32_t i = 0; | |
211 | UBool isError = FALSE; | |
212 | U16_APPEND(fStackBuffer, i, US_STACKBUF_SIZE, ch, isError); | |
213 | fLength = i; | |
214 | } | |
215 | ||
216 | UnicodeString::UnicodeString(const UChar *text) | |
217 | : fLength(0), | |
218 | fCapacity(US_STACKBUF_SIZE), | |
219 | fArray(fStackBuffer), | |
220 | fFlags(kShortString) | |
221 | { | |
222 | doReplace(0, 0, text, 0, -1); | |
223 | } | |
224 | ||
225 | UnicodeString::UnicodeString(const UChar *text, | |
226 | int32_t textLength) | |
227 | : fLength(0), | |
228 | fCapacity(US_STACKBUF_SIZE), | |
229 | fArray(fStackBuffer), | |
230 | fFlags(kShortString) | |
231 | { | |
232 | doReplace(0, 0, text, 0, textLength); | |
233 | } | |
234 | ||
235 | UnicodeString::UnicodeString(UBool isTerminated, | |
236 | const UChar *text, | |
237 | int32_t textLength) | |
238 | : fLength(textLength), | |
239 | fCapacity(isTerminated ? textLength + 1 : textLength), | |
240 | fArray((UChar *)text), | |
241 | fFlags(kReadonlyAlias) | |
242 | { | |
243 | if(text == NULL) { | |
244 | // treat as an empty string, do not alias | |
245 | fLength = 0; | |
246 | fCapacity = US_STACKBUF_SIZE; | |
247 | fArray = fStackBuffer; | |
248 | fFlags = kShortString; | |
249 | } else if(textLength < -1 || | |
250 | (textLength == -1 && !isTerminated) || | |
251 | (textLength >= 0 && isTerminated && text[textLength] != 0) | |
252 | ) { | |
253 | setToBogus(); | |
254 | } else if(textLength == -1) { | |
255 | // text is terminated, or else it would have failed the above test | |
256 | fLength = u_strlen(text); | |
257 | fCapacity = fLength + 1; | |
258 | } | |
259 | } | |
260 | ||
261 | UnicodeString::UnicodeString(UChar *buff, | |
262 | int32_t buffLength, | |
263 | int32_t buffCapacity) | |
264 | : fLength(buffLength), | |
265 | fCapacity(buffCapacity), | |
266 | fArray(buff), | |
267 | fFlags(kWritableAlias) | |
268 | { | |
269 | if(buff == NULL) { | |
270 | // treat as an empty string, do not alias | |
271 | fLength = 0; | |
272 | fCapacity = US_STACKBUF_SIZE; | |
273 | fArray = fStackBuffer; | |
274 | fFlags = kShortString; | |
275 | } else if(buffLength < -1 || buffLength > buffCapacity) { | |
276 | setToBogus(); | |
277 | } else if(buffLength == -1) { | |
278 | // fLength = u_strlen(buff); but do not look beyond buffCapacity | |
279 | const UChar *p = buff, *limit = buff + buffCapacity; | |
280 | while(p != limit && *p != 0) { | |
281 | ++p; | |
282 | } | |
283 | fLength = (int32_t)(p - buff); | |
284 | } | |
285 | } | |
286 | ||
287 | UnicodeString::UnicodeString(const char *codepageData, | |
288 | const char *codepage) | |
289 | : fLength(0), | |
290 | fCapacity(US_STACKBUF_SIZE), | |
291 | fArray(fStackBuffer), | |
292 | fFlags(kShortString) | |
293 | { | |
294 | if(codepageData != 0) { | |
295 | doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage); | |
296 | } | |
297 | } | |
298 | ||
299 | ||
300 | UnicodeString::UnicodeString(const char *codepageData, | |
301 | int32_t dataLength, | |
302 | const char *codepage) | |
303 | : fLength(0), | |
304 | fCapacity(US_STACKBUF_SIZE), | |
305 | fArray(fStackBuffer), | |
306 | fFlags(kShortString) | |
307 | { | |
308 | if(codepageData != 0) { | |
309 | doCodepageCreate(codepageData, dataLength, codepage); | |
310 | } | |
311 | } | |
312 | ||
313 | UnicodeString::UnicodeString(const char *src, int32_t srcLength, | |
314 | UConverter *cnv, | |
315 | UErrorCode &errorCode) | |
316 | : fLength(0), | |
317 | fCapacity(US_STACKBUF_SIZE), | |
318 | fArray(fStackBuffer), | |
319 | fFlags(kShortString) | |
320 | { | |
321 | if(U_SUCCESS(errorCode)) { | |
322 | // check arguments | |
323 | if(src==NULL) { | |
324 | // treat as an empty string, do nothing more | |
325 | } else if(srcLength<-1) { | |
326 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
327 | } else { | |
328 | // get input length | |
329 | if(srcLength==-1) { | |
330 | srcLength=(int32_t)uprv_strlen(src); | |
331 | } | |
332 | if(srcLength>0) { | |
333 | if(cnv!=0) { | |
334 | // use the provided converter | |
335 | ucnv_resetToUnicode(cnv); | |
336 | doCodepageCreate(src, srcLength, cnv, errorCode); | |
337 | } else { | |
338 | // use the default converter | |
339 | cnv=u_getDefaultConverter(&errorCode); | |
340 | doCodepageCreate(src, srcLength, cnv, errorCode); | |
341 | u_releaseDefaultConverter(cnv); | |
342 | } | |
343 | } | |
344 | } | |
345 | ||
346 | if(U_FAILURE(errorCode)) { | |
347 | setToBogus(); | |
348 | } | |
349 | } | |
350 | } | |
351 | ||
352 | UnicodeString::UnicodeString(const UnicodeString& that) | |
353 | : Replaceable(), | |
354 | fLength(0), | |
355 | fCapacity(US_STACKBUF_SIZE), | |
356 | fArray(fStackBuffer), | |
357 | fFlags(kShortString) | |
358 | { | |
359 | copyFrom(that); | |
360 | } | |
361 | ||
362 | UnicodeString::UnicodeString(const UnicodeString& that, | |
363 | int32_t srcStart) | |
364 | : Replaceable(), | |
365 | fLength(0), | |
366 | fCapacity(US_STACKBUF_SIZE), | |
367 | fArray(fStackBuffer), | |
368 | fFlags(kShortString) | |
369 | { | |
370 | setTo(that, srcStart); | |
371 | } | |
372 | ||
373 | UnicodeString::UnicodeString(const UnicodeString& that, | |
374 | int32_t srcStart, | |
375 | int32_t srcLength) | |
376 | : Replaceable(), | |
377 | fLength(0), | |
378 | fCapacity(US_STACKBUF_SIZE), | |
379 | fArray(fStackBuffer), | |
380 | fFlags(kShortString) | |
381 | { | |
382 | setTo(that, srcStart, srcLength); | |
383 | } | |
384 | ||
385 | // Replaceable base class clone() default implementation, does not clone | |
386 | Replaceable * | |
387 | Replaceable::clone() const { | |
388 | return NULL; | |
389 | } | |
390 | ||
391 | // UnicodeString overrides clone() with a real implementation | |
392 | Replaceable * | |
393 | UnicodeString::clone() const { | |
394 | return new UnicodeString(*this); | |
395 | } | |
396 | ||
397 | //======================================== | |
398 | // array allocation | |
399 | //======================================== | |
400 | ||
401 | UBool | |
402 | UnicodeString::allocate(int32_t capacity) { | |
403 | if(capacity <= US_STACKBUF_SIZE) { | |
404 | fArray = fStackBuffer; | |
405 | fCapacity = US_STACKBUF_SIZE; | |
406 | fFlags = kShortString; | |
407 | } else { | |
408 | // count bytes for the refCounter and the string capacity, and | |
409 | // round up to a multiple of 16; then divide by 4 and allocate int32_t's | |
410 | // to be safely aligned for the refCount | |
411 | int32_t words = (int32_t)(((sizeof(int32_t) + capacity * U_SIZEOF_UCHAR + 15) & ~15) >> 2); | |
412 | int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words ); | |
413 | if(array != 0) { | |
414 | // set initial refCount and point behind the refCount | |
415 | *array++ = 1; | |
416 | ||
417 | // have fArray point to the first UChar | |
418 | fArray = (UChar *)array; | |
419 | fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR)); | |
420 | fFlags = kLongString; | |
421 | } else { | |
422 | fLength = 0; | |
423 | fCapacity = 0; | |
424 | fFlags = kIsBogus; | |
425 | return FALSE; | |
426 | } | |
427 | } | |
428 | return TRUE; | |
429 | } | |
430 | ||
431 | //======================================== | |
432 | // Destructor | |
433 | //======================================== | |
434 | UnicodeString::~UnicodeString() | |
435 | { | |
436 | releaseArray(); | |
437 | } | |
438 | ||
439 | ||
440 | //======================================== | |
441 | // Assignment | |
442 | //======================================== | |
443 | ||
444 | UnicodeString & | |
445 | UnicodeString::operator=(const UnicodeString &src) { | |
446 | return copyFrom(src); | |
447 | } | |
448 | ||
449 | UnicodeString & | |
450 | UnicodeString::fastCopyFrom(const UnicodeString &src) { | |
451 | return copyFrom(src, TRUE); | |
452 | } | |
453 | ||
// Makes this string a copy of src and returns *this.
//   - self-assignment: nothing happens
//   - bogus src: this string becomes bogus
//   - empty src: this string becomes an empty stack-buffer string
//   - short (stack buffer) src: the UChars are copied into our stack buffer
//   - long (reference-counted) src: the buffer is shared, its refCount is incremented
//   - read-only alias src: aliased as well if fastCopy, otherwise copied
//   - writable alias src: always copied into a newly allocated buffer
// If a needed allocation fails, this string becomes bogus.
UnicodeString &
UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
  // if assigning to ourselves, do nothing
  if(this == 0 || this == &src) {
    return *this;
  }

  // is the right side bogus?
  if(&src == 0 || src.isBogus()) {
    setToBogus();
    return *this;
  }

  // delete the current contents
  releaseArray();

  // we always copy the length
  fLength = src.fLength;
  if(fLength == 0) {
    // empty string - use the stack buffer
    fArray = fStackBuffer;
    fCapacity = US_STACKBUF_SIZE;
    fFlags = kShortString;
    return *this;
  }

  // fLength>0 and not an "open" src.getBuffer(minCapacity)
  switch(src.fFlags) {
  case kShortString:
    // short string using the stack buffer, do the same
    fArray = fStackBuffer;
    fCapacity = US_STACKBUF_SIZE;
    fFlags = kShortString;
    uprv_memcpy(fStackBuffer, src.fArray, fLength * U_SIZEOF_UCHAR);
    break;
  case kLongString:
    // src uses a refCounted string buffer, use that buffer with refCount
    // src is const, use a cast - we don't really change it
    ((UnicodeString &)src).addRef();
    // copy all fields, share the reference-counted buffer
    fArray = src.fArray;
    fCapacity = src.fCapacity;
    fFlags = src.fFlags;
    break;
  case kReadonlyAlias:
    if(fastCopy) {
      // src is a readonly alias, do the same
      // -> maintain the readonly alias as such
      fArray = src.fArray;
      fCapacity = src.fCapacity;
      fFlags = src.fFlags;
      break;
    }
    // else if(!fastCopy) fall through to case kWritableAlias
    // -> allocate a new buffer and copy the contents
  case kWritableAlias:
    // src is a writable alias; we make a copy of that instead
    if(allocate(fLength)) {
      uprv_memcpy(fArray, src.fArray, fLength * U_SIZEOF_UCHAR);
      break;
    }
    // if there is not enough memory, then fall through to setting to bogus
  default:
    // if src is bogus, set ourselves to bogus
    // do not call setToBogus() here because fArray and fFlags are not consistent here
    // (setToBogus() starts with releaseArray(), which must not run on these fields)
    fArray = 0;
    fLength = 0;
    fCapacity = 0;
    fFlags = kIsBogus;
    break;
  }

  return *this;
}
528 | ||
529 | //======================================== | |
530 | // Miscellaneous operations | |
531 | //======================================== | |
532 | ||
533 | UnicodeString UnicodeString::unescape() const { | |
534 | UnicodeString result; | |
535 | for (int32_t i=0; i<length(); ) { | |
536 | UChar32 c = charAt(i++); | |
537 | if (c == 0x005C /*'\\'*/) { | |
538 | c = unescapeAt(i); // advances i | |
539 | if (c == (UChar32)0xFFFFFFFF) { | |
540 | result.remove(); // return empty string | |
541 | break; // invalid escape sequence | |
542 | } | |
543 | } | |
544 | result.append(c); | |
545 | } | |
546 | return result; | |
547 | } | |
548 | ||
549 | UChar32 UnicodeString::unescapeAt(int32_t &offset) const { | |
550 | return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this); | |
551 | } | |
552 | ||
553 | //======================================== | |
554 | // Read-only implementation | |
555 | //======================================== | |
556 | int8_t | |
557 | UnicodeString::doCompare( int32_t start, | |
558 | int32_t length, | |
559 | const UChar *srcChars, | |
560 | int32_t srcStart, | |
561 | int32_t srcLength) const | |
562 | { | |
563 | // compare illegal string values | |
564 | // treat const UChar *srcChars==NULL as an empty string | |
565 | if(isBogus()) { | |
566 | return -1; | |
567 | } | |
568 | ||
569 | // pin indices to legal values | |
570 | pinIndices(start, length); | |
571 | ||
572 | if(srcChars == NULL) { | |
573 | srcStart = srcLength = 0; | |
574 | } | |
575 | ||
576 | // get the correct pointer | |
577 | const UChar *chars = getArrayStart(); | |
578 | ||
579 | chars += start; | |
580 | srcChars += srcStart; | |
581 | ||
582 | int32_t minLength; | |
583 | int8_t lengthResult; | |
584 | ||
585 | // get the srcLength if necessary | |
586 | if(srcLength < 0) { | |
587 | srcLength = u_strlen(srcChars + srcStart); | |
588 | } | |
589 | ||
590 | // are we comparing different lengths? | |
591 | if(length != srcLength) { | |
592 | if(length < srcLength) { | |
593 | minLength = length; | |
594 | lengthResult = -1; | |
595 | } else { | |
596 | minLength = srcLength; | |
597 | lengthResult = 1; | |
598 | } | |
599 | } else { | |
600 | minLength = length; | |
601 | lengthResult = 0; | |
602 | } | |
603 | ||
604 | /* | |
605 | * note that uprv_memcmp() returns an int but we return an int8_t; | |
606 | * we need to take care not to truncate the result - | |
607 | * one way to do this is to right-shift the value to | |
608 | * move the sign bit into the lower 8 bits and making sure that this | |
609 | * does not become 0 itself | |
610 | */ | |
611 | ||
612 | if(minLength > 0 && chars != srcChars) { | |
613 | int32_t result; | |
614 | ||
615 | # if U_IS_BIG_ENDIAN | |
616 | // big-endian: byte comparison works | |
617 | result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar)); | |
618 | if(result != 0) { | |
619 | return (int8_t)(result >> 15 | 1); | |
620 | } | |
621 | # else | |
622 | // little-endian: compare UChar units | |
623 | do { | |
624 | result = ((int32_t)*(chars++) - (int32_t)*(srcChars++)); | |
625 | if(result != 0) { | |
626 | return (int8_t)(result >> 15 | 1); | |
627 | } | |
628 | } while(--minLength > 0); | |
629 | # endif | |
630 | } | |
631 | return lengthResult; | |
632 | } | |
633 | ||
634 | /* String compare in code point order - doCompare() compares in code unit order. */ | |
635 | int8_t | |
636 | UnicodeString::doCompareCodePointOrder(int32_t start, | |
637 | int32_t length, | |
638 | const UChar *srcChars, | |
639 | int32_t srcStart, | |
640 | int32_t srcLength) const | |
641 | { | |
642 | // compare illegal string values | |
643 | // treat const UChar *srcChars==NULL as an empty string | |
644 | if(isBogus()) { | |
645 | return -1; | |
646 | } | |
647 | ||
648 | // pin indices to legal values | |
649 | pinIndices(start, length); | |
650 | ||
651 | if(srcChars == NULL) { | |
652 | srcStart = srcLength = 0; | |
653 | } | |
654 | ||
655 | int32_t diff = uprv_strCompare(fArray + start, length, srcChars + srcStart, srcLength, FALSE, TRUE); | |
656 | /* translate the 32-bit result into an 8-bit one */ | |
657 | if(diff!=0) { | |
658 | return (int8_t)(diff >> 15 | 1); | |
659 | } else { | |
660 | return 0; | |
661 | } | |
662 | } | |
663 | ||
664 | int8_t | |
665 | UnicodeString::doCaseCompare(int32_t start, | |
666 | int32_t length, | |
667 | const UChar *srcChars, | |
668 | int32_t srcStart, | |
669 | int32_t srcLength, | |
670 | uint32_t options) const | |
671 | { | |
672 | // compare illegal string values | |
673 | // treat const UChar *srcChars==NULL as an empty string | |
674 | if(isBogus()) { | |
675 | return -1; | |
676 | } | |
677 | ||
678 | // pin indices to legal values | |
679 | pinIndices(start, length); | |
680 | ||
681 | if(srcChars == NULL) { | |
682 | srcStart = srcLength = 0; | |
683 | } | |
684 | ||
685 | // get the correct pointer | |
686 | const UChar *chars = getArrayStart(); | |
687 | ||
688 | chars += start; | |
689 | srcChars += srcStart; | |
690 | ||
691 | if(chars != srcChars) { | |
692 | UErrorCode errorCode=U_ZERO_ERROR; | |
693 | int32_t result=unorm_cmpEquivFold(chars, length, srcChars, srcLength, | |
694 | options|U_COMPARE_IGNORE_CASE, &errorCode); | |
695 | if(result!=0) { | |
696 | return (int8_t)(result >> 24 | 1); | |
697 | } | |
698 | } else { | |
699 | // get the srcLength if necessary | |
700 | if(srcLength < 0) { | |
701 | srcLength = u_strlen(srcChars + srcStart); | |
702 | } | |
703 | if(length != srcLength) { | |
704 | return (int8_t)((length - srcLength) >> 24 | 1); | |
705 | } | |
706 | } | |
707 | return 0; | |
708 | } | |
709 | ||
710 | int32_t | |
711 | UnicodeString::getLength() const { | |
712 | return length(); | |
713 | } | |
714 | ||
715 | UChar | |
716 | UnicodeString::getCharAt(int32_t offset) const { | |
717 | return charAt(offset); | |
718 | } | |
719 | ||
720 | UChar32 | |
721 | UnicodeString::getChar32At(int32_t offset) const { | |
722 | return char32At(offset); | |
723 | } | |
724 | ||
725 | int32_t | |
726 | UnicodeString::countChar32(int32_t start, int32_t length) const { | |
727 | pinIndices(start, length); | |
728 | // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL | |
729 | return u_countChar32(fArray+start, length); | |
730 | } | |
731 | ||
732 | UBool | |
733 | UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const { | |
734 | pinIndices(start, length); | |
735 | // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL | |
736 | return u_strHasMoreChar32Than(fArray+start, length, number); | |
737 | } | |
738 | ||
739 | int32_t | |
740 | UnicodeString::moveIndex32(int32_t index, int32_t delta) const { | |
741 | // pin index | |
742 | if(index<0) { | |
743 | index=0; | |
744 | } else if(index>fLength) { | |
745 | index=fLength; | |
746 | } | |
747 | ||
748 | if(delta>0) { | |
749 | UTF_FWD_N(fArray, index, fLength, delta); | |
750 | } else { | |
751 | UTF_BACK_N(fArray, 0, index, -delta); | |
752 | } | |
753 | ||
754 | return index; | |
755 | } | |
756 | ||
757 | void | |
758 | UnicodeString::doExtract(int32_t start, | |
759 | int32_t length, | |
760 | UChar *dst, | |
761 | int32_t dstStart) const | |
762 | { | |
763 | // pin indices to legal values | |
764 | pinIndices(start, length); | |
765 | ||
766 | // do not copy anything if we alias dst itself | |
767 | if(fArray + start != dst + dstStart) { | |
768 | us_arrayCopy(getArrayStart(), start, dst, dstStart, length); | |
769 | } | |
770 | } | |
771 | ||
772 | int32_t | |
773 | UnicodeString::extract(UChar *dest, int32_t destCapacity, | |
774 | UErrorCode &errorCode) const { | |
775 | if(U_SUCCESS(errorCode)) { | |
776 | if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { | |
777 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
778 | } else { | |
779 | if(fLength>0 && fLength<=destCapacity && fArray!=dest) { | |
780 | uprv_memcpy(dest, fArray, fLength*U_SIZEOF_UCHAR); | |
781 | } | |
782 | return u_terminateUChars(dest, destCapacity, fLength, &errorCode); | |
783 | } | |
784 | } | |
785 | ||
786 | return fLength; | |
787 | } | |
788 | ||
789 | int32_t | |
790 | UnicodeString::indexOf(const UChar *srcChars, | |
791 | int32_t srcStart, | |
792 | int32_t srcLength, | |
793 | int32_t start, | |
794 | int32_t length) const | |
795 | { | |
796 | if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) { | |
797 | return -1; | |
798 | } | |
799 | ||
800 | // UnicodeString does not find empty substrings | |
801 | if(srcLength < 0 && srcChars[srcStart] == 0) { | |
802 | return -1; | |
803 | } | |
804 | ||
805 | // get the indices within bounds | |
806 | pinIndices(start, length); | |
807 | ||
808 | // find the first occurrence of the substring | |
809 | const UChar *match = u_strFindFirst(fArray + start, length, srcChars + srcStart, srcLength); | |
810 | if(match == NULL) { | |
811 | return -1; | |
812 | } else { | |
813 | return match - fArray; | |
814 | } | |
815 | } | |
816 | ||
817 | int32_t | |
818 | UnicodeString::doIndexOf(UChar c, | |
819 | int32_t start, | |
820 | int32_t length) const | |
821 | { | |
822 | // pin indices | |
823 | pinIndices(start, length); | |
824 | ||
825 | // find the first occurrence of c | |
826 | const UChar *match = u_memchr(fArray + start, c, length); | |
827 | if(match == NULL) { | |
828 | return -1; | |
829 | } else { | |
830 | return match - fArray; | |
831 | } | |
832 | } | |
833 | ||
834 | int32_t | |
835 | UnicodeString::doIndexOf(UChar32 c, | |
836 | int32_t start, | |
837 | int32_t length) const { | |
838 | // pin indices | |
839 | pinIndices(start, length); | |
840 | ||
841 | // find the first occurrence of c | |
842 | const UChar *match = u_memchr32(fArray + start, c, length); | |
843 | if(match == NULL) { | |
844 | return -1; | |
845 | } else { | |
846 | return match - fArray; | |
847 | } | |
848 | } | |
849 | ||
850 | int32_t | |
851 | UnicodeString::lastIndexOf(const UChar *srcChars, | |
852 | int32_t srcStart, | |
853 | int32_t srcLength, | |
854 | int32_t start, | |
855 | int32_t length) const | |
856 | { | |
857 | if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) { | |
858 | return -1; | |
859 | } | |
860 | ||
861 | // UnicodeString does not find empty substrings | |
862 | if(srcLength < 0 && srcChars[srcStart] == 0) { | |
863 | return -1; | |
864 | } | |
865 | ||
866 | // get the indices within bounds | |
867 | pinIndices(start, length); | |
868 | ||
869 | // find the last occurrence of the substring | |
870 | const UChar *match = u_strFindLast(fArray + start, length, srcChars + srcStart, srcLength); | |
871 | if(match == NULL) { | |
872 | return -1; | |
873 | } else { | |
874 | return match - fArray; | |
875 | } | |
876 | } | |
877 | ||
878 | int32_t | |
879 | UnicodeString::doLastIndexOf(UChar c, | |
880 | int32_t start, | |
881 | int32_t length) const | |
882 | { | |
883 | if(isBogus()) { | |
884 | return -1; | |
885 | } | |
886 | ||
887 | // pin indices | |
888 | pinIndices(start, length); | |
889 | ||
890 | // find the last occurrence of c | |
891 | const UChar *match = u_memrchr(fArray + start, c, length); | |
892 | if(match == NULL) { | |
893 | return -1; | |
894 | } else { | |
895 | return match - fArray; | |
896 | } | |
897 | } | |
898 | ||
899 | int32_t | |
900 | UnicodeString::doLastIndexOf(UChar32 c, | |
901 | int32_t start, | |
902 | int32_t length) const { | |
903 | // pin indices | |
904 | pinIndices(start, length); | |
905 | ||
906 | // find the last occurrence of c | |
907 | const UChar *match = u_memrchr32(fArray + start, c, length); | |
908 | if(match == NULL) { | |
909 | return -1; | |
910 | } else { | |
911 | return match - fArray; | |
912 | } | |
913 | } | |
914 | ||
915 | //======================================== | |
916 | // Write implementation | |
917 | //======================================== | |
918 | ||
919 | UnicodeString& | |
920 | UnicodeString::findAndReplace(int32_t start, | |
921 | int32_t length, | |
922 | const UnicodeString& oldText, | |
923 | int32_t oldStart, | |
924 | int32_t oldLength, | |
925 | const UnicodeString& newText, | |
926 | int32_t newStart, | |
927 | int32_t newLength) | |
928 | { | |
929 | if(isBogus() || oldText.isBogus() || newText.isBogus()) { | |
930 | return *this; | |
931 | } | |
932 | ||
933 | pinIndices(start, length); | |
934 | oldText.pinIndices(oldStart, oldLength); | |
935 | newText.pinIndices(newStart, newLength); | |
936 | ||
937 | if(oldLength == 0) { | |
938 | return *this; | |
939 | } | |
940 | ||
941 | while(length > 0 && length >= oldLength) { | |
942 | int32_t pos = indexOf(oldText, oldStart, oldLength, start, length); | |
943 | if(pos < 0) { | |
944 | // no more oldText's here: done | |
945 | break; | |
946 | } else { | |
947 | // we found oldText, replace it by newText and go beyond it | |
948 | replace(pos, oldLength, newText, newStart, newLength); | |
949 | length -= pos + oldLength - start; | |
950 | start = pos + newLength; | |
951 | } | |
952 | } | |
953 | ||
954 | return *this; | |
955 | } | |
956 | ||
957 | ||
958 | void | |
959 | UnicodeString::setToBogus() | |
960 | { | |
961 | releaseArray(); | |
962 | ||
963 | fArray = 0; | |
964 | fCapacity = fLength = 0; | |
965 | fFlags = kIsBogus; | |
966 | } | |
967 | ||
968 | // turn a bogus string into an empty one | |
969 | void | |
970 | UnicodeString::unBogus() { | |
971 | if(fFlags & kIsBogus) { | |
972 | fArray = fStackBuffer; | |
973 | fLength = 0; | |
974 | fCapacity = US_STACKBUF_SIZE; | |
975 | fFlags = kShortString; | |
976 | } | |
977 | } | |
978 | ||
979 | // setTo() analogous to the readonly-aliasing constructor with the same signature | |
980 | UnicodeString & | |
981 | UnicodeString::setTo(UBool isTerminated, | |
982 | const UChar *text, | |
983 | int32_t textLength) | |
984 | { | |
985 | if(fFlags & kOpenGetBuffer) { | |
986 | // do not modify a string that has an "open" getBuffer(minCapacity) | |
987 | return *this; | |
988 | } | |
989 | ||
990 | if(text == NULL) { | |
991 | // treat as an empty string, do not alias | |
992 | releaseArray(); | |
993 | fLength = 0; | |
994 | fCapacity = US_STACKBUF_SIZE; | |
995 | fArray = fStackBuffer; | |
996 | fFlags = kShortString; | |
997 | return *this; | |
998 | } | |
999 | ||
1000 | if( textLength < -1 || | |
1001 | (textLength == -1 && !isTerminated) || | |
1002 | (textLength >= 0 && isTerminated && text[textLength] != 0) | |
1003 | ) { | |
1004 | setToBogus(); | |
1005 | return *this; | |
1006 | } | |
1007 | ||
1008 | releaseArray(); | |
1009 | ||
1010 | fArray = (UChar *)text; | |
1011 | if(textLength != -1) { | |
1012 | fLength = textLength; | |
1013 | fCapacity = isTerminated ? fLength + 1 : fLength; | |
1014 | } else { | |
1015 | // text is terminated, or else it would have failed the above test | |
1016 | fLength = u_strlen(text); | |
1017 | fCapacity = fLength + 1; | |
1018 | } | |
1019 | ||
1020 | fFlags = kReadonlyAlias; | |
1021 | return *this; | |
1022 | } | |
1023 | ||
1024 | // setTo() analogous to the writable-aliasing constructor with the same signature | |
1025 | UnicodeString & | |
1026 | UnicodeString::setTo(UChar *buffer, | |
1027 | int32_t buffLength, | |
1028 | int32_t buffCapacity) { | |
1029 | if(fFlags & kOpenGetBuffer) { | |
1030 | // do not modify a string that has an "open" getBuffer(minCapacity) | |
1031 | return *this; | |
1032 | } | |
1033 | ||
1034 | if(buffer == NULL) { | |
1035 | // treat as an empty string, do not alias | |
1036 | releaseArray(); | |
1037 | fLength = 0; | |
1038 | fCapacity = US_STACKBUF_SIZE; | |
1039 | fArray = fStackBuffer; | |
1040 | fFlags = kShortString; | |
1041 | return *this; | |
1042 | } | |
1043 | ||
1044 | if(buffLength < 0 || buffLength > buffCapacity) { | |
1045 | setToBogus(); | |
1046 | return *this; | |
1047 | } | |
1048 | ||
1049 | releaseArray(); | |
1050 | ||
1051 | fArray = buffer; | |
1052 | fLength = buffLength; | |
1053 | fCapacity = buffCapacity; | |
1054 | fFlags = kWritableAlias; | |
1055 | return *this; | |
1056 | } | |
1057 | ||
1058 | UnicodeString& | |
1059 | UnicodeString::setCharAt(int32_t offset, | |
1060 | UChar c) | |
1061 | { | |
1062 | if(cloneArrayIfNeeded() && fLength > 0) { | |
1063 | if(offset < 0) { | |
1064 | offset = 0; | |
1065 | } else if(offset >= fLength) { | |
1066 | offset = fLength - 1; | |
1067 | } | |
1068 | ||
1069 | fArray[offset] = c; | |
1070 | } | |
1071 | return *this; | |
1072 | } | |
1073 | ||
/*
 * Selectors for the kind of string case mapping to perform.
 * The argument checking and buffer handling for all of them
 * is implemented once, in a common function.
 */
enum {
  TO_LOWER = 0,
  TO_UPPER = 1,
  TO_TITLE = 2,
  FOLD_CASE = 3
};
1084 | ||
1085 | UnicodeString & | |
1086 | UnicodeString::toLower() { | |
1087 | return caseMap(0, Locale::getDefault(), 0, TO_LOWER); | |
1088 | } | |
1089 | ||
1090 | UnicodeString & | |
1091 | UnicodeString::toLower(const Locale &locale) { | |
1092 | return caseMap(0, locale, 0, TO_LOWER); | |
1093 | } | |
1094 | ||
1095 | UnicodeString & | |
1096 | UnicodeString::toUpper() { | |
1097 | return caseMap(0, Locale::getDefault(), 0, TO_UPPER); | |
1098 | } | |
1099 | ||
1100 | UnicodeString & | |
1101 | UnicodeString::toUpper(const Locale &locale) { | |
1102 | return caseMap(0, locale, 0, TO_UPPER); | |
1103 | } | |
1104 | ||
1105 | #if !UCONFIG_NO_BREAK_ITERATION | |
1106 | ||
1107 | UnicodeString & | |
1108 | UnicodeString::toTitle(BreakIterator *titleIter) { | |
1109 | return caseMap(titleIter, Locale::getDefault(), 0, TO_TITLE); | |
1110 | } | |
1111 | ||
1112 | UnicodeString & | |
1113 | UnicodeString::toTitle(BreakIterator *titleIter, const Locale &locale) { | |
1114 | return caseMap(titleIter, locale, 0, TO_TITLE); | |
1115 | } | |
1116 | ||
1117 | #endif | |
1118 | ||
1119 | UnicodeString & | |
1120 | UnicodeString::foldCase(uint32_t options) { | |
1121 | return caseMap(0, Locale::getDefault(), options, FOLD_CASE); | |
1122 | } | |
1123 | ||
/*
 * Common implementation behind toLower(), toUpper(), toTitle() and foldCase().
 *
 * titleIter   - break iterator for titlecasing; if 0 and toWhichCase is
 *               TO_TITLE, a word break iterator is opened (and closed) here
 * locale      - its name is passed to the lower/upper/title mapping functions
 * options     - passed to the case folding function only
 * toWhichCase - one of TO_LOWER, TO_UPPER, TO_TITLE; any other value
 *               selects case folding
 *
 * Returns *this. The string is set to bogus if the break iterator cannot be
 * opened or if the mapping ends with a failure code. If the first
 * cloneArrayIfNeeded() call fails, *this is returned in whatever state that
 * call left it.
 */
UnicodeString &
UnicodeString::caseMap(BreakIterator *titleIter,
                       const Locale& locale,
                       uint32_t options,
                       int32_t toWhichCase) {
  if(fLength <= 0) {
    // nothing to do
    return *this;
  }

  // We need to allocate a new buffer for the internal string case mapping function.
  // This is very similar to how doReplace() below keeps the old array pointer
  // and deletes the old array itself after it is done.
  // In addition, we are forcing cloneArrayIfNeeded() to always allocate a new array.
  // The mapping functions read from oldArray/oldLength and write into fArray.
  UChar *oldArray = fArray;
  int32_t oldLength = fLength;
  int32_t *bufferToDelete = 0;

  // Make sure that if the string is in fStackBuffer we do not overwrite it!
  int32_t capacity;
  if(fLength <= US_STACKBUF_SIZE) {
    if(fArray == fStackBuffer) {
      capacity = 2 * US_STACKBUF_SIZE; // make sure that cloneArrayIfNeeded() allocates a new buffer
    } else {
      capacity = US_STACKBUF_SIZE;
    }
  } else {
    capacity = fLength + 20;
  }
  // doCopyArray=FALSE: the old contents are not copied, they are re-created by the mapping;
  // forceClone=TRUE: always switch to a different array;
  // &bufferToDelete: a released old heap array is handed back so that it stays readable below
  if(!cloneArrayIfNeeded(capacity, capacity, FALSE, &bufferToDelete, TRUE)) {
    return *this;
  }

  // always assigned before it is read: either in the title iterator setup
  // or at the top of the mapping loop
  UErrorCode errorCode;

#if !UCONFIG_NO_BREAK_ITERATION
  // set up the titlecasing break iterator
  UBreakIterator *cTitleIter = 0;

  if(toWhichCase == TO_TITLE) {
    if(titleIter != 0) {
      // use the caller's iterator; it is not closed here
      cTitleIter = (UBreakIterator *)titleIter;
    } else {
      // open our own word break iterator over the original text
      errorCode = U_ZERO_ERROR;
      cTitleIter = ubrk_open(UBRK_WORD, locale.getName(),
                             oldArray, oldLength,
                             &errorCode);
      if(U_FAILURE(errorCode)) {
        uprv_free(bufferToDelete);
        setToBogus();
        return *this;
      }
    }
  }
#endif

  // Case-map, and if the result is too long, then reallocate and repeat.
  // On overflow, fLength holds the value returned by the mapping function
  // and is used as the capacity for the next attempt.
  do {
    errorCode = U_ZERO_ERROR;
    if(toWhichCase==TO_LOWER) {
      fLength = u_internalStrToLower(fArray, fCapacity,
                                     oldArray, oldLength,
                                     0, oldLength,
                                     locale.getName(),
                                     &errorCode);
    } else if(toWhichCase==TO_UPPER) {
      fLength = u_internalStrToUpper(fArray, fCapacity,
                                     oldArray, oldLength,
                                     locale.getName(),
                                     &errorCode);
#if !UCONFIG_NO_BREAK_ITERATION
    } else if(toWhichCase==TO_TITLE) {
      fLength = u_internalStrToTitle(fArray, fCapacity,
                                     oldArray, oldLength,
                                     cTitleIter, locale.getName(),
                                     &errorCode);
#endif
    } else {
      fLength = u_internalStrFoldCase(fArray, fCapacity,
                                      oldArray, oldLength,
                                      options,
                                      &errorCode);
    }
  } while(errorCode==U_BUFFER_OVERFLOW_ERROR && cloneArrayIfNeeded(fLength, fLength, FALSE));

#if !UCONFIG_NO_BREAK_ITERATION
  // close the iterator only if it was opened here
  if(cTitleIter != 0 && titleIter == 0) {
    ubrk_close(cTitleIter);
  }
#endif

  // the old array is no longer read; delete it if cloneArrayIfNeeded() handed it to us
  if (bufferToDelete) {
    uprv_free(bufferToDelete);
  }
  // also covers the case where the loop ended because reallocation failed:
  // errorCode is then still U_BUFFER_OVERFLOW_ERROR
  if(U_FAILURE(errorCode)) {
    setToBogus();
  }
  return *this;
}
1223 | ||
1224 | UnicodeString& | |
1225 | UnicodeString::doReplace( int32_t start, | |
1226 | int32_t length, | |
1227 | const UnicodeString& src, | |
1228 | int32_t srcStart, | |
1229 | int32_t srcLength) | |
1230 | { | |
1231 | if(!src.isBogus()) { | |
1232 | // pin the indices to legal values | |
1233 | src.pinIndices(srcStart, srcLength); | |
1234 | ||
1235 | // get the characters from src | |
1236 | // and replace the range in ourselves with them | |
1237 | return doReplace(start, length, src.getArrayStart(), srcStart, srcLength); | |
1238 | } else { | |
1239 | // remove the range | |
1240 | return doReplace(start, length, 0, 0, 0); | |
1241 | } | |
1242 | } | |
1243 | ||
1244 | UnicodeString& | |
1245 | UnicodeString::doReplace(int32_t start, | |
1246 | int32_t length, | |
1247 | const UChar *srcChars, | |
1248 | int32_t srcStart, | |
1249 | int32_t srcLength) | |
1250 | { | |
1251 | if(isBogus()) { | |
1252 | return *this; | |
1253 | } | |
1254 | ||
1255 | if(srcChars == 0) { | |
1256 | srcStart = srcLength = 0; | |
1257 | } else if(srcLength < 0) { | |
1258 | // get the srcLength if necessary | |
1259 | srcLength = u_strlen(srcChars + srcStart); | |
1260 | } | |
1261 | ||
1262 | int32_t *bufferToDelete = 0; | |
1263 | ||
1264 | // the following may change fArray but will not copy the current contents; | |
1265 | // therefore we need to keep the current fArray | |
1266 | UChar *oldArray = fArray; | |
1267 | int32_t oldLength = fLength; | |
1268 | ||
1269 | // pin the indices to legal values | |
1270 | pinIndices(start, length); | |
1271 | ||
1272 | // calculate the size of the string after the replace | |
1273 | int32_t newSize = oldLength - length + srcLength; | |
1274 | ||
1275 | // clone our array and allocate a bigger array if needed | |
1276 | if(!cloneArrayIfNeeded(newSize, newSize + (newSize >> 2) + kGrowSize, | |
1277 | FALSE, &bufferToDelete) | |
1278 | ) { | |
1279 | return *this; | |
1280 | } | |
1281 | ||
1282 | // now do the replace | |
1283 | ||
1284 | if(fArray != oldArray) { | |
1285 | // if fArray changed, then we need to copy everything except what will change | |
1286 | us_arrayCopy(oldArray, 0, fArray, 0, start); | |
1287 | us_arrayCopy(oldArray, start + length, | |
1288 | fArray, start + srcLength, | |
1289 | oldLength - (start + length)); | |
1290 | } else if(length != srcLength) { | |
1291 | // fArray did not change; copy only the portion that isn't changing, leaving a hole | |
1292 | us_arrayCopy(oldArray, start + length, | |
1293 | fArray, start + srcLength, | |
1294 | oldLength - (start + length)); | |
1295 | } | |
1296 | ||
1297 | // now fill in the hole with the new string | |
1298 | us_arrayCopy(srcChars, srcStart, getArrayStart(), start, srcLength); | |
1299 | ||
1300 | fLength = newSize; | |
1301 | ||
1302 | // delayed delete in case srcChars == fArray when we started, and | |
1303 | // to keep oldArray alive for the above operations | |
1304 | if (bufferToDelete) { | |
1305 | uprv_free(bufferToDelete); | |
1306 | } | |
1307 | ||
1308 | return *this; | |
1309 | } | |
1310 | ||
1311 | /** | |
1312 | * Replaceable API | |
1313 | */ | |
1314 | void | |
1315 | UnicodeString::handleReplaceBetween(int32_t start, | |
1316 | int32_t limit, | |
1317 | const UnicodeString& text) { | |
1318 | replaceBetween(start, limit, text); | |
1319 | } | |
1320 | ||
1321 | /** | |
1322 | * Replaceable API | |
1323 | */ | |
1324 | void | |
1325 | UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) { | |
1326 | if (limit <= start) { | |
1327 | return; // Nothing to do; avoid bogus malloc call | |
1328 | } | |
1329 | UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) ); | |
1330 | extractBetween(start, limit, text, 0); | |
1331 | insert(dest, text, 0, limit - start); | |
1332 | uprv_free(text); | |
1333 | } | |
1334 | ||
/**
 * Replaceable API
 *
 * Implementation for the Replaceable class itself: always returns TRUE.
 *
 * NOTE: This is for the Replaceable class. There is no rep.cpp,
 * so we implement this function here.
 */
UBool Replaceable::hasMetaData() const {
  return TRUE;
}
1344 | ||
/**
 * Replaceable API
 *
 * UnicodeString version: always returns FALSE
 * (in contrast to Replaceable::hasMetaData(), which returns TRUE).
 */
UBool UnicodeString::hasMetaData() const {
  return FALSE;
}
1351 | ||
1352 | UnicodeString& | |
1353 | UnicodeString::doReverse(int32_t start, | |
1354 | int32_t length) | |
1355 | { | |
1356 | if(fLength <= 1 || !cloneArrayIfNeeded()) { | |
1357 | return *this; | |
1358 | } | |
1359 | ||
1360 | // pin the indices to legal values | |
1361 | pinIndices(start, length); | |
1362 | ||
1363 | UChar *left = getArrayStart() + start; | |
1364 | UChar *right = getArrayStart() + start + length; | |
1365 | UChar swap; | |
1366 | UBool hasSupplementary = FALSE; | |
1367 | ||
1368 | while(left < --right) { | |
1369 | hasSupplementary |= (UBool)UTF_IS_LEAD(swap = *left); | |
1370 | hasSupplementary |= (UBool)UTF_IS_LEAD(*left++ = *right); | |
1371 | *right = swap; | |
1372 | } | |
1373 | ||
1374 | /* if there are supplementary code points in the reversed range, then re-swap their surrogates */ | |
1375 | if(hasSupplementary) { | |
1376 | UChar swap2; | |
1377 | ||
1378 | left = getArrayStart() + start; | |
1379 | right = getArrayStart() + start + length - 1; // -1 so that we can look at *(left+1) if left<right | |
1380 | while(left < right) { | |
1381 | if(UTF_IS_TRAIL(swap = *left) && UTF_IS_LEAD(swap2 = *(left + 1))) { | |
1382 | *left++ = swap2; | |
1383 | *left++ = swap; | |
1384 | } else { | |
1385 | ++left; | |
1386 | } | |
1387 | } | |
1388 | } | |
1389 | ||
1390 | return *this; | |
1391 | } | |
1392 | ||
1393 | UBool | |
1394 | UnicodeString::padLeading(int32_t targetLength, | |
1395 | UChar padChar) | |
1396 | { | |
1397 | if(fLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { | |
1398 | return FALSE; | |
1399 | } else { | |
1400 | // move contents up by padding width | |
1401 | int32_t start = targetLength - fLength; | |
1402 | us_arrayCopy(fArray, 0, fArray, start, fLength); | |
1403 | ||
1404 | // fill in padding character | |
1405 | while(--start >= 0) { | |
1406 | fArray[start] = padChar; | |
1407 | } | |
1408 | fLength = targetLength; | |
1409 | return TRUE; | |
1410 | } | |
1411 | } | |
1412 | ||
1413 | UBool | |
1414 | UnicodeString::padTrailing(int32_t targetLength, | |
1415 | UChar padChar) | |
1416 | { | |
1417 | if(fLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { | |
1418 | return FALSE; | |
1419 | } else { | |
1420 | // fill in padding character | |
1421 | int32_t length = targetLength; | |
1422 | while(--length >= fLength) { | |
1423 | fArray[length] = padChar; | |
1424 | } | |
1425 | fLength = targetLength; | |
1426 | return TRUE; | |
1427 | } | |
1428 | } | |
1429 | ||
1430 | UnicodeString& | |
1431 | UnicodeString::trim() | |
1432 | { | |
1433 | if(isBogus()) { | |
1434 | return *this; | |
1435 | } | |
1436 | ||
1437 | UChar32 c; | |
1438 | int32_t i = fLength, length; | |
1439 | ||
1440 | // first cut off trailing white space | |
1441 | for(;;) { | |
1442 | length = i; | |
1443 | if(i <= 0) { | |
1444 | break; | |
1445 | } | |
1446 | UTF_PREV_CHAR(fArray, 0, i, c); | |
1447 | if(!(c == 0x20 || u_isWhitespace(c))) { | |
1448 | break; | |
1449 | } | |
1450 | } | |
1451 | if(length < fLength) { | |
1452 | fLength = length; | |
1453 | } | |
1454 | ||
1455 | // find leading white space | |
1456 | int32_t start; | |
1457 | i = 0; | |
1458 | for(;;) { | |
1459 | start = i; | |
1460 | if(i >= length) { | |
1461 | break; | |
1462 | } | |
1463 | UTF_NEXT_CHAR(fArray, i, length, c); | |
1464 | if(!(c == 0x20 || u_isWhitespace(c))) { | |
1465 | break; | |
1466 | } | |
1467 | } | |
1468 | ||
1469 | // move string forward over leading white space | |
1470 | if(start > 0) { | |
1471 | doReplace(0, start, 0, 0, 0); | |
1472 | } | |
1473 | ||
1474 | return *this; | |
1475 | } | |
1476 | ||
1477 | //======================================== | |
1478 | // Hashing | |
1479 | //======================================== | |
1480 | int32_t | |
1481 | UnicodeString::doHashCode() const | |
1482 | { | |
1483 | /* Delegate hash computation to uhash. This makes UnicodeString | |
1484 | * hashing consistent with UChar* hashing. */ | |
1485 | int32_t hashCode = uhash_hashUCharsN(getArrayStart(), fLength); | |
1486 | if (hashCode == kInvalidHashCode) { | |
1487 | hashCode = kEmptyHashCode; | |
1488 | } | |
1489 | return hashCode; | |
1490 | } | |
1491 | ||
1492 | //======================================== | |
1493 | // Codeset conversion | |
1494 | //======================================== | |
1495 | int32_t | |
1496 | UnicodeString::extract(int32_t start, | |
1497 | int32_t length, | |
1498 | char *target, | |
1499 | uint32_t dstSize, | |
1500 | const char *codepage) const | |
1501 | { | |
1502 | // if the arguments are illegal, then do nothing | |
1503 | if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { | |
1504 | return 0; | |
1505 | } | |
1506 | ||
1507 | // pin the indices to legal values | |
1508 | pinIndices(start, length); | |
1509 | ||
1510 | // create the converter | |
1511 | UConverter *converter; | |
1512 | UErrorCode status = U_ZERO_ERROR; | |
1513 | ||
1514 | // just write the NUL if the string length is 0 | |
1515 | if(length == 0) { | |
1516 | if(dstSize >= 0x80000000) { | |
1517 | // careful: dstSize is unsigned! (0xffffffff means "unlimited") | |
1518 | // make sure that the NUL-termination works (takes int32_t) | |
1519 | dstSize=0x7fffffff; | |
1520 | } | |
1521 | return u_terminateChars(target, dstSize, 0, &status); | |
1522 | } | |
1523 | ||
1524 | // if the codepage is the default, use our cache | |
1525 | // if it is an empty string, then use the "invariant character" conversion | |
1526 | if (codepage == 0) { | |
1527 | converter = u_getDefaultConverter(&status); | |
1528 | } else if (*codepage == 0) { | |
1529 | // use the "invariant characters" conversion | |
1530 | int32_t destLength; | |
1531 | // careful: dstSize is unsigned! (0xffffffff means "unlimited") | |
1532 | if(dstSize >= 0x80000000) { | |
1533 | destLength = length; | |
1534 | // make sure that the NUL-termination works (takes int32_t) | |
1535 | dstSize=0x7fffffff; | |
1536 | } else if(length <= (int32_t)dstSize) { | |
1537 | destLength = length; | |
1538 | } else { | |
1539 | destLength = (int32_t)dstSize; | |
1540 | } | |
1541 | u_UCharsToChars(getArrayStart() + start, target, destLength); | |
1542 | return u_terminateChars(target, (int32_t)dstSize, length, &status); | |
1543 | } else { | |
1544 | converter = ucnv_open(codepage, &status); | |
1545 | } | |
1546 | ||
1547 | length = doExtract(start, length, target, (int32_t)dstSize, converter, status); | |
1548 | ||
1549 | // close the converter | |
1550 | if (codepage == 0) { | |
1551 | u_releaseDefaultConverter(converter); | |
1552 | } else { | |
1553 | ucnv_close(converter); | |
1554 | } | |
1555 | ||
1556 | return length; | |
1557 | } | |
1558 | ||
1559 | int32_t | |
1560 | UnicodeString::extract(char *dest, int32_t destCapacity, | |
1561 | UConverter *cnv, | |
1562 | UErrorCode &errorCode) const { | |
1563 | if(U_FAILURE(errorCode)) { | |
1564 | return 0; | |
1565 | } | |
1566 | ||
1567 | if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { | |
1568 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
1569 | return 0; | |
1570 | } | |
1571 | ||
1572 | // nothing to do? | |
1573 | if(fLength<=0) { | |
1574 | return u_terminateChars(dest, destCapacity, 0, &errorCode); | |
1575 | } | |
1576 | ||
1577 | // get the converter | |
1578 | UBool isDefaultConverter; | |
1579 | if(cnv==0) { | |
1580 | isDefaultConverter=TRUE; | |
1581 | cnv=u_getDefaultConverter(&errorCode); | |
1582 | if(U_FAILURE(errorCode)) { | |
1583 | return 0; | |
1584 | } | |
1585 | } else { | |
1586 | isDefaultConverter=FALSE; | |
1587 | ucnv_resetFromUnicode(cnv); | |
1588 | } | |
1589 | ||
1590 | // convert | |
1591 | int32_t length=doExtract(0, fLength, dest, destCapacity, cnv, errorCode); | |
1592 | ||
1593 | // release the converter | |
1594 | if(isDefaultConverter) { | |
1595 | u_releaseDefaultConverter(cnv); | |
1596 | } | |
1597 | ||
1598 | return length; | |
1599 | } | |
1600 | ||
1601 | void | |
1602 | UnicodeString::extractBetween(int32_t start, | |
1603 | int32_t limit, | |
1604 | UnicodeString& target) const | |
1605 | { doExtract(start, limit - start, target); } | |
1606 | ||
1607 | int32_t | |
1608 | UnicodeString::doExtract(int32_t start, int32_t length, | |
1609 | char *dest, int32_t destCapacity, | |
1610 | UConverter *cnv, | |
1611 | UErrorCode &errorCode) const { | |
1612 | if(U_FAILURE(errorCode)) { | |
1613 | if(destCapacity!=0) { | |
1614 | *dest=0; | |
1615 | } | |
1616 | return 0; | |
1617 | } | |
1618 | ||
1619 | const UChar *src=fArray+start, *srcLimit=src+length; | |
1620 | char *originalDest=dest; | |
1621 | const char *destLimit; | |
1622 | ||
1623 | if(destCapacity==0) { | |
1624 | destLimit=dest=0; | |
1625 | } else if(destCapacity==-1) { | |
1626 | // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used. | |
1627 | destLimit=(char*)U_MAX_PTR(dest); | |
1628 | // for NUL-termination, translate into highest int32_t | |
1629 | destCapacity=0x7fffffff; | |
1630 | } else { | |
1631 | destLimit=dest+destCapacity; | |
1632 | } | |
1633 | ||
1634 | // perform the conversion | |
1635 | ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); | |
1636 | length=(int32_t)(dest-originalDest); | |
1637 | ||
1638 | // if an overflow occurs, then get the preflighting length | |
1639 | if(errorCode==U_BUFFER_OVERFLOW_ERROR) { | |
1640 | char buffer[1024]; | |
1641 | ||
1642 | destLimit=buffer+sizeof(buffer); | |
1643 | do { | |
1644 | dest=buffer; | |
1645 | errorCode=U_ZERO_ERROR; | |
1646 | ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); | |
1647 | length+=(int32_t)(dest-buffer); | |
1648 | } while(errorCode==U_BUFFER_OVERFLOW_ERROR); | |
1649 | } | |
1650 | ||
1651 | return u_terminateChars(originalDest, destCapacity, length, &errorCode); | |
1652 | } | |
1653 | ||
1654 | void | |
1655 | UnicodeString::doCodepageCreate(const char *codepageData, | |
1656 | int32_t dataLength, | |
1657 | const char *codepage) | |
1658 | { | |
1659 | // if there's nothing to convert, do nothing | |
1660 | if(codepageData == 0 || dataLength <= 0) { | |
1661 | return; | |
1662 | } | |
1663 | ||
1664 | UErrorCode status = U_ZERO_ERROR; | |
1665 | ||
1666 | // create the converter | |
1667 | // if the codepage is the default, use our cache | |
1668 | // if it is an empty string, then use the "invariant character" conversion | |
1669 | UConverter *converter = (codepage == 0 ? | |
1670 | u_getDefaultConverter(&status) : | |
1671 | *codepage == 0 ? | |
1672 | 0 : | |
1673 | ucnv_open(codepage, &status)); | |
1674 | ||
1675 | // if we failed, set the appropriate flags and return | |
1676 | if(U_FAILURE(status)) { | |
1677 | setToBogus(); | |
1678 | return; | |
1679 | } | |
1680 | ||
1681 | // perform the conversion | |
1682 | if(converter == 0) { | |
1683 | // use the "invariant characters" conversion | |
1684 | if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) { | |
1685 | u_charsToUChars(codepageData, getArrayStart(), dataLength); | |
1686 | fLength = dataLength; | |
1687 | } else { | |
1688 | setToBogus(); | |
1689 | } | |
1690 | return; | |
1691 | } | |
1692 | ||
1693 | // convert using the real converter | |
1694 | doCodepageCreate(codepageData, dataLength, converter, status); | |
1695 | if(U_FAILURE(status)) { | |
1696 | setToBogus(); | |
1697 | } | |
1698 | ||
1699 | // close the converter | |
1700 | if(codepage == 0) { | |
1701 | u_releaseDefaultConverter(converter); | |
1702 | } else { | |
1703 | ucnv_close(converter); | |
1704 | } | |
1705 | } | |
1706 | ||
1707 | void | |
1708 | UnicodeString::doCodepageCreate(const char *codepageData, | |
1709 | int32_t dataLength, | |
1710 | UConverter *converter, | |
1711 | UErrorCode &status) { | |
1712 | if(U_FAILURE(status)) { | |
1713 | return; | |
1714 | } | |
1715 | ||
1716 | // set up the conversion parameters | |
1717 | const char *mySource = codepageData; | |
1718 | const char *mySourceEnd = mySource + dataLength; | |
1719 | UChar *myTarget; | |
1720 | ||
1721 | // estimate the size needed: | |
1722 | // 1.25 UChar's per source byte should cover most cases | |
1723 | int32_t arraySize = dataLength + (dataLength >> 2); | |
1724 | ||
1725 | // we do not care about the current contents | |
1726 | UBool doCopyArray = FALSE; | |
1727 | for(;;) { | |
1728 | if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) { | |
1729 | setToBogus(); | |
1730 | break; | |
1731 | } | |
1732 | ||
1733 | // perform the conversion | |
1734 | myTarget = fArray + fLength; | |
1735 | ucnv_toUnicode(converter, &myTarget, fArray + fCapacity, | |
1736 | &mySource, mySourceEnd, 0, TRUE, &status); | |
1737 | ||
1738 | // update the conversion parameters | |
1739 | fLength = (int32_t)(myTarget - fArray); | |
1740 | ||
1741 | // allocate more space and copy data, if needed | |
1742 | if(status == U_BUFFER_OVERFLOW_ERROR) { | |
1743 | // reset the error code | |
1744 | status = U_ZERO_ERROR; | |
1745 | ||
1746 | // keep the previous conversion results | |
1747 | doCopyArray = TRUE; | |
1748 | ||
1749 | // estimate the new size needed, larger than before | |
1750 | // try 2 UChar's per remaining source byte | |
1751 | arraySize = (int32_t)(fLength + 2 * (mySourceEnd - mySource)); | |
1752 | } else { | |
1753 | break; | |
1754 | } | |
1755 | } | |
1756 | } | |
1757 | ||
1758 | //======================================== | |
1759 | // External Buffer | |
1760 | //======================================== | |
1761 | ||
1762 | UChar * | |
1763 | UnicodeString::getBuffer(int32_t minCapacity) { | |
1764 | if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) { | |
1765 | fFlags|=kOpenGetBuffer; | |
1766 | fLength=0; | |
1767 | return fArray; | |
1768 | } else { | |
1769 | return 0; | |
1770 | } | |
1771 | } | |
1772 | ||
1773 | void | |
1774 | UnicodeString::releaseBuffer(int32_t newLength) { | |
1775 | if(fFlags&kOpenGetBuffer && newLength>=-1) { | |
1776 | // set the new fLength | |
1777 | if(newLength==-1) { | |
1778 | // the new length is the string length, capped by fCapacity | |
1779 | const UChar *p=fArray, *limit=fArray+fCapacity; | |
1780 | while(p<limit && *p!=0) { | |
1781 | ++p; | |
1782 | } | |
1783 | fLength=(int32_t)(p-fArray); | |
1784 | } else if(newLength<=fCapacity) { | |
1785 | fLength=newLength; | |
1786 | } else { | |
1787 | fLength=fCapacity; | |
1788 | } | |
1789 | fFlags&=~kOpenGetBuffer; | |
1790 | } | |
1791 | } | |
1792 | ||
1793 | //======================================== | |
1794 | // Miscellaneous | |
1795 | //======================================== | |
// Make sure that this string has its own writable array with room for at
// least newCapacity code units, allocating a new array if necessary.
//
// newCapacity     - required capacity; -1 means the current capacity
// growCapacity    - capacity to try to allocate first; -1 means newCapacity
// doCopyArray     - if a new array is allocated: TRUE copies the old contents
//                   (as much as fits), FALSE sets the length to 0
// pBufferToDelete - if not 0 and the old refcounted array is no longer
//                   referenced, it is not freed here; its address is stored
//                   in *pBufferToDelete and the caller must free it
// forceClone      - TRUE forces the allocation of a new array
//
// Returns TRUE if the array is usable as requested. Returns FALSE if a
// getBuffer(minCapacity) is open or the string is bogus (string unchanged),
// or if memory could not be allocated (string set to bogus).
UBool
UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
                                  int32_t growCapacity,
                                  UBool doCopyArray,
                                  int32_t **pBufferToDelete,
                                  UBool forceClone) {
  // default parameters need to be static, therefore
  // the defaults are -1 to have convenience defaults
  if(newCapacity == -1) {
    newCapacity = fCapacity;
  }

  // while a getBuffer(minCapacity) is "open",
  // prevent any modifications of the string by returning FALSE here
  // if the string is bogus, then only an assignment or similar can revive it
  if((fFlags&(kOpenGetBuffer|kIsBogus))!=0) {
    return FALSE;
  }

  /*
   * We need to make a copy of the array if
   * the caller forces it (forceClone), or
   * the buffer is read-only, or
   * the buffer is refCounted (shared), and refCount>1, or
   * the buffer is too small.
   * Return FALSE if memory could not be allocated.
   */
  if(forceClone ||
     fFlags & kBufferIsReadonly ||
     fFlags & kRefCounted && refCount() > 1 ||
     newCapacity > fCapacity
  ) {
    // save old values: they are needed below to copy from
    // and to release the old array
    UChar *array = fArray;
    uint16_t flags = fFlags;

    // check growCapacity for default value and use of the stack buffer
    if(growCapacity == -1) {
      growCapacity = newCapacity;
    } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
      // do not ask for more than US_STACKBUF_SIZE if the required capacity fits into it
      growCapacity = US_STACKBUF_SIZE;
    }

    // allocate a new array:
    // try the roomier growCapacity first, then fall back to the minimum newCapacity
    if(allocate(growCapacity) ||
       newCapacity < growCapacity && allocate(newCapacity)
    ) {
      if(doCopyArray) {
        // copy the contents
        // do not copy more than what fits - it may be smaller than before
        if(fCapacity < fLength) {
          fLength = fCapacity;
        }
        us_arrayCopy(array, 0, fArray, 0, fLength);
      } else {
        fLength = 0;
      }

      // release the old array
      if(flags & kRefCounted) {
        // the array is refCounted; decrement and release if 0
        // (the reference counter is the int32_t immediately before the array)
        int32_t *pRefCount = ((int32_t *)array - 1);
        if(umtx_atomic_dec(pRefCount) == 0) {
          if(pBufferToDelete == 0) {
            uprv_free(pRefCount);
          } else {
            // the caller requested to delete it himself
            *pBufferToDelete = pRefCount;
          }
        }
      }
    } else {
      // not enough memory for growCapacity and not even for the smaller newCapacity
      // reset the old values for setToBogus() to release the array
      fArray = array;
      fFlags = flags;
      setToBogus();
      return FALSE;
    }
  }
  return TRUE;
}
1877 | U_NAMESPACE_END |