]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ustring.cpp
ICU-64260.0.1.tar.gz
[apple/icu.git] / icuSources / common / ustring.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 * Copyright (C) 1998-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 ******************************************************************************
10 *
11 * File ustring.cpp
12 *
13 * Modification History:
14 *
15 * Date Name Description
16 * 12/07/98 bertrand Creation.
17 ******************************************************************************
18 */
19
20 #include "unicode/utypes.h"
21 #include "unicode/putil.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ustring.h"
24 #include "unicode/utf16.h"
25 #include "cstring.h"
26 #include "cwchar.h"
27 #include "cmemory.h"
28 #include "ustr_imp.h"
29
30 /* ANSI string.h - style functions ------------------------------------------ */
31
32 /* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */
33 #define U_BMP_MAX 0xffff
34
35 /* Forward binary string search functions ----------------------------------- */
36
37 /*
38 * Test if a substring match inside a string is at code point boundaries.
39 * All pointers refer to the same buffer.
40 * The limit pointer may be NULL, all others must be real pointers.
41 */
42 static inline UBool
43 isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) {
44 if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) {
45 /* the leading edge of the match is in the middle of a surrogate pair */
46 return FALSE;
47 }
48 if(U16_IS_LEAD(*(matchLimit-1)) && match!=limit && U16_IS_TRAIL(*matchLimit)) {
49 /* the trailing edge of the match is in the middle of a surrogate pair */
50 return FALSE;
51 }
52 return TRUE;
53 }
54
55 U_CAPI UChar * U_EXPORT2
56 u_strFindFirst(const UChar *s, int32_t length,
57 const UChar *sub, int32_t subLength) {
58 const UChar *start, *p, *q, *subLimit;
59 UChar c, cs, cq;
60
61 if(sub==NULL || subLength<-1) {
62 return (UChar *)s;
63 }
64 if(s==NULL || length<-1) {
65 return NULL;
66 }
67
68 start=s;
69
70 if(length<0 && subLength<0) {
71 /* both strings are NUL-terminated */
72 if((cs=*sub++)==0) {
73 return (UChar *)s;
74 }
75 if(*sub==0 && !U16_IS_SURROGATE(cs)) {
76 /* the substring consists of a single, non-surrogate BMP code point */
77 return u_strchr(s, cs);
78 }
79
80 while((c=*s++)!=0) {
81 if(c==cs) {
82 /* found first substring UChar, compare rest */
83 p=s;
84 q=sub;
85 for(;;) {
86 if((cq=*q)==0) {
87 if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
88 return (UChar *)(s-1); /* well-formed match */
89 } else {
90 break; /* no match because surrogate pair is split */
91 }
92 }
93 if((c=*p)==0) {
94 return NULL; /* no match, and none possible after s */
95 }
96 if(c!=cq) {
97 break; /* no match */
98 }
99 ++p;
100 ++q;
101 }
102 }
103 }
104
105 /* not found */
106 return NULL;
107 }
108
109 if(subLength<0) {
110 subLength=u_strlen(sub);
111 }
112 if(subLength==0) {
113 return (UChar *)s;
114 }
115
116 /* get sub[0] to search for it fast */
117 cs=*sub++;
118 --subLength;
119 subLimit=sub+subLength;
120
121 if(subLength==0 && !U16_IS_SURROGATE(cs)) {
122 /* the substring consists of a single, non-surrogate BMP code point */
123 return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length);
124 }
125
126 if(length<0) {
127 /* s is NUL-terminated */
128 while((c=*s++)!=0) {
129 if(c==cs) {
130 /* found first substring UChar, compare rest */
131 p=s;
132 q=sub;
133 for(;;) {
134 if(q==subLimit) {
135 if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
136 return (UChar *)(s-1); /* well-formed match */
137 } else {
138 break; /* no match because surrogate pair is split */
139 }
140 }
141 if((c=*p)==0) {
142 return NULL; /* no match, and none possible after s */
143 }
144 if(c!=*q) {
145 break; /* no match */
146 }
147 ++p;
148 ++q;
149 }
150 }
151 }
152 } else {
153 const UChar *limit, *preLimit;
154
155 /* subLength was decremented above */
156 if(length<=subLength) {
157 return NULL; /* s is shorter than sub */
158 }
159
160 limit=s+length;
161
162 /* the substring must start before preLimit */
163 preLimit=limit-subLength;
164
165 while(s!=preLimit) {
166 c=*s++;
167 if(c==cs) {
168 /* found first substring UChar, compare rest */
169 p=s;
170 q=sub;
171 for(;;) {
172 if(q==subLimit) {
173 if(isMatchAtCPBoundary(start, s-1, p, limit)) {
174 return (UChar *)(s-1); /* well-formed match */
175 } else {
176 break; /* no match because surrogate pair is split */
177 }
178 }
179 if(*p!=*q) {
180 break; /* no match */
181 }
182 ++p;
183 ++q;
184 }
185 }
186 }
187 }
188
189 /* not found */
190 return NULL;
191 }
192
193 U_CAPI UChar * U_EXPORT2
194 u_strstr(const UChar *s, const UChar *substring) {
195 return u_strFindFirst(s, -1, substring, -1);
196 }
197
198 U_CAPI UChar * U_EXPORT2
199 u_strchr(const UChar *s, UChar c) {
200 if(U16_IS_SURROGATE(c)) {
201 /* make sure to not find half of a surrogate pair */
202 return u_strFindFirst(s, -1, &c, 1);
203 } else {
204 UChar cs;
205
206 /* trivial search for a BMP code point */
207 for(;;) {
208 if((cs=*s)==c) {
209 return (UChar *)s;
210 }
211 if(cs==0) {
212 return NULL;
213 }
214 ++s;
215 }
216 }
217 }
218
219 U_CAPI UChar * U_EXPORT2
220 u_strchr32(const UChar *s, UChar32 c) {
221 if((uint32_t)c<=U_BMP_MAX) {
222 /* find BMP code point */
223 return u_strchr(s, (UChar)c);
224 } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
225 /* find supplementary code point as surrogate pair */
226 UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
227
228 while((cs=*s++)!=0) {
229 if(cs==lead && *s==trail) {
230 return (UChar *)(s-1);
231 }
232 }
233 return NULL;
234 } else {
235 /* not a Unicode code point, not findable */
236 return NULL;
237 }
238 }
239
240 U_CAPI UChar * U_EXPORT2
241 u_memchr(const UChar *s, UChar c, int32_t count) {
242 if(count<=0) {
243 return NULL; /* no string */
244 } else if(U16_IS_SURROGATE(c)) {
245 /* make sure to not find half of a surrogate pair */
246 return u_strFindFirst(s, count, &c, 1);
247 } else {
248 /* trivial search for a BMP code point */
249 const UChar *limit=s+count;
250 do {
251 if(*s==c) {
252 return (UChar *)s;
253 }
254 } while(++s!=limit);
255 return NULL;
256 }
257 }
258
259 U_CAPI UChar * U_EXPORT2
260 u_memchr32(const UChar *s, UChar32 c, int32_t count) {
261 if((uint32_t)c<=U_BMP_MAX) {
262 /* find BMP code point */
263 return u_memchr(s, (UChar)c, count);
264 } else if(count<2) {
265 /* too short for a surrogate pair */
266 return NULL;
267 } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
268 /* find supplementary code point as surrogate pair */
269 const UChar *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */
270 UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
271
272 do {
273 if(*s==lead && *(s+1)==trail) {
274 return (UChar *)s;
275 }
276 } while(++s!=limit);
277 return NULL;
278 } else {
279 /* not a Unicode code point, not findable */
280 return NULL;
281 }
282 }
283
284 /* Backward binary string search functions ---------------------------------- */
285
286 U_CAPI UChar * U_EXPORT2
287 u_strFindLast(const UChar *s, int32_t length,
288 const UChar *sub, int32_t subLength) {
289 const UChar *start, *limit, *p, *q, *subLimit;
290 UChar c, cs;
291
292 if(sub==NULL || subLength<-1) {
293 return (UChar *)s;
294 }
295 if(s==NULL || length<-1) {
296 return NULL;
297 }
298
299 /*
300 * This implementation is more lazy than the one for u_strFindFirst():
301 * There is no special search code for NUL-terminated strings.
302 * It does not seem to be worth it for searching substrings to
303 * search forward and find all matches like in u_strrchr() and similar.
304 * Therefore, we simply get both string lengths and search backward.
305 *
306 * markus 2002oct23
307 */
308
309 if(subLength<0) {
310 subLength=u_strlen(sub);
311 }
312 if(subLength==0) {
313 return (UChar *)s;
314 }
315
316 /* get sub[subLength-1] to search for it fast */
317 subLimit=sub+subLength;
318 cs=*(--subLimit);
319 --subLength;
320
321 if(subLength==0 && !U16_IS_SURROGATE(cs)) {
322 /* the substring consists of a single, non-surrogate BMP code point */
323 return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length);
324 }
325
326 if(length<0) {
327 length=u_strlen(s);
328 }
329
330 /* subLength was decremented above */
331 if(length<=subLength) {
332 return NULL; /* s is shorter than sub */
333 }
334
335 start=s;
336 limit=s+length;
337
338 /* the substring must start no later than s+subLength */
339 s+=subLength;
340
341 while(s!=limit) {
342 c=*(--limit);
343 if(c==cs) {
344 /* found last substring UChar, compare rest */
345 p=limit;
346 q=subLimit;
347 for(;;) {
348 if(q==sub) {
349 if(isMatchAtCPBoundary(start, p, limit+1, start+length)) {
350 return (UChar *)p; /* well-formed match */
351 } else {
352 break; /* no match because surrogate pair is split */
353 }
354 }
355 if(*(--p)!=*(--q)) {
356 break; /* no match */
357 }
358 }
359 }
360 }
361
362 /* not found */
363 return NULL;
364 }
365
366 U_CAPI UChar * U_EXPORT2
367 u_strrstr(const UChar *s, const UChar *substring) {
368 return u_strFindLast(s, -1, substring, -1);
369 }
370
371 U_CAPI UChar * U_EXPORT2
372 u_strrchr(const UChar *s, UChar c) {
373 if(U16_IS_SURROGATE(c)) {
374 /* make sure to not find half of a surrogate pair */
375 return u_strFindLast(s, -1, &c, 1);
376 } else {
377 const UChar *result=NULL;
378 UChar cs;
379
380 /* trivial search for a BMP code point */
381 for(;;) {
382 if((cs=*s)==c) {
383 result=s;
384 }
385 if(cs==0) {
386 return (UChar *)result;
387 }
388 ++s;
389 }
390 }
391 }
392
393 U_CAPI UChar * U_EXPORT2
394 u_strrchr32(const UChar *s, UChar32 c) {
395 if((uint32_t)c<=U_BMP_MAX) {
396 /* find BMP code point */
397 return u_strrchr(s, (UChar)c);
398 } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
399 /* find supplementary code point as surrogate pair */
400 const UChar *result=NULL;
401 UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
402
403 while((cs=*s++)!=0) {
404 if(cs==lead && *s==trail) {
405 result=s-1;
406 }
407 }
408 return (UChar *)result;
409 } else {
410 /* not a Unicode code point, not findable */
411 return NULL;
412 }
413 }
414
415 U_CAPI UChar * U_EXPORT2
416 u_memrchr(const UChar *s, UChar c, int32_t count) {
417 if(count<=0) {
418 return NULL; /* no string */
419 } else if(U16_IS_SURROGATE(c)) {
420 /* make sure to not find half of a surrogate pair */
421 return u_strFindLast(s, count, &c, 1);
422 } else {
423 /* trivial search for a BMP code point */
424 const UChar *limit=s+count;
425 do {
426 if(*(--limit)==c) {
427 return (UChar *)limit;
428 }
429 } while(s!=limit);
430 return NULL;
431 }
432 }
433
434 U_CAPI UChar * U_EXPORT2
435 u_memrchr32(const UChar *s, UChar32 c, int32_t count) {
436 if((uint32_t)c<=U_BMP_MAX) {
437 /* find BMP code point */
438 return u_memrchr(s, (UChar)c, count);
439 } else if(count<2) {
440 /* too short for a surrogate pair */
441 return NULL;
442 } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
443 /* find supplementary code point as surrogate pair */
444 const UChar *limit=s+count-1;
445 UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
446
447 do {
448 if(*limit==trail && *(limit-1)==lead) {
449 return (UChar *)(limit-1);
450 }
451 } while(s!=--limit);
452 return NULL;
453 } else {
454 /* not a Unicode code point, not findable */
455 return NULL;
456 }
457 }
458
459 /* Tokenization functions --------------------------------------------------- */
460
461 /*
462 * Match each code point in a string against each code point in the matchSet.
463 * Return the index of the first string code point that
464 * is (polarity==TRUE) or is not (FALSE) contained in the matchSet.
465 * Return -(string length)-1 if there is no such code point.
466 */
467 static int32_t
468 _matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) {
469 int32_t matchLen, matchBMPLen, strItr, matchItr;
470 UChar32 stringCh, matchCh;
471 UChar c, c2;
472
473 /* first part of matchSet contains only BMP code points */
474 matchBMPLen = 0;
475 while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) {
476 ++matchBMPLen;
477 }
478
479 /* second part of matchSet contains BMP and supplementary code points */
480 matchLen = matchBMPLen;
481 while(matchSet[matchLen] != 0) {
482 ++matchLen;
483 }
484
485 for(strItr = 0; (c = string[strItr]) != 0;) {
486 ++strItr;
487 if(U16_IS_SINGLE(c)) {
488 if(polarity) {
489 for(matchItr = 0; matchItr < matchLen; ++matchItr) {
490 if(c == matchSet[matchItr]) {
491 return strItr - 1; /* one matches */
492 }
493 }
494 } else {
495 for(matchItr = 0; matchItr < matchLen; ++matchItr) {
496 if(c == matchSet[matchItr]) {
497 goto endloop;
498 }
499 }
500 return strItr - 1; /* none matches */
501 }
502 } else {
503 /*
504 * No need to check for string length before U16_IS_TRAIL
505 * because c2 could at worst be the terminating NUL.
506 */
507 if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) {
508 ++strItr;
509 stringCh = U16_GET_SUPPLEMENTARY(c, c2);
510 } else {
511 stringCh = c; /* unpaired trail surrogate */
512 }
513
514 if(polarity) {
515 for(matchItr = matchBMPLen; matchItr < matchLen;) {
516 U16_NEXT(matchSet, matchItr, matchLen, matchCh);
517 if(stringCh == matchCh) {
518 return strItr - U16_LENGTH(stringCh); /* one matches */
519 }
520 }
521 } else {
522 for(matchItr = matchBMPLen; matchItr < matchLen;) {
523 U16_NEXT(matchSet, matchItr, matchLen, matchCh);
524 if(stringCh == matchCh) {
525 goto endloop;
526 }
527 }
528 return strItr - U16_LENGTH(stringCh); /* none matches */
529 }
530 }
531 endloop:
532 /* wish C had continue with labels like Java... */;
533 }
534
535 /* Didn't find it. */
536 return -strItr-1;
537 }
538
539 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
540 U_CAPI UChar * U_EXPORT2
541 u_strpbrk(const UChar *string, const UChar *matchSet)
542 {
543 int32_t idx = _matchFromSet(string, matchSet, TRUE);
544 if(idx >= 0) {
545 return (UChar *)string + idx;
546 } else {
547 return NULL;
548 }
549 }
550
551 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
552 U_CAPI int32_t U_EXPORT2
553 u_strcspn(const UChar *string, const UChar *matchSet)
554 {
555 int32_t idx = _matchFromSet(string, matchSet, TRUE);
556 if(idx >= 0) {
557 return idx;
558 } else {
559 return -idx - 1; /* == u_strlen(string) */
560 }
561 }
562
563 /* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
564 U_CAPI int32_t U_EXPORT2
565 u_strspn(const UChar *string, const UChar *matchSet)
566 {
567 int32_t idx = _matchFromSet(string, matchSet, FALSE);
568 if(idx >= 0) {
569 return idx;
570 } else {
571 return -idx - 1; /* == u_strlen(string) */
572 }
573 }
574
575 /* ----- Text manipulation functions --- */
576
577 U_CAPI UChar* U_EXPORT2
578 u_strtok_r(UChar *src,
579 const UChar *delim,
580 UChar **saveState)
581 {
582 UChar *tokSource;
583 UChar *nextToken;
584 uint32_t nonDelimIdx;
585
586 /* If saveState is NULL, the user messed up. */
587 if (src != NULL) {
588 tokSource = src;
589 *saveState = src; /* Set to "src" in case there are no delimiters */
590 }
591 else if (*saveState) {
592 tokSource = *saveState;
593 }
594 else {
595 /* src == NULL && *saveState == NULL */
596 /* This shouldn't happen. We already finished tokenizing. */
597 return NULL;
598 }
599
600 /* Skip initial delimiters */
601 nonDelimIdx = u_strspn(tokSource, delim);
602 tokSource = &tokSource[nonDelimIdx];
603
604 if (*tokSource) {
605 nextToken = u_strpbrk(tokSource, delim);
606 if (nextToken != NULL) {
607 /* Create a token */
608 *(nextToken++) = 0;
609 *saveState = nextToken;
610 return tokSource;
611 }
612 else if (*saveState) {
613 /* Return the last token */
614 *saveState = NULL;
615 return tokSource;
616 }
617 }
618 else {
619 /* No tokens were found. Only delimiters were left. */
620 *saveState = NULL;
621 }
622 return NULL;
623 }
624
625 /* Miscellaneous functions -------------------------------------------------- */
626
627 U_CAPI UChar* U_EXPORT2
628 u_strcat(UChar *dst,
629 const UChar *src)
630 {
631 UChar *anchor = dst; /* save a pointer to start of dst */
632
633 while(*dst != 0) { /* To end of first string */
634 ++dst;
635 }
636 while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */
637 }
638
639 return anchor;
640 }
641
642 U_CAPI UChar* U_EXPORT2
643 u_strncat(UChar *dst,
644 const UChar *src,
645 int32_t n )
646 {
647 if(n > 0) {
648 UChar *anchor = dst; /* save a pointer to start of dst */
649
650 while(*dst != 0) { /* To end of first string */
651 ++dst;
652 }
653 while((*dst = *src) != 0) { /* copy string 2 over */
654 ++dst;
655 if(--n == 0) {
656 *dst = 0;
657 break;
658 }
659 ++src;
660 }
661
662 return anchor;
663 } else {
664 return dst;
665 }
666 }
667
668 /* ----- Text property functions --- */
669
670 U_CAPI int32_t U_EXPORT2
671 u_strcmp(const UChar *s1,
672 const UChar *s2)
673 {
674 UChar c1, c2;
675
676 for(;;) {
677 c1=*s1++;
678 c2=*s2++;
679 if (c1 != c2 || c1 == 0) {
680 break;
681 }
682 }
683 return (int32_t)c1 - (int32_t)c2;
684 }
685
686 U_CFUNC int32_t U_EXPORT2
687 uprv_strCompare(const UChar *s1, int32_t length1,
688 const UChar *s2, int32_t length2,
689 UBool strncmpStyle, UBool codePointOrder) {
690 const UChar *start1, *start2, *limit1, *limit2;
691 UChar c1, c2;
692
693 /* setup for fix-up */
694 start1=s1;
695 start2=s2;
696
697 /* compare identical prefixes - they do not need to be fixed up */
698 if(length1<0 && length2<0) {
699 /* strcmp style, both NUL-terminated */
700 if(s1==s2) {
701 return 0;
702 }
703
704 for(;;) {
705 c1=*s1;
706 c2=*s2;
707 if(c1!=c2) {
708 break;
709 }
710 if(c1==0) {
711 return 0;
712 }
713 ++s1;
714 ++s2;
715 }
716
717 /* setup for fix-up */
718 limit1=limit2=NULL;
719 } else if(strncmpStyle) {
720 /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
721 if(s1==s2) {
722 return 0;
723 }
724
725 limit1=start1+length1;
726
727 for(;;) {
728 /* both lengths are same, check only one limit */
729 if(s1==limit1) {
730 return 0;
731 }
732
733 c1=*s1;
734 c2=*s2;
735 if(c1!=c2) {
736 break;
737 }
738 if(c1==0) {
739 return 0;
740 }
741 ++s1;
742 ++s2;
743 }
744
745 /* setup for fix-up */
746 limit2=start2+length1; /* use length1 here, too, to enforce assumption */
747 } else {
748 /* memcmp/UnicodeString style, both length-specified */
749 int32_t lengthResult;
750
751 if(length1<0) {
752 length1=u_strlen(s1);
753 }
754 if(length2<0) {
755 length2=u_strlen(s2);
756 }
757
758 /* limit1=start1+min(lenght1, length2) */
759 if(length1<length2) {
760 lengthResult=-1;
761 limit1=start1+length1;
762 } else if(length1==length2) {
763 lengthResult=0;
764 limit1=start1+length1;
765 } else /* length1>length2 */ {
766 lengthResult=1;
767 limit1=start1+length2;
768 }
769
770 if(s1==s2) {
771 return lengthResult;
772 }
773
774 for(;;) {
775 /* check pseudo-limit */
776 if(s1==limit1) {
777 return lengthResult;
778 }
779
780 c1=*s1;
781 c2=*s2;
782 if(c1!=c2) {
783 break;
784 }
785 ++s1;
786 ++s2;
787 }
788
789 /* setup for fix-up */
790 limit1=start1+length1;
791 limit2=start2+length2;
792 }
793
794 /* if both values are in or above the surrogate range, fix them up */
795 if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
796 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
797 if(
798 (c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) ||
799 (U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1)))
800 ) {
801 /* part of a surrogate pair, leave >=d800 */
802 } else {
803 /* BMP code point - may be surrogate code point - make <d800 */
804 c1-=0x2800;
805 }
806
807 if(
808 (c2<=0xdbff && (s2+1)!=limit2 && U16_IS_TRAIL(*(s2+1))) ||
809 (U16_IS_TRAIL(c2) && start2!=s2 && U16_IS_LEAD(*(s2-1)))
810 ) {
811 /* part of a surrogate pair, leave >=d800 */
812 } else {
813 /* BMP code point - may be surrogate code point - make <d800 */
814 c2-=0x2800;
815 }
816 }
817
818 /* now c1 and c2 are in the requested (code unit or code point) order */
819 return (int32_t)c1-(int32_t)c2;
820 }
821
822 /*
823 * Compare two strings as presented by UCharIterators.
824 * Use code unit or code point order.
825 * When the function returns, it is undefined where the iterators
826 * have stopped.
827 */
828 U_CAPI int32_t U_EXPORT2
829 u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) {
830 UChar32 c1, c2;
831
832 /* argument checking */
833 if(iter1==NULL || iter2==NULL) {
834 return 0; /* bad arguments */
835 }
836 if(iter1==iter2) {
837 return 0; /* identical iterators */
838 }
839
840 /* reset iterators to start? */
841 iter1->move(iter1, 0, UITER_START);
842 iter2->move(iter2, 0, UITER_START);
843
844 /* compare identical prefixes - they do not need to be fixed up */
845 for(;;) {
846 c1=iter1->next(iter1);
847 c2=iter2->next(iter2);
848 if(c1!=c2) {
849 break;
850 }
851 if(c1==-1) {
852 return 0;
853 }
854 }
855
856 /* if both values are in or above the surrogate range, fix them up */
857 if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
858 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
859 if(
860 (c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) ||
861 (U16_IS_TRAIL(c1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1))))
862 ) {
863 /* part of a surrogate pair, leave >=d800 */
864 } else {
865 /* BMP code point - may be surrogate code point - make <d800 */
866 c1-=0x2800;
867 }
868
869 if(
870 (c2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) ||
871 (U16_IS_TRAIL(c2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2))))
872 ) {
873 /* part of a surrogate pair, leave >=d800 */
874 } else {
875 /* BMP code point - may be surrogate code point - make <d800 */
876 c2-=0x2800;
877 }
878 }
879
880 /* now c1 and c2 are in the requested (code unit or code point) order */
881 return (int32_t)c1-(int32_t)c2;
882 }
883
884 #if 0
885 /*
886 * u_strCompareIter() does not leave the iterators _on_ the different units.
887 * This is possible but would cost a few extra indirect function calls to back
888 * up if the last unit (c1 or c2 respectively) was >=0.
889 *
890 * Consistently leaving them _behind_ the different units is not an option
891 * because the current "unit" is the end of the string if that is reached,
892 * and in such a case the iterator does not move.
893 * For example, when comparing "ab" with "abc", both iterators rest _on_ the end
894 * of their strings. Calling previous() on each does not move them to where
895 * the comparison fails.
896 *
897 * So the simplest semantics is to not define where the iterators end up.
898 *
899 * The following fragment is part of what would need to be done for backing up.
900 */
901 void fragment {
902 /* iff a surrogate is part of a surrogate pair, leave >=d800 */
903 if(c1<=0xdbff) {
904 if(!U16_IS_TRAIL(iter1->current(iter1))) {
905 /* lead surrogate code point - make <d800 */
906 c1-=0x2800;
907 }
908 } else if(c1<=0xdfff) {
909 int32_t idx=iter1->getIndex(iter1, UITER_CURRENT);
910 iter1->previous(iter1); /* ==c1 */
911 if(!U16_IS_LEAD(iter1->previous(iter1))) {
912 /* trail surrogate code point - make <d800 */
913 c1-=0x2800;
914 }
915 /* go back to behind where the difference is */
916 iter1->move(iter1, idx, UITER_ZERO);
917 } else /* 0xe000<=c1<=0xffff */ {
918 /* BMP code point - make <d800 */
919 c1-=0x2800;
920 }
921 }
922 #endif
923
924 U_CAPI int32_t U_EXPORT2
925 u_strCompare(const UChar *s1, int32_t length1,
926 const UChar *s2, int32_t length2,
927 UBool codePointOrder) {
928 /* argument checking */
929 if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
930 return 0;
931 }
932 return uprv_strCompare(s1, length1, s2, length2, FALSE, codePointOrder);
933 }
934
935 /* String compare in code point order - u_strcmp() compares in code unit order. */
936 U_CAPI int32_t U_EXPORT2
937 u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) {
938 return uprv_strCompare(s1, -1, s2, -1, FALSE, TRUE);
939 }
940
941 U_CAPI int32_t U_EXPORT2
942 u_strncmp(const UChar *s1,
943 const UChar *s2,
944 int32_t n)
945 {
946 if(n > 0) {
947 int32_t rc;
948 for(;;) {
949 rc = (int32_t)*s1 - (int32_t)*s2;
950 if(rc != 0 || *s1 == 0 || --n == 0) {
951 return rc;
952 }
953 ++s1;
954 ++s2;
955 }
956 } else {
957 return 0;
958 }
959 }
960
961 U_CAPI int32_t U_EXPORT2
962 u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) {
963 return uprv_strCompare(s1, n, s2, n, TRUE, TRUE);
964 }
965
966 U_CAPI UChar* U_EXPORT2
967 u_strcpy(UChar *dst,
968 const UChar *src)
969 {
970 UChar *anchor = dst; /* save a pointer to start of dst */
971
972 while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */
973 }
974
975 return anchor;
976 }
977
978 U_CAPI UChar* U_EXPORT2
979 u_strncpy(UChar *dst,
980 const UChar *src,
981 int32_t n)
982 {
983 UChar *anchor = dst; /* save a pointer to start of dst */
984
985 /* copy string 2 over */
986 while(n > 0 && (*(dst++) = *(src++)) != 0) {
987 --n;
988 }
989
990 return anchor;
991 }
992
993 U_CAPI int32_t U_EXPORT2
994 u_strlen(const UChar *s)
995 {
996 #if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
997 return (int32_t)uprv_wcslen((const wchar_t *)s);
998 #else
999 const UChar *t = s;
1000 while(*t != 0) {
1001 ++t;
1002 }
1003 return t - s;
1004 #endif
1005 }
1006
1007 U_CAPI int32_t U_EXPORT2
1008 u_countChar32(const UChar *s, int32_t length) {
1009 int32_t count;
1010
1011 if(s==NULL || length<-1) {
1012 return 0;
1013 }
1014
1015 count=0;
1016 if(length>=0) {
1017 while(length>0) {
1018 ++count;
1019 if(U16_IS_LEAD(*s) && length>=2 && U16_IS_TRAIL(*(s+1))) {
1020 s+=2;
1021 length-=2;
1022 } else {
1023 ++s;
1024 --length;
1025 }
1026 }
1027 } else /* length==-1 */ {
1028 UChar c;
1029
1030 for(;;) {
1031 if((c=*s++)==0) {
1032 break;
1033 }
1034 ++count;
1035
1036 /*
1037 * sufficient to look ahead one because of UTF-16;
1038 * safe to look ahead one because at worst that would be the terminating NUL
1039 */
1040 if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
1041 ++s;
1042 }
1043 }
1044 }
1045 return count;
1046 }
1047
1048 U_CAPI UBool U_EXPORT2
1049 u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) {
1050
1051 if(number<0) {
1052 return TRUE;
1053 }
1054 if(s==NULL || length<-1) {
1055 return FALSE;
1056 }
1057
1058 if(length==-1) {
1059 /* s is NUL-terminated */
1060 UChar c;
1061
1062 /* count code points until they exceed */
1063 for(;;) {
1064 if((c=*s++)==0) {
1065 return FALSE;
1066 }
1067 if(number==0) {
1068 return TRUE;
1069 }
1070 if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
1071 ++s;
1072 }
1073 --number;
1074 }
1075 } else {
1076 /* length>=0 known */
1077 const UChar *limit;
1078 int32_t maxSupplementary;
1079
1080 /* s contains at least (length+1)/2 code points: <=2 UChars per cp */
1081 if(((length+1)/2)>number) {
1082 return TRUE;
1083 }
1084
1085 /* check if s does not even contain enough UChars */
1086 maxSupplementary=length-number;
1087 if(maxSupplementary<=0) {
1088 return FALSE;
1089 }
1090 /* there are maxSupplementary=length-number more UChars than asked-for code points */
1091
1092 /*
1093 * count code points until they exceed and also check that there are
1094 * no more than maxSupplementary supplementary code points (UChar pairs)
1095 */
1096 limit=s+length;
1097 for(;;) {
1098 if(s==limit) {
1099 return FALSE;
1100 }
1101 if(number==0) {
1102 return TRUE;
1103 }
1104 if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) {
1105 ++s;
1106 if(--maxSupplementary<=0) {
1107 /* too many pairs - too few code points */
1108 return FALSE;
1109 }
1110 }
1111 --number;
1112 }
1113 }
1114 }
1115
1116 /* ----- String validation functions --- */
1117
1118 /*
1119 * Check whether the string is well-formed according to various criteria:
1120 * - No code points that are defined as non-characters (e.g. 0xFFFF) or are undefined in
1121 * the version of Unicode currently supported.
1122 * - No isolated surrogate code points.
1123 * - No overly-long sequences of non-starter combining marks, i.e. more than 30 characters
1124 * in a row with non-zero combining class (which may have category Mn or Mc); this
1125 * violates Stream-Safe Text Format per UAX #15. This test does not ensure that the
1126 * string satisfies Stream-Safe Text Format (because it does not convert to NFKC first),
1127 * but any string that fails this test is certainly not Stream-Safe.
1128 * - No emoji variation selectors applied to non-emoji code points. This function may
1129 * also check for other non-standard variation sequences.
1130 * - No tag sequences that are ill-formed per definition ED-14a in UTS #51 (e.g. tag
1131 * sequences must have an emoji base and a terminator).
1132 *
1133 * @internal Apple only
1134 */
1135 enum { kBidiMaxDepth = 125 };
1136
1137 static UBool isWellFormed(UChar32 c, UChar32 cLast, int32_t *nonStarterCountP, UBool *inTagSeqP,
1138 uint8_t* dirStatus, int32_t* dirStatusIndexP, int32_t* validIsolateCountP) {
1139 if (*inTagSeqP) {
1140 // can only have tag_spec or tag_term
1141 if (c == 0xE007F) { // tag_term
1142 *inTagSeqP = FALSE;
1143 } else if (c < 0xE0020 || c > 0xE007E) {
1144 return FALSE;
1145 }
1146 } else if (c < 0x0300) {
1147 // Everything in this range (includes ASCII) is a valid character with combining class 0
1148 *nonStarterCountP = 0;
1149 if (c == 0x000A || c == 0x000D || c == 0x0085 || (c >= 0x001C && c <= 0x001E)) {
1150 // paragraph sep, reset bidi
1151 *dirStatusIndexP = 0;
1152 *validIsolateCountP = 0;
1153 }
1154 } else if ((c >= 0x2029 && c <= 0x202E) || (c >= 0x2066 && c <= 0x2069)) {
1155 // para sep & bidi controls, all have combining class 0. The bidi control actions here
1156 // are from [https://www.unicode.org/reports/tr9/#Explicit_Levels_and_Directions]
1157 *nonStarterCountP = 0;
1158 if (c == 0x2029) { // paragraph sep, reset bidi
1159 *dirStatusIndexP = 0;
1160 *validIsolateCountP = 0;
1161 } else if (c == 0x2069) { // PDI
1162 if (*validIsolateCountP > 0) {
1163 while (*dirStatusIndexP > 0 && (dirStatus[(*dirStatusIndexP)--] & 0x80) == 0);
1164 (*validIsolateCountP)--;
1165 }
1166 } else if (c == 0x202C) { // PDF
1167 if (*dirStatusIndexP > 0 && (dirStatus[*dirStatusIndexP] & 0x80) == 0) {
1168 (*dirStatusIndexP)--;
1169 }
1170 } else {
1171 // embedding/override initiator. Need to increment the level by at least 1, and possibly 2 if the
1172 // embedding/override direction matches the current direction (i.e. R and current odd, or L and current even).
1173 // Since we increment first, the test for odd/even is flipped. For FSI, we do not actually determine
1174 // whether it should be treated as RLI or LRI, so we just do the minimum increment.
1175 uint8_t newEntry = (dirStatus[*dirStatusIndexP] & 0x7F) + 1; // min increment, flips odd/even status compared to current
1176 if ( ((c == 0x202B || c == 0x202E || c == 0x2067) && (newEntry & 0x01) == 0) || // RLE/RLO/RLI and current was odd
1177 ((c == 0x202A || c == 0x202D || c == 0x2066) && (newEntry & 0x01) != 0) ) { // LRE/LRO/LRI and current was even
1178 newEntry++;
1179 }
1180 if (newEntry > kBidiMaxDepth || *dirStatusIndexP > kBidiMaxDepth) {
1181 return FALSE; // Checking for this is the whole point.
1182 }
1183 if (c >= 0x2066 && c <= 0x2068) { // LRI/RLI/FSI
1184 newEntry |= 0x80; // set directional isolate status
1185 (*validIsolateCountP)++;
1186 }
1187 dirStatus[++(*dirStatusIndexP)] = newEntry;
1188 }
1189 } else if (c == 0xFE0F) { // emoji variation selector
1190 if (!u_isEmoji(cLast)) { // previous char must be emoji
1191 return FALSE;
1192 }
1193 // previous character would have set *nonStarterCountP = 0;
1194 } else if (c >= 0xE0020 && c <= 0xE007E) { // tag_spec
1195 if (!u_isEmoji(cLast) && cLast != 0xFE0F) { // previous char must be emoji or FE0F
1196 return FALSE;
1197 }
1198 *inTagSeqP = TRUE;
1199 // previous character would have set *nonStarterCountP = 0;
1200 } else if (c == 0xE007F) { // tag_term
1201 return FALSE;
1202 } else {
1203 // we have checked specific ranges/chars, now check general info for others
1204 int8_t genCat = u_charType(c);
1205 if (genCat == U_UNASSIGNED || genCat == U_SURROGATE) {
1206 return FALSE;
1207 }
1208 if ((genCat == U_NON_SPACING_MARK || genCat == U_COMBINING_SPACING_MARK) && u_getCombiningClass(c) != 0) {
1209 // non-starter
1210 if (++(*nonStarterCountP) > 30) {
1211 return FALSE;
1212 }
1213 } else {
1214 *nonStarterCountP = 0;
1215 }
1216 }
1217 return TRUE;
1218 }
1219
1220 U_CAPI UBool U_EXPORT2
1221 u_strIsWellFormed(const UChar *s, int32_t length) {
1222 if (s==NULL || length<-1) {
1223 return FALSE;
1224 }
1225 UChar32 c, c2, cLast = 0;
1226 int32_t nonStarterCount = 0;
1227 UBool inTagSeq = FALSE;
1228 uint8_t dirStatus[kBidiMaxDepth + 3]; // low 7 bits is embed level, high bit is direction override status
1229 int32_t dirStatusIndex = 0;
1230 int32_t validIsolateCount = 0;
1231 dirStatus[0] = 0; // assume initial paragraph direction L (most conservative)
1232 if (length < 0) {
1233 // NUL terminated
1234 while ((c = *s++) != 0) {
1235 // get next UChar32 c
1236 if (U16_IS_LEAD(c)) {
1237 if (U16_IS_TRAIL(c2 = *s)) {
1238 s++;
1239 c = U16_GET_SUPPLEMENTARY(c,c2);
1240 }
1241 }
1242 // check current c
1243 if (!isWellFormed(c, cLast, &nonStarterCount, &inTagSeq, dirStatus, &dirStatusIndex, &validIsolateCount)) {
1244 return FALSE;
1245 }
1246 // setup next iteration
1247 cLast = c;
1248 }
1249 } else {
1250 // use length
1251 const UChar *sLimit = s + length;
1252 while (s < sLimit) {
1253 // get next UChar32 c
1254 c = *s++;
1255 if (U16_IS_LEAD(c)) {
1256 if (s < sLimit && U16_IS_TRAIL(c2 = *s)) {
1257 s++;
1258 c = U16_GET_SUPPLEMENTARY(c,c2);
1259 }
1260 }
1261 // check current c
1262 if (!isWellFormed(c, cLast, &nonStarterCount, &inTagSeq, dirStatus, &dirStatusIndex, &validIsolateCount)) {
1263 return FALSE;
1264 }
1265 // setup next iteration
1266 cLast = c;
1267 }
1268 }
1269 return TRUE;
1270 }
1271
1272 /* ----- U_mem functions --- */
1273
1274 U_CAPI UChar * U_EXPORT2
1275 u_memcpy(UChar *dest, const UChar *src, int32_t count) {
1276 if(count > 0) {
1277 uprv_memcpy(dest, src, (size_t)count*U_SIZEOF_UCHAR);
1278 }
1279 return dest;
1280 }
1281
1282 U_CAPI UChar * U_EXPORT2
1283 u_memmove(UChar *dest, const UChar *src, int32_t count) {
1284 if(count > 0) {
1285 uprv_memmove(dest, src, (size_t)count*U_SIZEOF_UCHAR);
1286 }
1287 return dest;
1288 }
1289
1290 U_CAPI UChar * U_EXPORT2
1291 u_memset(UChar *dest, UChar c, int32_t count) {
1292 if(count > 0) {
1293 UChar *ptr = dest;
1294 UChar *limit = dest + count;
1295
1296 while (ptr < limit) {
1297 *(ptr++) = c;
1298 }
1299 }
1300 return dest;
1301 }
1302
1303 U_CAPI int32_t U_EXPORT2
1304 u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) {
1305 if(count > 0) {
1306 const UChar *limit = buf1 + count;
1307 int32_t result;
1308
1309 while (buf1 < limit) {
1310 result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2;
1311 if (result != 0) {
1312 return result;
1313 }
1314 buf1++;
1315 buf2++;
1316 }
1317 }
1318 return 0;
1319 }
1320
1321 U_CAPI int32_t U_EXPORT2
1322 u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) {
1323 return uprv_strCompare(s1, count, s2, count, FALSE, TRUE);
1324 }
1325
1326 /* u_unescape & support fns ------------------------------------------------- */
1327
1328 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
1329 static const UChar UNESCAPE_MAP[] = {
1330 /*" 0x22, 0x22 */
1331 /*' 0x27, 0x27 */
1332 /*? 0x3F, 0x3F */
1333 /*\ 0x5C, 0x5C */
1334 /*a*/ 0x61, 0x07,
1335 /*b*/ 0x62, 0x08,
1336 /*e*/ 0x65, 0x1b,
1337 /*f*/ 0x66, 0x0c,
1338 /*n*/ 0x6E, 0x0a,
1339 /*r*/ 0x72, 0x0d,
1340 /*t*/ 0x74, 0x09,
1341 /*v*/ 0x76, 0x0b
1342 };
1343 enum { UNESCAPE_MAP_LENGTH = UPRV_LENGTHOF(UNESCAPE_MAP) };
1344
1345 /* Convert one octal digit to a numeric value 0..7, or -1 on failure */
1346 static int8_t _digit8(UChar c) {
1347 if (c >= 0x0030 && c <= 0x0037) {
1348 return (int8_t)(c - 0x0030);
1349 }
1350 return -1;
1351 }
1352
1353 /* Convert one hex digit to a numeric value 0..F, or -1 on failure */
1354 static int8_t _digit16(UChar c) {
1355 if (c >= 0x0030 && c <= 0x0039) {
1356 return (int8_t)(c - 0x0030);
1357 }
1358 if (c >= 0x0041 && c <= 0x0046) {
1359 return (int8_t)(c - (0x0041 - 10));
1360 }
1361 if (c >= 0x0061 && c <= 0x0066) {
1362 return (int8_t)(c - (0x0061 - 10));
1363 }
1364 return -1;
1365 }
1366
1367 /* Parse a single escape sequence. Although this method deals in
1368 * UChars, it does not use C++ or UnicodeString. This allows it to
1369 * be used from C contexts. */
1370 U_CAPI UChar32 U_EXPORT2
1371 u_unescapeAt(UNESCAPE_CHAR_AT charAt,
1372 int32_t *offset,
1373 int32_t length,
1374 void *context) {
1375
1376 int32_t start = *offset;
1377 UChar c;
1378 UChar32 result = 0;
1379 int8_t n = 0;
1380 int8_t minDig = 0;
1381 int8_t maxDig = 0;
1382 int8_t bitsPerDigit = 4;
1383 int8_t dig;
1384 int32_t i;
1385 UBool braces = FALSE;
1386
1387 /* Check that offset is in range */
1388 if (*offset < 0 || *offset >= length) {
1389 goto err;
1390 }
1391
1392 /* Fetch first UChar after '\\' */
1393 c = charAt((*offset)++, context);
1394
1395 /* Convert hexadecimal and octal escapes */
1396 switch (c) {
1397 case 0x0075 /*'u'*/:
1398 minDig = maxDig = 4;
1399 break;
1400 case 0x0055 /*'U'*/:
1401 minDig = maxDig = 8;
1402 break;
1403 case 0x0078 /*'x'*/:
1404 minDig = 1;
1405 if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) {
1406 ++(*offset);
1407 braces = TRUE;
1408 maxDig = 8;
1409 } else {
1410 maxDig = 2;
1411 }
1412 break;
1413 default:
1414 dig = _digit8(c);
1415 if (dig >= 0) {
1416 minDig = 1;
1417 maxDig = 3;
1418 n = 1; /* Already have first octal digit */
1419 bitsPerDigit = 3;
1420 result = dig;
1421 }
1422 break;
1423 }
1424 if (minDig != 0) {
1425 while (*offset < length && n < maxDig) {
1426 c = charAt(*offset, context);
1427 dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
1428 if (dig < 0) {
1429 break;
1430 }
1431 result = (result << bitsPerDigit) | dig;
1432 ++(*offset);
1433 ++n;
1434 }
1435 if (n < minDig) {
1436 goto err;
1437 }
1438 if (braces) {
1439 if (c != 0x7D /*}*/) {
1440 goto err;
1441 }
1442 ++(*offset);
1443 }
1444 if (result < 0 || result >= 0x110000) {
1445 goto err;
1446 }
1447 /* If an escape sequence specifies a lead surrogate, see if
1448 * there is a trail surrogate after it, either as an escape or
1449 * as a literal. If so, join them up into a supplementary.
1450 */
1451 if (*offset < length && U16_IS_LEAD(result)) {
1452 int32_t ahead = *offset + 1;
1453 c = charAt(*offset, context);
1454 if (c == 0x5C /*'\\'*/ && ahead < length) {
1455 c = (UChar) u_unescapeAt(charAt, &ahead, length, context);
1456 }
1457 if (U16_IS_TRAIL(c)) {
1458 *offset = ahead;
1459 result = U16_GET_SUPPLEMENTARY(result, c);
1460 }
1461 }
1462 return result;
1463 }
1464
1465 /* Convert C-style escapes in table */
1466 for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
1467 if (c == UNESCAPE_MAP[i]) {
1468 return UNESCAPE_MAP[i+1];
1469 } else if (c < UNESCAPE_MAP[i]) {
1470 break;
1471 }
1472 }
1473
1474 /* Map \cX to control-X: X & 0x1F */
1475 if (c == 0x0063 /*'c'*/ && *offset < length) {
1476 c = charAt((*offset)++, context);
1477 if (U16_IS_LEAD(c) && *offset < length) {
1478 UChar c2 = charAt(*offset, context);
1479 if (U16_IS_TRAIL(c2)) {
1480 ++(*offset);
1481 c = (UChar) U16_GET_SUPPLEMENTARY(c, c2); /* [sic] */
1482 }
1483 }
1484 return 0x1F & c;
1485 }
1486
1487 /* If no special forms are recognized, then consider
1488 * the backslash to generically escape the next character.
1489 * Deal with surrogate pairs. */
1490 if (U16_IS_LEAD(c) && *offset < length) {
1491 UChar c2 = charAt(*offset, context);
1492 if (U16_IS_TRAIL(c2)) {
1493 ++(*offset);
1494 return U16_GET_SUPPLEMENTARY(c, c2);
1495 }
1496 }
1497 return c;
1498
1499 err:
1500 /* Invalid escape sequence */
1501 *offset = start; /* Reset to initial value */
1502 return (UChar32)0xFFFFFFFF;
1503 }
1504
1505 /* u_unescapeAt() callback to return a UChar from a char* */
1506 static UChar U_CALLCONV
1507 _charPtr_charAt(int32_t offset, void *context) {
1508 UChar c16;
1509 /* It would be more efficient to access the invariant tables
1510 * directly but there is no API for that. */
1511 u_charsToUChars(((char*) context) + offset, &c16, 1);
1512 return c16;
1513 }
1514
1515 /* Append an escape-free segment of the text; used by u_unescape() */
1516 static void _appendUChars(UChar *dest, int32_t destCapacity,
1517 const char *src, int32_t srcLen) {
1518 if (destCapacity < 0) {
1519 destCapacity = 0;
1520 }
1521 if (srcLen > destCapacity) {
1522 srcLen = destCapacity;
1523 }
1524 u_charsToUChars(src, dest, srcLen);
1525 }
1526
1527 /* Do an invariant conversion of char* -> UChar*, with escape parsing */
1528 U_CAPI int32_t U_EXPORT2
1529 u_unescape(const char *src, UChar *dest, int32_t destCapacity) {
1530 const char *segment = src;
1531 int32_t i = 0;
1532 char c;
1533
1534 while ((c=*src) != 0) {
1535 /* '\\' intentionally written as compiler-specific
1536 * character constant to correspond to compiler-specific
1537 * char* constants. */
1538 if (c == '\\') {
1539 int32_t lenParsed = 0;
1540 UChar32 c32;
1541 if (src != segment) {
1542 if (dest != NULL) {
1543 _appendUChars(dest + i, destCapacity - i,
1544 segment, (int32_t)(src - segment));
1545 }
1546 i += (int32_t)(src - segment);
1547 }
1548 ++src; /* advance past '\\' */
1549 c32 = (UChar32)u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), (void*)src);
1550 if (lenParsed == 0) {
1551 goto err;
1552 }
1553 src += lenParsed; /* advance past escape seq. */
1554 if (dest != NULL && U16_LENGTH(c32) <= (destCapacity - i)) {
1555 U16_APPEND_UNSAFE(dest, i, c32);
1556 } else {
1557 i += U16_LENGTH(c32);
1558 }
1559 segment = src;
1560 } else {
1561 ++src;
1562 }
1563 }
1564 if (src != segment) {
1565 if (dest != NULL) {
1566 _appendUChars(dest + i, destCapacity - i,
1567 segment, (int32_t)(src - segment));
1568 }
1569 i += (int32_t)(src - segment);
1570 }
1571 if (dest != NULL && i < destCapacity) {
1572 dest[i] = 0;
1573 }
1574 return i;
1575
1576 err:
1577 if (dest != NULL && destCapacity > 0) {
1578 *dest = 0;
1579 }
1580 return 0;
1581 }
1582
1583 /* NUL-termination of strings ----------------------------------------------- */
1584
/**
 * NUL-terminate a string no matter what its type.
 * Set warning and error codes accordingly.
 *
 * Does nothing if pErrorCode is NULL or already holds a failure code, or if
 * length is negative.  Otherwise:
 *   - length <  destCapacity: writes dest[length]=0 and clears a pending
 *     U_STRING_NOT_TERMINATED_WARNING (other codes are left alone);
 *   - length == destCapacity: sets U_STRING_NOT_TERMINATED_WARNING;
 *   - length >  destCapacity: sets U_BUFFER_OVERFLOW_ERROR.
 *
 * Wrapped in do { } while (0) so that an invocation followed by ';' is exactly
 * one statement and can sit in an unbraced if/else.  Arguments are
 * parenthesized but evaluated more than once, so pass side-effect-free
 * expressions.
 */
#define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) \
    do { \
        if((pErrorCode)!=NULL && U_SUCCESS(*(pErrorCode))) { \
            /* not a public function, so no complete argument checking */ \
            \
            if((length)<0) { \
                /* assume that the caller handles this */ \
            } else if((length)<(destCapacity)) { \
                /* NUL-terminate the string, the NUL fits */ \
                (dest)[(length)]=0; \
                /* unset the not-terminated warning but leave all others */ \
                if(*(pErrorCode)==U_STRING_NOT_TERMINATED_WARNING) { \
                    *(pErrorCode)=U_ZERO_ERROR; \
                } \
            } else if((length)==(destCapacity)) { \
                /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
                *(pErrorCode)=U_STRING_NOT_TERMINATED_WARNING; \
            } else /* length>destCapacity */ { \
                /* even the string itself did not fit - set an error code */ \
                *(pErrorCode)=U_BUFFER_OVERFLOW_ERROR; \
            } \
        } \
    } while (0)
1610
1611 U_CAPI int32_t U_EXPORT2
1612 u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1613 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1614 return length;
1615 }
1616
1617 U_CAPI int32_t U_EXPORT2
1618 u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1619 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1620 return length;
1621 }
1622
1623 U_CAPI int32_t U_EXPORT2
1624 u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1625 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1626 return length;
1627 }
1628
1629 U_CAPI int32_t U_EXPORT2
1630 u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1631 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1632 return length;
1633 }
1634
1635 // Compute the hash code for a string -------------------------------------- ***
1636
1637 // Moved here from uhash.c so that UnicodeString::hashCode() does not depend
1638 // on UHashtable code.
1639
/*
  Compute the hash by iterating sparsely over about 32 (up to 63)
  characters spaced evenly through the string.  For each character,
  multiply the previous hash value by a prime number and add the new
  character in, like a linear congruential random number generator,
  producing a pseudorandom deterministic value well distributed over
  the output range. [LIU]

  The macro expands to a complete function body, including the final
  return (without its semicolon).  TYPE is the element type, STR the
  string (a NULL string hashes to 0), STRLEN its length in elements, and
  DEREF an expression that yields the current element's contribution; it
  must refer to the cursor by the name `p`.
*/

#define STRING_HASH(TYPE, STR, STRLEN, DEREF) \
    uint32_t hash = 0; \
    const TYPE *p = (const TYPE*) STR; \
    if (p != NULL) { \
        const int32_t count = (int32_t)(STRLEN); \
        const int32_t stride = ((count - 32) / 32) + 1; \
        for (const TYPE *const end = p + count; p < end; p += stride) { \
            hash = (hash * 37) + DEREF; \
        } \
    } \
    return static_cast<int32_t>(hash)
1662
1663 /* Used by UnicodeString to compute its hashcode - Not public API. */
1664 U_CAPI int32_t U_EXPORT2
1665 ustr_hashUCharsN(const UChar *str, int32_t length) {
1666 STRING_HASH(UChar, str, length, *p);
1667 }
1668
1669 U_CAPI int32_t U_EXPORT2
1670 ustr_hashCharsN(const char *str, int32_t length) {
1671 STRING_HASH(uint8_t, str, length, *p);
1672 }
1673
1674 U_CAPI int32_t U_EXPORT2
1675 ustr_hashICharsN(const char *str, int32_t length) {
1676 STRING_HASH(char, str, length, (uint8_t)uprv_tolower(*p));
1677 }