]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/ustring.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ******************************************************************************
6 * Copyright (C) 1998-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 ******************************************************************************
13 * Modification History:
15 * Date Name Description
16 * 12/07/98 bertrand Creation.
17 ******************************************************************************
20 #include "unicode/utypes.h"
21 #include "unicode/putil.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ustring.h"
24 #include "unicode/utf16.h"
30 /* ANSI string.h - style functions ------------------------------------------ */
32 /* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */
33 #define U_BMP_MAX 0xffff
35 /* Forward binary string search functions ----------------------------------- */
38 * Test if a substring match inside a string is at code point boundaries.
39 * All pointers refer to the same buffer.
40 * The limit pointer may be NULL, all others must be real pointers.
43 isMatchAtCPBoundary(const UChar
*start
, const UChar
*match
, const UChar
*matchLimit
, const UChar
*limit
) {
44 if(U16_IS_TRAIL(*match
) && start
!=match
&& U16_IS_LEAD(*(match
-1))) {
45 /* the leading edge of the match is in the middle of a surrogate pair */
48 if(U16_IS_LEAD(*(matchLimit
-1)) && match
!=limit
&& U16_IS_TRAIL(*matchLimit
)) {
49 /* the trailing edge of the match is in the middle of a surrogate pair */
55 U_CAPI UChar
* U_EXPORT2
56 u_strFindFirst(const UChar
*s
, int32_t length
,
57 const UChar
*sub
, int32_t subLength
) {
58 const UChar
*start
, *p
, *q
, *subLimit
;
61 if(sub
==NULL
|| subLength
<-1) {
64 if(s
==NULL
|| length
<-1) {
70 if(length
<0 && subLength
<0) {
71 /* both strings are NUL-terminated */
75 if(*sub
==0 && !U16_IS_SURROGATE(cs
)) {
76 /* the substring consists of a single, non-surrogate BMP code point */
77 return u_strchr(s
, cs
);
82 /* found first substring UChar, compare rest */
87 if(isMatchAtCPBoundary(start
, s
-1, p
, NULL
)) {
88 return (UChar
*)(s
-1); /* well-formed match */
90 break; /* no match because surrogate pair is split */
94 return NULL
; /* no match, and none possible after s */
110 subLength
=u_strlen(sub
);
116 /* get sub[0] to search for it fast */
119 subLimit
=sub
+subLength
;
121 if(subLength
==0 && !U16_IS_SURROGATE(cs
)) {
122 /* the substring consists of a single, non-surrogate BMP code point */
123 return length
<0 ? u_strchr(s
, cs
) : u_memchr(s
, cs
, length
);
127 /* s is NUL-terminated */
130 /* found first substring UChar, compare rest */
135 if(isMatchAtCPBoundary(start
, s
-1, p
, NULL
)) {
136 return (UChar
*)(s
-1); /* well-formed match */
138 break; /* no match because surrogate pair is split */
142 return NULL
; /* no match, and none possible after s */
145 break; /* no match */
153 const UChar
*limit
, *preLimit
;
155 /* subLength was decremented above */
156 if(length
<=subLength
) {
157 return NULL
; /* s is shorter than sub */
162 /* the substring must start before preLimit */
163 preLimit
=limit
-subLength
;
168 /* found first substring UChar, compare rest */
173 if(isMatchAtCPBoundary(start
, s
-1, p
, limit
)) {
174 return (UChar
*)(s
-1); /* well-formed match */
176 break; /* no match because surrogate pair is split */
180 break; /* no match */
193 U_CAPI UChar
* U_EXPORT2
194 u_strstr(const UChar
*s
, const UChar
*substring
) {
195 return u_strFindFirst(s
, -1, substring
, -1);
198 U_CAPI UChar
* U_EXPORT2
199 u_strchr(const UChar
*s
, UChar c
) {
200 if(U16_IS_SURROGATE(c
)) {
201 /* make sure to not find half of a surrogate pair */
202 return u_strFindFirst(s
, -1, &c
, 1);
206 /* trivial search for a BMP code point */
219 U_CAPI UChar
* U_EXPORT2
220 u_strchr32(const UChar
*s
, UChar32 c
) {
221 if((uint32_t)c
<=U_BMP_MAX
) {
222 /* find BMP code point */
223 return u_strchr(s
, (UChar
)c
);
224 } else if((uint32_t)c
<=UCHAR_MAX_VALUE
) {
225 /* find supplementary code point as surrogate pair */
226 UChar cs
, lead
=U16_LEAD(c
), trail
=U16_TRAIL(c
);
228 while((cs
=*s
++)!=0) {
229 if(cs
==lead
&& *s
==trail
) {
230 return (UChar
*)(s
-1);
235 /* not a Unicode code point, not findable */
240 U_CAPI UChar
* U_EXPORT2
241 u_memchr(const UChar
*s
, UChar c
, int32_t count
) {
243 return NULL
; /* no string */
244 } else if(U16_IS_SURROGATE(c
)) {
245 /* make sure to not find half of a surrogate pair */
246 return u_strFindFirst(s
, count
, &c
, 1);
248 /* trivial search for a BMP code point */
249 const UChar
*limit
=s
+count
;
259 U_CAPI UChar
* U_EXPORT2
260 u_memchr32(const UChar
*s
, UChar32 c
, int32_t count
) {
261 if((uint32_t)c
<=U_BMP_MAX
) {
262 /* find BMP code point */
263 return u_memchr(s
, (UChar
)c
, count
);
265 /* too short for a surrogate pair */
267 } else if((uint32_t)c
<=UCHAR_MAX_VALUE
) {
268 /* find supplementary code point as surrogate pair */
269 const UChar
*limit
=s
+count
-1; /* -1 so that we do not need a separate check for the trail unit */
270 UChar lead
=U16_LEAD(c
), trail
=U16_TRAIL(c
);
273 if(*s
==lead
&& *(s
+1)==trail
) {
279 /* not a Unicode code point, not findable */
284 /* Backward binary string search functions ---------------------------------- */
286 U_CAPI UChar
* U_EXPORT2
287 u_strFindLast(const UChar
*s
, int32_t length
,
288 const UChar
*sub
, int32_t subLength
) {
289 const UChar
*start
, *limit
, *p
, *q
, *subLimit
;
292 if(sub
==NULL
|| subLength
<-1) {
295 if(s
==NULL
|| length
<-1) {
300 * This implementation is more lazy than the one for u_strFindFirst():
301 * There is no special search code for NUL-terminated strings.
302 * It does not seem to be worth it for searching substrings to
303 * search forward and find all matches like in u_strrchr() and similar.
304 * Therefore, we simply get both string lengths and search backward.
310 subLength
=u_strlen(sub
);
316 /* get sub[subLength-1] to search for it fast */
317 subLimit
=sub
+subLength
;
321 if(subLength
==0 && !U16_IS_SURROGATE(cs
)) {
322 /* the substring consists of a single, non-surrogate BMP code point */
323 return length
<0 ? u_strrchr(s
, cs
) : u_memrchr(s
, cs
, length
);
330 /* subLength was decremented above */
331 if(length
<=subLength
) {
332 return NULL
; /* s is shorter than sub */
338 /* the substring must start no later than s+subLength */
344 /* found last substring UChar, compare rest */
349 if(isMatchAtCPBoundary(start
, p
, limit
+1, start
+length
)) {
350 return (UChar
*)p
; /* well-formed match */
352 break; /* no match because surrogate pair is split */
356 break; /* no match */
366 U_CAPI UChar
* U_EXPORT2
367 u_strrstr(const UChar
*s
, const UChar
*substring
) {
368 return u_strFindLast(s
, -1, substring
, -1);
371 U_CAPI UChar
* U_EXPORT2
372 u_strrchr(const UChar
*s
, UChar c
) {
373 if(U16_IS_SURROGATE(c
)) {
374 /* make sure to not find half of a surrogate pair */
375 return u_strFindLast(s
, -1, &c
, 1);
377 const UChar
*result
=NULL
;
380 /* trivial search for a BMP code point */
386 return (UChar
*)result
;
393 U_CAPI UChar
* U_EXPORT2
394 u_strrchr32(const UChar
*s
, UChar32 c
) {
395 if((uint32_t)c
<=U_BMP_MAX
) {
396 /* find BMP code point */
397 return u_strrchr(s
, (UChar
)c
);
398 } else if((uint32_t)c
<=UCHAR_MAX_VALUE
) {
399 /* find supplementary code point as surrogate pair */
400 const UChar
*result
=NULL
;
401 UChar cs
, lead
=U16_LEAD(c
), trail
=U16_TRAIL(c
);
403 while((cs
=*s
++)!=0) {
404 if(cs
==lead
&& *s
==trail
) {
408 return (UChar
*)result
;
410 /* not a Unicode code point, not findable */
415 U_CAPI UChar
* U_EXPORT2
416 u_memrchr(const UChar
*s
, UChar c
, int32_t count
) {
418 return NULL
; /* no string */
419 } else if(U16_IS_SURROGATE(c
)) {
420 /* make sure to not find half of a surrogate pair */
421 return u_strFindLast(s
, count
, &c
, 1);
423 /* trivial search for a BMP code point */
424 const UChar
*limit
=s
+count
;
427 return (UChar
*)limit
;
434 U_CAPI UChar
* U_EXPORT2
435 u_memrchr32(const UChar
*s
, UChar32 c
, int32_t count
) {
436 if((uint32_t)c
<=U_BMP_MAX
) {
437 /* find BMP code point */
438 return u_memrchr(s
, (UChar
)c
, count
);
440 /* too short for a surrogate pair */
442 } else if((uint32_t)c
<=UCHAR_MAX_VALUE
) {
443 /* find supplementary code point as surrogate pair */
444 const UChar
*limit
=s
+count
-1;
445 UChar lead
=U16_LEAD(c
), trail
=U16_TRAIL(c
);
448 if(*limit
==trail
&& *(limit
-1)==lead
) {
449 return (UChar
*)(limit
-1);
454 /* not a Unicode code point, not findable */
459 /* Tokenization functions --------------------------------------------------- */
462 * Match each code point in a string against each code point in the matchSet.
463 * Return the index of the first string code point that
464 * is (polarity==TRUE) or is not (FALSE) contained in the matchSet.
465 * Return -(string length)-1 if there is no such code point.
468 _matchFromSet(const UChar
*string
, const UChar
*matchSet
, UBool polarity
) {
469 int32_t matchLen
, matchBMPLen
, strItr
, matchItr
;
470 UChar32 stringCh
, matchCh
;
473 /* first part of matchSet contains only BMP code points */
475 while((c
= matchSet
[matchBMPLen
]) != 0 && U16_IS_SINGLE(c
)) {
479 /* second part of matchSet contains BMP and supplementary code points */
480 matchLen
= matchBMPLen
;
481 while(matchSet
[matchLen
] != 0) {
485 for(strItr
= 0; (c
= string
[strItr
]) != 0;) {
487 if(U16_IS_SINGLE(c
)) {
489 for(matchItr
= 0; matchItr
< matchLen
; ++matchItr
) {
490 if(c
== matchSet
[matchItr
]) {
491 return strItr
- 1; /* one matches */
495 for(matchItr
= 0; matchItr
< matchLen
; ++matchItr
) {
496 if(c
== matchSet
[matchItr
]) {
500 return strItr
- 1; /* none matches */
504 * No need to check for string length before U16_IS_TRAIL
505 * because c2 could at worst be the terminating NUL.
507 if(U16_IS_SURROGATE_LEAD(c
) && U16_IS_TRAIL(c2
= string
[strItr
])) {
509 stringCh
= U16_GET_SUPPLEMENTARY(c
, c2
);
511 stringCh
= c
; /* unpaired trail surrogate */
515 for(matchItr
= matchBMPLen
; matchItr
< matchLen
;) {
516 U16_NEXT(matchSet
, matchItr
, matchLen
, matchCh
);
517 if(stringCh
== matchCh
) {
518 return strItr
- U16_LENGTH(stringCh
); /* one matches */
522 for(matchItr
= matchBMPLen
; matchItr
< matchLen
;) {
523 U16_NEXT(matchSet
, matchItr
, matchLen
, matchCh
);
524 if(stringCh
== matchCh
) {
528 return strItr
- U16_LENGTH(stringCh
); /* none matches */
532 /* wish C had continue with labels like Java... */;
535 /* Didn't find it. */
539 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
540 U_CAPI UChar
* U_EXPORT2
541 u_strpbrk(const UChar
*string
, const UChar
*matchSet
)
543 int32_t idx
= _matchFromSet(string
, matchSet
, TRUE
);
545 return (UChar
*)string
+ idx
;
551 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
552 U_CAPI
int32_t U_EXPORT2
553 u_strcspn(const UChar
*string
, const UChar
*matchSet
)
555 int32_t idx
= _matchFromSet(string
, matchSet
, TRUE
);
559 return -idx
- 1; /* == u_strlen(string) */
563 /* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
564 U_CAPI
int32_t U_EXPORT2
565 u_strspn(const UChar
*string
, const UChar
*matchSet
)
567 int32_t idx
= _matchFromSet(string
, matchSet
, FALSE
);
571 return -idx
- 1; /* == u_strlen(string) */
575 /* ----- Text manipulation functions --- */
577 U_CAPI UChar
* U_EXPORT2
578 u_strtok_r(UChar
*src
,
584 uint32_t nonDelimIdx
;
586 /* If saveState is NULL, the user messed up. */
589 *saveState
= src
; /* Set to "src" in case there are no delimiters */
591 else if (*saveState
) {
592 tokSource
= *saveState
;
595 /* src == NULL && *saveState == NULL */
596 /* This shouldn't happen. We already finished tokenizing. */
600 /* Skip initial delimiters */
601 nonDelimIdx
= u_strspn(tokSource
, delim
);
602 tokSource
= &tokSource
[nonDelimIdx
];
605 nextToken
= u_strpbrk(tokSource
, delim
);
606 if (nextToken
!= NULL
) {
609 *saveState
= nextToken
;
612 else if (*saveState
) {
613 /* Return the last token */
619 /* No tokens were found. Only delimiters were left. */
625 /* Miscellaneous functions -------------------------------------------------- */
627 U_CAPI UChar
* U_EXPORT2
631 UChar
*anchor
= dst
; /* save a pointer to start of dst */
633 while(*dst
!= 0) { /* To end of first string */
636 while((*(dst
++) = *(src
++)) != 0) { /* copy string 2 over */
642 U_CAPI UChar
* U_EXPORT2
643 u_strncat(UChar
*dst
,
648 UChar
*anchor
= dst
; /* save a pointer to start of dst */
650 while(*dst
!= 0) { /* To end of first string */
653 while((*dst
= *src
) != 0) { /* copy string 2 over */
668 /* ----- Text property functions --- */
670 U_CAPI
int32_t U_EXPORT2
671 u_strcmp(const UChar
*s1
,
679 if (c1
!= c2
|| c1
== 0) {
683 return (int32_t)c1
- (int32_t)c2
;
686 U_CFUNC
int32_t U_EXPORT2
687 uprv_strCompare(const UChar
*s1
, int32_t length1
,
688 const UChar
*s2
, int32_t length2
,
689 UBool strncmpStyle
, UBool codePointOrder
) {
690 const UChar
*start1
, *start2
, *limit1
, *limit2
;
693 /* setup for fix-up */
697 /* compare identical prefixes - they do not need to be fixed up */
698 if(length1
<0 && length2
<0) {
699 /* strcmp style, both NUL-terminated */
717 /* setup for fix-up */
719 } else if(strncmpStyle
) {
720 /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
725 limit1
=start1
+length1
;
728 /* both lengths are same, check only one limit */
745 /* setup for fix-up */
746 limit2
=start2
+length1
; /* use length1 here, too, to enforce assumption */
748 /* memcmp/UnicodeString style, both length-specified */
749 int32_t lengthResult
;
752 length1
=u_strlen(s1
);
755 length2
=u_strlen(s2
);
758 /* limit1=start1+min(length1, length2) */
759 if(length1
<length2
) {
761 limit1
=start1
+length1
;
762 } else if(length1
==length2
) {
764 limit1
=start1
+length1
;
765 } else /* length1>length2 */ {
767 limit1
=start1
+length2
;
775 /* check pseudo-limit */
789 /* setup for fix-up */
790 limit1
=start1
+length1
;
791 limit2
=start2
+length2
;
794 /* if both values are in or above the surrogate range, fix them up */
795 if(c1
>=0xd800 && c2
>=0xd800 && codePointOrder
) {
796 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
798 (c1
<=0xdbff && (s1
+1)!=limit1
&& U16_IS_TRAIL(*(s1
+1))) ||
799 (U16_IS_TRAIL(c1
) && start1
!=s1
&& U16_IS_LEAD(*(s1
-1)))
801 /* part of a surrogate pair, leave >=d800 */
803 /* BMP code point - may be surrogate code point - make <d800 */
808 (c2
<=0xdbff && (s2
+1)!=limit2
&& U16_IS_TRAIL(*(s2
+1))) ||
809 (U16_IS_TRAIL(c2
) && start2
!=s2
&& U16_IS_LEAD(*(s2
-1)))
811 /* part of a surrogate pair, leave >=d800 */
813 /* BMP code point - may be surrogate code point - make <d800 */
818 /* now c1 and c2 are in the requested (code unit or code point) order */
819 return (int32_t)c1
-(int32_t)c2
;
823 * Compare two strings as presented by UCharIterators.
824 * Use code unit or code point order.
825 * When the function returns, it is undefined where the iterators
828 U_CAPI
int32_t U_EXPORT2
829 u_strCompareIter(UCharIterator
*iter1
, UCharIterator
*iter2
, UBool codePointOrder
) {
832 /* argument checking */
833 if(iter1
==NULL
|| iter2
==NULL
) {
834 return 0; /* bad arguments */
837 return 0; /* identical iterators */
840 /* reset iterators to start? */
841 iter1
->move(iter1
, 0, UITER_START
);
842 iter2
->move(iter2
, 0, UITER_START
);
844 /* compare identical prefixes - they do not need to be fixed up */
846 c1
=iter1
->next(iter1
);
847 c2
=iter2
->next(iter2
);
856 /* if both values are in or above the surrogate range, fix them up */
857 if(c1
>=0xd800 && c2
>=0xd800 && codePointOrder
) {
858 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
860 (c1
<=0xdbff && U16_IS_TRAIL(iter1
->current(iter1
))) ||
861 (U16_IS_TRAIL(c1
) && (iter1
->previous(iter1
), U16_IS_LEAD(iter1
->previous(iter1
))))
863 /* part of a surrogate pair, leave >=d800 */
865 /* BMP code point - may be surrogate code point - make <d800 */
870 (c2
<=0xdbff && U16_IS_TRAIL(iter2
->current(iter2
))) ||
871 (U16_IS_TRAIL(c2
) && (iter2
->previous(iter2
), U16_IS_LEAD(iter2
->previous(iter2
))))
873 /* part of a surrogate pair, leave >=d800 */
875 /* BMP code point - may be surrogate code point - make <d800 */
880 /* now c1 and c2 are in the requested (code unit or code point) order */
881 return (int32_t)c1
-(int32_t)c2
;
886 * u_strCompareIter() does not leave the iterators _on_ the different units.
887 * This is possible but would cost a few extra indirect function calls to back
888 * up if the last unit (c1 or c2 respectively) was >=0.
890 * Consistently leaving them _behind_ the different units is not an option
891 * because the current "unit" is the end of the string if that is reached,
892 * and in such a case the iterator does not move.
893 * For example, when comparing "ab" with "abc", both iterators rest _on_ the end
894 * of their strings. Calling previous() on each does not move them to where
895 * the comparison fails.
897 * So the simplest semantics is to not define where the iterators end up.
899 * The following fragment is part of what would need to be done for backing up.
902 /* iff a surrogate is part of a surrogate pair, leave >=d800 */
904 if(!U16_IS_TRAIL(iter1
->current(iter1
))) {
905 /* lead surrogate code point - make <d800 */
908 } else if(c1
<=0xdfff) {
909 int32_t idx
=iter1
->getIndex(iter1
, UITER_CURRENT
);
910 iter1
->previous(iter1
); /* ==c1 */
911 if(!U16_IS_LEAD(iter1
->previous(iter1
))) {
912 /* trail surrogate code point - make <d800 */
915 /* go back to behind where the difference is */
916 iter1
->move(iter1
, idx
, UITER_ZERO
);
917 } else /* 0xe000<=c1<=0xffff */ {
918 /* BMP code point - make <d800 */
924 U_CAPI
int32_t U_EXPORT2
925 u_strCompare(const UChar
*s1
, int32_t length1
,
926 const UChar
*s2
, int32_t length2
,
927 UBool codePointOrder
) {
928 /* argument checking */
929 if(s1
==NULL
|| length1
<-1 || s2
==NULL
|| length2
<-1) {
932 return uprv_strCompare(s1
, length1
, s2
, length2
, FALSE
, codePointOrder
);
935 /* String compare in code point order - u_strcmp() compares in code unit order. */
936 U_CAPI
int32_t U_EXPORT2
937 u_strcmpCodePointOrder(const UChar
*s1
, const UChar
*s2
) {
938 return uprv_strCompare(s1
, -1, s2
, -1, FALSE
, TRUE
);
941 U_CAPI
int32_t U_EXPORT2
942 u_strncmp(const UChar
*s1
,
949 rc
= (int32_t)*s1
- (int32_t)*s2
;
950 if(rc
!= 0 || *s1
== 0 || --n
== 0) {
961 U_CAPI
int32_t U_EXPORT2
962 u_strncmpCodePointOrder(const UChar
*s1
, const UChar
*s2
, int32_t n
) {
963 return uprv_strCompare(s1
, n
, s2
, n
, TRUE
, TRUE
);
966 U_CAPI UChar
* U_EXPORT2
970 UChar
*anchor
= dst
; /* save a pointer to start of dst */
972 while((*(dst
++) = *(src
++)) != 0) { /* copy string 2 over */
978 U_CAPI UChar
* U_EXPORT2
979 u_strncpy(UChar
*dst
,
983 UChar
*anchor
= dst
; /* save a pointer to start of dst */
985 /* copy string 2 over */
986 while(n
> 0 && (*(dst
++) = *(src
++)) != 0) {
993 U_CAPI
int32_t U_EXPORT2
994 u_strlen(const UChar
*s
)
996 #if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
997 return (int32_t)uprv_wcslen((const wchar_t *)s
);
1007 U_CAPI
int32_t U_EXPORT2
1008 u_countChar32(const UChar
*s
, int32_t length
) {
1011 if(s
==NULL
|| length
<-1) {
1019 if(U16_IS_LEAD(*s
) && length
>=2 && U16_IS_TRAIL(*(s
+1))) {
1027 } else /* length==-1 */ {
1037 * sufficient to look ahead one because of UTF-16;
1038 * safe to look ahead one because at worst that would be the terminating NUL
1040 if(U16_IS_LEAD(c
) && U16_IS_TRAIL(*s
)) {
1048 U_CAPI UBool U_EXPORT2
1049 u_strHasMoreChar32Than(const UChar
*s
, int32_t length
, int32_t number
) {
1054 if(s
==NULL
|| length
<-1) {
1059 /* s is NUL-terminated */
1062 /* count code points until they exceed */
1070 if(U16_IS_LEAD(c
) && U16_IS_TRAIL(*s
)) {
1076 /* length>=0 known */
1078 int32_t maxSupplementary
;
1080 /* s contains at least (length+1)/2 code points: <=2 UChars per cp */
1081 if(((length
+1)/2)>number
) {
1085 /* check if s does not even contain enough UChars */
1086 maxSupplementary
=length
-number
;
1087 if(maxSupplementary
<=0) {
1090 /* there are maxSupplementary=length-number more UChars than asked-for code points */
1093 * count code points until they exceed and also check that there are
1094 * no more than maxSupplementary supplementary code points (UChar pairs)
1104 if(U16_IS_LEAD(*s
++) && s
!=limit
&& U16_IS_TRAIL(*s
)) {
1106 if(--maxSupplementary
<=0) {
1107 /* too many pairs - too few code points */
1116 /* ----- String validation functions --- */
1119 * Check whether the string is well-formed according to various criteria:
1120 * - No code points that are defined as non-characters (e.g. 0xFFFF) or are undefined in
1121 * the version of Unicode currently supported.
1122 * - No isolated surrogate code points.
1123 * - No overly-long sequences of non-starter combining marks, i.e. more than 30 characters
1124 * in a row with non-zero combining class (which may have category Mn or Mc); this
1125 * violates Stream-Safe Text Format per UAX #15. This test does not ensure that the
1126 * string satisfies Stream-Safe Text Format (because it does not convert to NFKC first),
1127 * but any string that fails this test is certainly not Stream-Safe.
1128 * - No emoji variation selectors applied to non-emoji code points. This function may
1129 * also check for other non-standard variation sequences.
1130 * - No tag sequences that are ill-formed per definition ED-14a in UTS #51 (e.g. tag
1131 * sequences must have an emoji base and a terminator).
1133 * @internal Apple only
1135 enum { kBidiMaxDepth
= 125 };
1137 static UBool
isWellFormed(UChar32 c
, UChar32 cLast
, int32_t *nonStarterCountP
, UBool
*inTagSeqP
,
1138 uint8_t* dirStatus
, int32_t* dirStatusIndexP
, int32_t* validIsolateCountP
) {
1140 // can only have tag_spec or tag_term
1141 if (c
== 0xE007F) { // tag_term
1143 } else if (c
< 0xE0020 || c
> 0xE007E) {
1146 } else if (c
< 0x0300) {
1147 // Everything in this range (includes ASCII) is a valid character with combining class 0
1148 *nonStarterCountP
= 0;
1149 if (c
== 0x000A || c
== 0x000D || c
== 0x0085 || (c
>= 0x001C && c
<= 0x001E)) {
1150 // paragraph sep, reset bidi
1151 *dirStatusIndexP
= 0;
1152 *validIsolateCountP
= 0;
1154 } else if ((c
>= 0x2029 && c
<= 0x202E) || (c
>= 0x2066 && c
<= 0x2069)) {
1155 // para sep & bidi controls, all have combining class 0. The bidi control actions here
1156 // are from [https://www.unicode.org/reports/tr9/#Explicit_Levels_and_Directions]
1157 *nonStarterCountP
= 0;
1158 if (c
== 0x2029) { // paragraph sep, reset bidi
1159 *dirStatusIndexP
= 0;
1160 *validIsolateCountP
= 0;
1161 } else if (c
== 0x2069) { // PDI
1162 if (*validIsolateCountP
> 0) {
1163 while (*dirStatusIndexP
> 0 && (dirStatus
[(*dirStatusIndexP
)--] & 0x80) == 0);
1164 (*validIsolateCountP
)--;
1166 } else if (c
== 0x202C) { // PDF
1167 if (*dirStatusIndexP
> 0 && (dirStatus
[*dirStatusIndexP
] & 0x80) == 0) {
1168 (*dirStatusIndexP
)--;
1171 // embedding/override initiator. Need to increment the level by at least 1, and possibly 2 if the
1172 // embedding/override direction matches the current direction (i.e. R and current odd, or L and current even).
1173 // Since we increment first, the test for odd/even is flipped. For FSI, we do not actually determine
1174 // whether it should be treated as RLI or LRI, so we just do the minimum increment.
1175 uint8_t newEntry
= (dirStatus
[*dirStatusIndexP
] & 0x7F) + 1; // min increment, flips odd/even status compared to current
1176 if ( ((c
== 0x202B || c
== 0x202E || c
== 0x2067) && (newEntry
& 0x01) == 0) || // RLE/RLO/RLI and current was odd
1177 ((c
== 0x202A || c
== 0x202D || c
== 0x2066) && (newEntry
& 0x01) != 0) ) { // LRE/LRO/LRI and current was even
1180 if (newEntry
> kBidiMaxDepth
|| *dirStatusIndexP
> kBidiMaxDepth
) {
1181 return FALSE
; // Checking for this is the whole point.
1183 if (c
>= 0x2066 && c
<= 0x2068) { // LRI/RLI/FSI
1184 newEntry
|= 0x80; // set directional isolate status
1185 (*validIsolateCountP
)++;
1187 dirStatus
[++(*dirStatusIndexP
)] = newEntry
;
1189 } else if (c
== 0xFE0F) { // emoji variation selector
1190 if (!u_isEmoji(cLast
)) { // previous char must be emoji
1193 // previous character would have set *nonStarterCountP = 0;
1194 } else if (c
>= 0xE0020 && c
<= 0xE007E) { // tag_spec
1195 if (!u_isEmoji(cLast
) && cLast
!= 0xFE0F) { // previous char must be emoji or FE0F
1199 // previous character would have set *nonStarterCountP = 0;
1200 } else if (c
== 0xE007F) { // tag_term
1203 // we have checked specific ranges/chars, now check general info for others
1204 int8_t genCat
= u_charType(c
);
1205 if (genCat
== U_UNASSIGNED
|| genCat
== U_SURROGATE
) {
1208 if ((genCat
== U_NON_SPACING_MARK
|| genCat
== U_COMBINING_SPACING_MARK
) && u_getCombiningClass(c
) != 0) {
1210 if (++(*nonStarterCountP
) > 30) {
1214 *nonStarterCountP
= 0;
1220 U_CAPI UBool U_EXPORT2
1221 u_strIsWellFormed(const UChar
*s
, int32_t length
) {
1222 if (s
==NULL
|| length
<-1) {
1225 UChar32 c
, c2
, cLast
= 0;
1226 int32_t nonStarterCount
= 0;
1227 UBool inTagSeq
= FALSE
;
1228 uint8_t dirStatus
[kBidiMaxDepth
+ 3]; // low 7 bits is embed level, high bit is direction override status
1229 int32_t dirStatusIndex
= 0;
1230 int32_t validIsolateCount
= 0;
1231 dirStatus
[0] = 0; // assume initial paragraph direction L (most conservative)
1234 while ((c
= *s
++) != 0) {
1235 // get next UChar32 c
1236 if (U16_IS_LEAD(c
)) {
1237 if (U16_IS_TRAIL(c2
= *s
)) {
1239 c
= U16_GET_SUPPLEMENTARY(c
,c2
);
1243 if (!isWellFormed(c
, cLast
, &nonStarterCount
, &inTagSeq
, dirStatus
, &dirStatusIndex
, &validIsolateCount
)) {
1246 // setup next iteration
1251 const UChar
*sLimit
= s
+ length
;
1252 while (s
< sLimit
) {
1253 // get next UChar32 c
1255 if (U16_IS_LEAD(c
)) {
1256 if (s
< sLimit
&& U16_IS_TRAIL(c2
= *s
)) {
1258 c
= U16_GET_SUPPLEMENTARY(c
,c2
);
1262 if (!isWellFormed(c
, cLast
, &nonStarterCount
, &inTagSeq
, dirStatus
, &dirStatusIndex
, &validIsolateCount
)) {
1265 // setup next iteration
1272 /* ----- U_mem functions --- */
1274 U_CAPI UChar
* U_EXPORT2
1275 u_memcpy(UChar
*dest
, const UChar
*src
, int32_t count
) {
1277 uprv_memcpy(dest
, src
, (size_t)count
*U_SIZEOF_UCHAR
);
1282 U_CAPI UChar
* U_EXPORT2
1283 u_memmove(UChar
*dest
, const UChar
*src
, int32_t count
) {
1285 uprv_memmove(dest
, src
, (size_t)count
*U_SIZEOF_UCHAR
);
1290 U_CAPI UChar
* U_EXPORT2
1291 u_memset(UChar
*dest
, UChar c
, int32_t count
) {
1294 UChar
*limit
= dest
+ count
;
1296 while (ptr
< limit
) {
1303 U_CAPI
int32_t U_EXPORT2
1304 u_memcmp(const UChar
*buf1
, const UChar
*buf2
, int32_t count
) {
1306 const UChar
*limit
= buf1
+ count
;
1309 while (buf1
< limit
) {
1310 result
= (int32_t)(uint16_t)*buf1
- (int32_t)(uint16_t)*buf2
;
1321 U_CAPI
int32_t U_EXPORT2
1322 u_memcmpCodePointOrder(const UChar
*s1
, const UChar
*s2
, int32_t count
) {
1323 return uprv_strCompare(s1
, count
, s2
, count
, FALSE
, TRUE
);
1326 /* u_unescape & support fns ------------------------------------------------- */
1328 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
1329 static const UChar UNESCAPE_MAP
[] = {
1343 enum { UNESCAPE_MAP_LENGTH
= UPRV_LENGTHOF(UNESCAPE_MAP
) };
1345 /* Convert one octal digit to a numeric value 0..7, or -1 on failure */
1346 static int8_t _digit8(UChar c
) {
1347 if (c
>= 0x0030 && c
<= 0x0037) {
1348 return (int8_t)(c
- 0x0030);
1353 /* Convert one hex digit to a numeric value 0..F, or -1 on failure */
1354 static int8_t _digit16(UChar c
) {
1355 if (c
>= 0x0030 && c
<= 0x0039) {
1356 return (int8_t)(c
- 0x0030);
1358 if (c
>= 0x0041 && c
<= 0x0046) {
1359 return (int8_t)(c
- (0x0041 - 10));
1361 if (c
>= 0x0061 && c
<= 0x0066) {
1362 return (int8_t)(c
- (0x0061 - 10));
1367 /* Parse a single escape sequence. Although this method deals in
1368 * UChars, it does not use C++ or UnicodeString. This allows it to
1369 * be used from C contexts. */
1370 U_CAPI UChar32 U_EXPORT2
1371 u_unescapeAt(UNESCAPE_CHAR_AT charAt
,
1376 int32_t start
= *offset
;
1382 int8_t bitsPerDigit
= 4;
1385 UBool braces
= FALSE
;
1387 /* Check that offset is in range */
1388 if (*offset
< 0 || *offset
>= length
) {
1392 /* Fetch first UChar after '\\' */
1393 c
= charAt((*offset
)++, context
);
1395 /* Convert hexadecimal and octal escapes */
1397 case 0x0075 /*'u'*/:
1398 minDig
= maxDig
= 4;
1400 case 0x0055 /*'U'*/:
1401 minDig
= maxDig
= 8;
1403 case 0x0078 /*'x'*/:
1405 if (*offset
< length
&& charAt(*offset
, context
) == 0x7B /*{*/) {
1418 n
= 1; /* Already have first octal digit */
1425 while (*offset
< length
&& n
< maxDig
) {
1426 c
= charAt(*offset
, context
);
1427 dig
= (int8_t)((bitsPerDigit
== 3) ? _digit8(c
) : _digit16(c
));
1431 result
= (result
<< bitsPerDigit
) | dig
;
1439 if (c
!= 0x7D /*}*/) {
1444 if (result
< 0 || result
>= 0x110000) {
1447 /* If an escape sequence specifies a lead surrogate, see if
1448 * there is a trail surrogate after it, either as an escape or
1449 * as a literal. If so, join them up into a supplementary.
1451 if (*offset
< length
&& U16_IS_LEAD(result
)) {
1452 int32_t ahead
= *offset
+ 1;
1453 c
= charAt(*offset
, context
);
1454 if (c
== 0x5C /*'\\'*/ && ahead
< length
) {
1455 c
= (UChar
) u_unescapeAt(charAt
, &ahead
, length
, context
);
1457 if (U16_IS_TRAIL(c
)) {
1459 result
= U16_GET_SUPPLEMENTARY(result
, c
);
1465 /* Convert C-style escapes in table */
1466 for (i
=0; i
<UNESCAPE_MAP_LENGTH
; i
+=2) {
1467 if (c
== UNESCAPE_MAP
[i
]) {
1468 return UNESCAPE_MAP
[i
+1];
1469 } else if (c
< UNESCAPE_MAP
[i
]) {
1474 /* Map \cX to control-X: X & 0x1F */
1475 if (c
== 0x0063 /*'c'*/ && *offset
< length
) {
1476 c
= charAt((*offset
)++, context
);
1477 if (U16_IS_LEAD(c
) && *offset
< length
) {
1478 UChar c2
= charAt(*offset
, context
);
1479 if (U16_IS_TRAIL(c2
)) {
1481 c
= (UChar
) U16_GET_SUPPLEMENTARY(c
, c2
); /* [sic] */
1487 /* If no special forms are recognized, then consider
1488 * the backslash to generically escape the next character.
1489 * Deal with surrogate pairs. */
1490 if (U16_IS_LEAD(c
) && *offset
< length
) {
1491 UChar c2
= charAt(*offset
, context
);
1492 if (U16_IS_TRAIL(c2
)) {
1494 return U16_GET_SUPPLEMENTARY(c
, c2
);
1500 /* Invalid escape sequence */
1501 *offset
= start
; /* Reset to initial value */
1502 return (UChar32
)0xFFFFFFFF;
1505 /* u_unescapeAt() callback to return a UChar from a char* */
1506 static UChar U_CALLCONV
1507 _charPtr_charAt(int32_t offset
, void *context
) {
1509 /* It would be more efficient to access the invariant tables
1510 * directly but there is no API for that. */
1511 u_charsToUChars(((char*) context
) + offset
, &c16
, 1);
1515 /* Append an escape-free segment of the text; used by u_unescape() */
1516 static void _appendUChars(UChar
*dest
, int32_t destCapacity
,
1517 const char *src
, int32_t srcLen
) {
1518 if (destCapacity
< 0) {
1521 if (srcLen
> destCapacity
) {
1522 srcLen
= destCapacity
;
1524 u_charsToUChars(src
, dest
, srcLen
);
1527 /* Do an invariant conversion of char* -> UChar*, with escape parsing */
1528 U_CAPI
int32_t U_EXPORT2
1529 u_unescape(const char *src
, UChar
*dest
, int32_t destCapacity
) {
1530 const char *segment
= src
;
1534 while ((c
=*src
) != 0) {
1535 /* '\\' intentionally written as compiler-specific
1536 * character constant to correspond to compiler-specific
1537 * char* constants. */
1539 int32_t lenParsed
= 0;
1541 if (src
!= segment
) {
1543 _appendUChars(dest
+ i
, destCapacity
- i
,
1544 segment
, (int32_t)(src
- segment
));
1546 i
+= (int32_t)(src
- segment
);
1548 ++src
; /* advance past '\\' */
1549 c32
= (UChar32
)u_unescapeAt(_charPtr_charAt
, &lenParsed
, (int32_t)uprv_strlen(src
), (void*)src
);
1550 if (lenParsed
== 0) {
1553 src
+= lenParsed
; /* advance past escape seq. */
1554 if (dest
!= NULL
&& U16_LENGTH(c32
) <= (destCapacity
- i
)) {
1555 U16_APPEND_UNSAFE(dest
, i
, c32
);
1557 i
+= U16_LENGTH(c32
);
1564 if (src
!= segment
) {
1566 _appendUChars(dest
+ i
, destCapacity
- i
,
1567 segment
, (int32_t)(src
- segment
));
1569 i
+= (int32_t)(src
- segment
);
1571 if (dest
!= NULL
&& i
< destCapacity
) {
1577 if (dest
!= NULL
&& destCapacity
> 0) {
1583 /* NUL-termination of strings ----------------------------------------------- */
1586 * NUL-terminate a string no matter what its type.
1587 * Set warning and error codes accordingly.
1589 #define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) \
1590 if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) { \
1591 /* not a public function, so no complete argument checking */ \
1594 /* assume that the caller handles this */ \
1595 } else if(length<destCapacity) { \
1596 /* NUL-terminate the string, the NUL fits */ \
1598 /* unset the not-terminated warning but leave all others */ \
1599 if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) { \
1600 *pErrorCode=U_ZERO_ERROR; \
1602 } else if(length==destCapacity) { \
1603 /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
1604 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING; \
1605 } else /* length>destCapacity */ { \
1606 /* even the string itself did not fit - set an error code */ \
1607 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; \
1611 U_CAPI
int32_t U_EXPORT2
1612 u_terminateUChars(UChar
*dest
, int32_t destCapacity
, int32_t length
, UErrorCode
*pErrorCode
) {
1613 __TERMINATE_STRING(dest
, destCapacity
, length
, pErrorCode
);
1617 U_CAPI
int32_t U_EXPORT2
1618 u_terminateChars(char *dest
, int32_t destCapacity
, int32_t length
, UErrorCode
*pErrorCode
) {
1619 __TERMINATE_STRING(dest
, destCapacity
, length
, pErrorCode
);
1623 U_CAPI
int32_t U_EXPORT2
1624 u_terminateUChar32s(UChar32
*dest
, int32_t destCapacity
, int32_t length
, UErrorCode
*pErrorCode
) {
1625 __TERMINATE_STRING(dest
, destCapacity
, length
, pErrorCode
);
1629 U_CAPI
int32_t U_EXPORT2
1630 u_terminateWChars(wchar_t *dest
, int32_t destCapacity
, int32_t length
, UErrorCode
*pErrorCode
) {
1631 __TERMINATE_STRING(dest
, destCapacity
, length
, pErrorCode
);
1635 // Compute the hash code for a string -------------------------------------- ***
1637 // Moved here from uhash.c so that UnicodeString::hashCode() does not depend
1638 // on UHashtable code.
1641 Compute the hash by iterating sparsely over about 32 (up to 63)
1642 characters spaced evenly through the string. For each character,
1643 multiply the previous hash value by a prime number and add the new
1644 character in, like a linear congruential random number generator,
1645 producing a pseudorandom deterministic value well distributed over
1646 the output range. [LIU]
1649 #define STRING_HASH(TYPE, STR, STRLEN, DEREF) \
1650 uint32_t hash = 0; \
1651 const TYPE *p = (const TYPE*) STR; \
1653 int32_t len = (int32_t)(STRLEN); \
1654 int32_t inc = ((len - 32) / 32) + 1; \
1655 const TYPE *limit = p + len; \
1657 hash = (hash * 37) + DEREF; \
1661 return static_cast<int32_t>(hash)
1663 /* Used by UnicodeString to compute its hashcode - Not public API. */
1664 U_CAPI
int32_t U_EXPORT2
1665 ustr_hashUCharsN(const UChar
*str
, int32_t length
) {
1666 STRING_HASH(UChar
, str
, length
, *p
);
1669 U_CAPI
int32_t U_EXPORT2
1670 ustr_hashCharsN(const char *str
, int32_t length
) {
1671 STRING_HASH(uint8_t, str
, length
, *p
);
1674 U_CAPI
int32_t U_EXPORT2
1675 ustr_hashICharsN(const char *str
, int32_t length
) {
1676 STRING_HASH(char, str
, length
, (uint8_t)uprv_tolower(*p
));