]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/ustring.cpp
2 ******************************************************************************
4 * Copyright (C) 1998-2016, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
11 * Modification History:
13 * Date Name Description
14 * 12/07/98 bertrand Creation.
15 ******************************************************************************
18 #include "unicode/utypes.h"
19 #include "unicode/putil.h"
20 #include "unicode/ustring.h"
21 #include "unicode/utf16.h"
27 /* ANSI string.h - style functions ------------------------------------------ */
29 /* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */
30 #define U_BMP_MAX 0xffff
32 /* Forward binary string search functions ----------------------------------- */
35 * Test if a substring match inside a string is at code point boundaries.
36 * All pointers refer to the same buffer.
37 * The limit pointer may be NULL, all others must be real pointers.
40 isMatchAtCPBoundary(const UChar
*start
, const UChar
*match
, const UChar
*matchLimit
, const UChar
*limit
) {
41 if(U16_IS_TRAIL(*match
) && start
!=match
&& U16_IS_LEAD(*(match
-1))) {
42 /* the leading edge of the match is in the middle of a surrogate pair */
45 if(U16_IS_LEAD(*(matchLimit
-1)) && match
!=limit
&& U16_IS_TRAIL(*matchLimit
)) {
46 /* the trailing edge of the match is in the middle of a surrogate pair */
52 U_CAPI UChar
* U_EXPORT2
53 u_strFindFirst(const UChar
*s
, int32_t length
,
54 const UChar
*sub
, int32_t subLength
) {
55 const UChar
*start
, *p
, *q
, *subLimit
;
58 if(sub
==NULL
|| subLength
<-1) {
61 if(s
==NULL
|| length
<-1) {
67 if(length
<0 && subLength
<0) {
68 /* both strings are NUL-terminated */
72 if(*sub
==0 && !U16_IS_SURROGATE(cs
)) {
73 /* the substring consists of a single, non-surrogate BMP code point */
74 return u_strchr(s
, cs
);
79 /* found first substring UChar, compare rest */
84 if(isMatchAtCPBoundary(start
, s
-1, p
, NULL
)) {
85 return (UChar
*)(s
-1); /* well-formed match */
87 break; /* no match because surrogate pair is split */
91 return NULL
; /* no match, and none possible after s */
107 subLength
=u_strlen(sub
);
113 /* get sub[0] to search for it fast */
116 subLimit
=sub
+subLength
;
118 if(subLength
==0 && !U16_IS_SURROGATE(cs
)) {
119 /* the substring consists of a single, non-surrogate BMP code point */
120 return length
<0 ? u_strchr(s
, cs
) : u_memchr(s
, cs
, length
);
124 /* s is NUL-terminated */
127 /* found first substring UChar, compare rest */
132 if(isMatchAtCPBoundary(start
, s
-1, p
, NULL
)) {
133 return (UChar
*)(s
-1); /* well-formed match */
135 break; /* no match because surrogate pair is split */
139 return NULL
; /* no match, and none possible after s */
142 break; /* no match */
150 const UChar
*limit
, *preLimit
;
152 /* subLength was decremented above */
153 if(length
<=subLength
) {
154 return NULL
; /* s is shorter than sub */
159 /* the substring must start before preLimit */
160 preLimit
=limit
-subLength
;
165 /* found first substring UChar, compare rest */
170 if(isMatchAtCPBoundary(start
, s
-1, p
, limit
)) {
171 return (UChar
*)(s
-1); /* well-formed match */
173 break; /* no match because surrogate pair is split */
177 break; /* no match */
190 U_CAPI UChar
* U_EXPORT2
191 u_strstr(const UChar
*s
, const UChar
*substring
) {
192 return u_strFindFirst(s
, -1, substring
, -1);
195 U_CAPI UChar
* U_EXPORT2
196 u_strchr(const UChar
*s
, UChar c
) {
197 if(U16_IS_SURROGATE(c
)) {
198 /* make sure to not find half of a surrogate pair */
199 return u_strFindFirst(s
, -1, &c
, 1);
203 /* trivial search for a BMP code point */
216 U_CAPI UChar
* U_EXPORT2
217 u_strchr32(const UChar
*s
, UChar32 c
) {
218 if((uint32_t)c
<=U_BMP_MAX
) {
219 /* find BMP code point */
220 return u_strchr(s
, (UChar
)c
);
221 } else if((uint32_t)c
<=UCHAR_MAX_VALUE
) {
222 /* find supplementary code point as surrogate pair */
223 UChar cs
, lead
=U16_LEAD(c
), trail
=U16_TRAIL(c
);
225 while((cs
=*s
++)!=0) {
226 if(cs
==lead
&& *s
==trail
) {
227 return (UChar
*)(s
-1);
232 /* not a Unicode code point, not findable */
237 U_CAPI UChar
* U_EXPORT2
238 u_memchr(const UChar
*s
, UChar c
, int32_t count
) {
240 return NULL
; /* no string */
241 } else if(U16_IS_SURROGATE(c
)) {
242 /* make sure to not find half of a surrogate pair */
243 return u_strFindFirst(s
, count
, &c
, 1);
245 /* trivial search for a BMP code point */
246 const UChar
*limit
=s
+count
;
256 U_CAPI UChar
* U_EXPORT2
257 u_memchr32(const UChar
*s
, UChar32 c
, int32_t count
) {
258 if((uint32_t)c
<=U_BMP_MAX
) {
259 /* find BMP code point */
260 return u_memchr(s
, (UChar
)c
, count
);
262 /* too short for a surrogate pair */
264 } else if((uint32_t)c
<=UCHAR_MAX_VALUE
) {
265 /* find supplementary code point as surrogate pair */
266 const UChar
*limit
=s
+count
-1; /* -1 so that we do not need a separate check for the trail unit */
267 UChar lead
=U16_LEAD(c
), trail
=U16_TRAIL(c
);
270 if(*s
==lead
&& *(s
+1)==trail
) {
276 /* not a Unicode code point, not findable */
281 /* Backward binary string search functions ---------------------------------- */
283 U_CAPI UChar
* U_EXPORT2
284 u_strFindLast(const UChar
*s
, int32_t length
,
285 const UChar
*sub
, int32_t subLength
) {
286 const UChar
*start
, *limit
, *p
, *q
, *subLimit
;
289 if(sub
==NULL
|| subLength
<-1) {
292 if(s
==NULL
|| length
<-1) {
297 * This implementation is more lazy than the one for u_strFindFirst():
298 * There is no special search code for NUL-terminated strings.
299 * It does not seem to be worth it for searching substrings to
300 * search forward and find all matches like in u_strrchr() and similar.
301 * Therefore, we simply get both string lengths and search backward.
307 subLength
=u_strlen(sub
);
313 /* get sub[subLength-1] to search for it fast */
314 subLimit
=sub
+subLength
;
318 if(subLength
==0 && !U16_IS_SURROGATE(cs
)) {
319 /* the substring consists of a single, non-surrogate BMP code point */
320 return length
<0 ? u_strrchr(s
, cs
) : u_memrchr(s
, cs
, length
);
327 /* subLength was decremented above */
328 if(length
<=subLength
) {
329 return NULL
; /* s is shorter than sub */
335 /* the substring must start no later than s+subLength */
341 /* found last substring UChar, compare rest */
346 if(isMatchAtCPBoundary(start
, p
, limit
+1, start
+length
)) {
347 return (UChar
*)p
; /* well-formed match */
349 break; /* no match because surrogate pair is split */
353 break; /* no match */
363 U_CAPI UChar
* U_EXPORT2
364 u_strrstr(const UChar
*s
, const UChar
*substring
) {
365 return u_strFindLast(s
, -1, substring
, -1);
368 U_CAPI UChar
* U_EXPORT2
369 u_strrchr(const UChar
*s
, UChar c
) {
370 if(U16_IS_SURROGATE(c
)) {
371 /* make sure to not find half of a surrogate pair */
372 return u_strFindLast(s
, -1, &c
, 1);
374 const UChar
*result
=NULL
;
377 /* trivial search for a BMP code point */
383 return (UChar
*)result
;
390 U_CAPI UChar
* U_EXPORT2
391 u_strrchr32(const UChar
*s
, UChar32 c
) {
392 if((uint32_t)c
<=U_BMP_MAX
) {
393 /* find BMP code point */
394 return u_strrchr(s
, (UChar
)c
);
395 } else if((uint32_t)c
<=UCHAR_MAX_VALUE
) {
396 /* find supplementary code point as surrogate pair */
397 const UChar
*result
=NULL
;
398 UChar cs
, lead
=U16_LEAD(c
), trail
=U16_TRAIL(c
);
400 while((cs
=*s
++)!=0) {
401 if(cs
==lead
&& *s
==trail
) {
405 return (UChar
*)result
;
407 /* not a Unicode code point, not findable */
412 U_CAPI UChar
* U_EXPORT2
413 u_memrchr(const UChar
*s
, UChar c
, int32_t count
) {
415 return NULL
; /* no string */
416 } else if(U16_IS_SURROGATE(c
)) {
417 /* make sure to not find half of a surrogate pair */
418 return u_strFindLast(s
, count
, &c
, 1);
420 /* trivial search for a BMP code point */
421 const UChar
*limit
=s
+count
;
424 return (UChar
*)limit
;
431 U_CAPI UChar
* U_EXPORT2
432 u_memrchr32(const UChar
*s
, UChar32 c
, int32_t count
) {
433 if((uint32_t)c
<=U_BMP_MAX
) {
434 /* find BMP code point */
435 return u_memrchr(s
, (UChar
)c
, count
);
437 /* too short for a surrogate pair */
439 } else if((uint32_t)c
<=UCHAR_MAX_VALUE
) {
440 /* find supplementary code point as surrogate pair */
441 const UChar
*limit
=s
+count
-1;
442 UChar lead
=U16_LEAD(c
), trail
=U16_TRAIL(c
);
445 if(*limit
==trail
&& *(limit
-1)==lead
) {
446 return (UChar
*)(limit
-1);
451 /* not a Unicode code point, not findable */
456 /* Tokenization functions --------------------------------------------------- */
459 * Match each code point in a string against each code point in the matchSet.
460 * Return the index of the first string code point that
461 * is (polarity==TRUE) or is not (FALSE) contained in the matchSet.
462 * Return -(string length)-1 if there is no such code point.
465 _matchFromSet(const UChar
*string
, const UChar
*matchSet
, UBool polarity
) {
466 int32_t matchLen
, matchBMPLen
, strItr
, matchItr
;
467 UChar32 stringCh
, matchCh
;
470 /* first part of matchSet contains only BMP code points */
472 while((c
= matchSet
[matchBMPLen
]) != 0 && U16_IS_SINGLE(c
)) {
476 /* second part of matchSet contains BMP and supplementary code points */
477 matchLen
= matchBMPLen
;
478 while(matchSet
[matchLen
] != 0) {
482 for(strItr
= 0; (c
= string
[strItr
]) != 0;) {
484 if(U16_IS_SINGLE(c
)) {
486 for(matchItr
= 0; matchItr
< matchLen
; ++matchItr
) {
487 if(c
== matchSet
[matchItr
]) {
488 return strItr
- 1; /* one matches */
492 for(matchItr
= 0; matchItr
< matchLen
; ++matchItr
) {
493 if(c
== matchSet
[matchItr
]) {
497 return strItr
- 1; /* none matches */
501 * No need to check for string length before U16_IS_TRAIL
502 * because c2 could at worst be the terminating NUL.
504 if(U16_IS_SURROGATE_LEAD(c
) && U16_IS_TRAIL(c2
= string
[strItr
])) {
506 stringCh
= U16_GET_SUPPLEMENTARY(c
, c2
);
508 stringCh
= c
; /* unpaired trail surrogate */
512 for(matchItr
= matchBMPLen
; matchItr
< matchLen
;) {
513 U16_NEXT(matchSet
, matchItr
, matchLen
, matchCh
);
514 if(stringCh
== matchCh
) {
515 return strItr
- U16_LENGTH(stringCh
); /* one matches */
519 for(matchItr
= matchBMPLen
; matchItr
< matchLen
;) {
520 U16_NEXT(matchSet
, matchItr
, matchLen
, matchCh
);
521 if(stringCh
== matchCh
) {
525 return strItr
- U16_LENGTH(stringCh
); /* none matches */
529 /* wish C had continue with labels like Java... */;
532 /* Didn't find it. */
536 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
537 U_CAPI UChar
* U_EXPORT2
538 u_strpbrk(const UChar
*string
, const UChar
*matchSet
)
540 int32_t idx
= _matchFromSet(string
, matchSet
, TRUE
);
542 return (UChar
*)string
+ idx
;
548 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
549 U_CAPI
int32_t U_EXPORT2
550 u_strcspn(const UChar
*string
, const UChar
*matchSet
)
552 int32_t idx
= _matchFromSet(string
, matchSet
, TRUE
);
556 return -idx
- 1; /* == u_strlen(string) */
560 /* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
561 U_CAPI
int32_t U_EXPORT2
562 u_strspn(const UChar
*string
, const UChar
*matchSet
)
564 int32_t idx
= _matchFromSet(string
, matchSet
, FALSE
);
568 return -idx
- 1; /* == u_strlen(string) */
572 /* ----- Text manipulation functions --- */
574 U_CAPI UChar
* U_EXPORT2
575 u_strtok_r(UChar
*src
,
581 uint32_t nonDelimIdx
;
583 /* If saveState is NULL, the user messed up. */
586 *saveState
= src
; /* Set to "src" in case there are no delimiters */
588 else if (*saveState
) {
589 tokSource
= *saveState
;
592 /* src == NULL && *saveState == NULL */
593 /* This shouldn't happen. We already finished tokenizing. */
597 /* Skip initial delimiters */
598 nonDelimIdx
= u_strspn(tokSource
, delim
);
599 tokSource
= &tokSource
[nonDelimIdx
];
602 nextToken
= u_strpbrk(tokSource
, delim
);
603 if (nextToken
!= NULL
) {
606 *saveState
= nextToken
;
609 else if (*saveState
) {
610 /* Return the last token */
616 /* No tokens were found. Only delimiters were left. */
622 /* Miscellaneous functions -------------------------------------------------- */
624 U_CAPI UChar
* U_EXPORT2
628 UChar
*anchor
= dst
; /* save a pointer to start of dst */
630 while(*dst
!= 0) { /* To end of first string */
633 while((*(dst
++) = *(src
++)) != 0) { /* copy string 2 over */
639 U_CAPI UChar
* U_EXPORT2
640 u_strncat(UChar
*dst
,
645 UChar
*anchor
= dst
; /* save a pointer to start of dst */
647 while(*dst
!= 0) { /* To end of first string */
650 while((*dst
= *src
) != 0) { /* copy string 2 over */
665 /* ----- Text property functions --- */
667 U_CAPI
int32_t U_EXPORT2
668 u_strcmp(const UChar
*s1
,
676 if (c1
!= c2
|| c1
== 0) {
680 return (int32_t)c1
- (int32_t)c2
;
683 U_CFUNC
int32_t U_EXPORT2
684 uprv_strCompare(const UChar
*s1
, int32_t length1
,
685 const UChar
*s2
, int32_t length2
,
686 UBool strncmpStyle
, UBool codePointOrder
) {
687 const UChar
*start1
, *start2
, *limit1
, *limit2
;
690 /* setup for fix-up */
694 /* compare identical prefixes - they do not need to be fixed up */
695 if(length1
<0 && length2
<0) {
696 /* strcmp style, both NUL-terminated */
714 /* setup for fix-up */
716 } else if(strncmpStyle
) {
717 /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
722 limit1
=start1
+length1
;
725 /* both lengths are same, check only one limit */
742 /* setup for fix-up */
743 limit2
=start2
+length1
; /* use length1 here, too, to enforce assumption */
745 /* memcmp/UnicodeString style, both length-specified */
746 int32_t lengthResult
;
749 length1
=u_strlen(s1
);
752 length2
=u_strlen(s2
);
755 /* limit1=start1+min(length1, length2) */
756 if(length1
<length2
) {
758 limit1
=start1
+length1
;
759 } else if(length1
==length2
) {
761 limit1
=start1
+length1
;
762 } else /* length1>length2 */ {
764 limit1
=start1
+length2
;
772 /* check pseudo-limit */
786 /* setup for fix-up */
787 limit1
=start1
+length1
;
788 limit2
=start2
+length2
;
791 /* if both values are in or above the surrogate range, fix them up */
792 if(c1
>=0xd800 && c2
>=0xd800 && codePointOrder
) {
793 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
795 (c1
<=0xdbff && (s1
+1)!=limit1
&& U16_IS_TRAIL(*(s1
+1))) ||
796 (U16_IS_TRAIL(c1
) && start1
!=s1
&& U16_IS_LEAD(*(s1
-1)))
798 /* part of a surrogate pair, leave >=d800 */
800 /* BMP code point - may be surrogate code point - make <d800 */
805 (c2
<=0xdbff && (s2
+1)!=limit2
&& U16_IS_TRAIL(*(s2
+1))) ||
806 (U16_IS_TRAIL(c2
) && start2
!=s2
&& U16_IS_LEAD(*(s2
-1)))
808 /* part of a surrogate pair, leave >=d800 */
810 /* BMP code point - may be surrogate code point - make <d800 */
815 /* now c1 and c2 are in the requested (code unit or code point) order */
816 return (int32_t)c1
-(int32_t)c2
;
820 * Compare two strings as presented by UCharIterators.
821 * Use code unit or code point order.
822 * When the function returns, it is undefined where the iterators
825 U_CAPI
int32_t U_EXPORT2
826 u_strCompareIter(UCharIterator
*iter1
, UCharIterator
*iter2
, UBool codePointOrder
) {
829 /* argument checking */
830 if(iter1
==NULL
|| iter2
==NULL
) {
831 return 0; /* bad arguments */
834 return 0; /* identical iterators */
837 /* reset iterators to start? */
838 iter1
->move(iter1
, 0, UITER_START
);
839 iter2
->move(iter2
, 0, UITER_START
);
841 /* compare identical prefixes - they do not need to be fixed up */
843 c1
=iter1
->next(iter1
);
844 c2
=iter2
->next(iter2
);
853 /* if both values are in or above the surrogate range, fix them up */
854 if(c1
>=0xd800 && c2
>=0xd800 && codePointOrder
) {
855 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
857 (c1
<=0xdbff && U16_IS_TRAIL(iter1
->current(iter1
))) ||
858 (U16_IS_TRAIL(c1
) && (iter1
->previous(iter1
), U16_IS_LEAD(iter1
->previous(iter1
))))
860 /* part of a surrogate pair, leave >=d800 */
862 /* BMP code point - may be surrogate code point - make <d800 */
867 (c2
<=0xdbff && U16_IS_TRAIL(iter2
->current(iter2
))) ||
868 (U16_IS_TRAIL(c2
) && (iter2
->previous(iter2
), U16_IS_LEAD(iter2
->previous(iter2
))))
870 /* part of a surrogate pair, leave >=d800 */
872 /* BMP code point - may be surrogate code point - make <d800 */
877 /* now c1 and c2 are in the requested (code unit or code point) order */
878 return (int32_t)c1
-(int32_t)c2
;
883 * u_strCompareIter() does not leave the iterators _on_ the different units.
884 * This is possible but would cost a few extra indirect function calls to back
885 * up if the last unit (c1 or c2 respectively) was >=0.
887 * Consistently leaving them _behind_ the different units is not an option
888 * because the current "unit" is the end of the string if that is reached,
889 * and in such a case the iterator does not move.
890 * For example, when comparing "ab" with "abc", both iterators rest _on_ the end
891 * of their strings. Calling previous() on each does not move them to where
892 * the comparison fails.
894 * So the simplest semantics is to not define where the iterators end up.
896 * The following fragment is part of what would need to be done for backing up.
899 /* iff a surrogate is part of a surrogate pair, leave >=d800 */
901 if(!U16_IS_TRAIL(iter1
->current(iter1
))) {
902 /* lead surrogate code point - make <d800 */
905 } else if(c1
<=0xdfff) {
906 int32_t idx
=iter1
->getIndex(iter1
, UITER_CURRENT
);
907 iter1
->previous(iter1
); /* ==c1 */
908 if(!U16_IS_LEAD(iter1
->previous(iter1
))) {
909 /* trail surrogate code point - make <d800 */
912 /* go back to behind where the difference is */
913 iter1
->move(iter1
, idx
, UITER_ZERO
);
914 } else /* 0xe000<=c1<=0xffff */ {
915 /* BMP code point - make <d800 */
921 U_CAPI
int32_t U_EXPORT2
922 u_strCompare(const UChar
*s1
, int32_t length1
,
923 const UChar
*s2
, int32_t length2
,
924 UBool codePointOrder
) {
925 /* argument checking */
926 if(s1
==NULL
|| length1
<-1 || s2
==NULL
|| length2
<-1) {
929 return uprv_strCompare(s1
, length1
, s2
, length2
, FALSE
, codePointOrder
);
932 /* String compare in code point order - u_strcmp() compares in code unit order. */
933 U_CAPI
int32_t U_EXPORT2
934 u_strcmpCodePointOrder(const UChar
*s1
, const UChar
*s2
) {
935 return uprv_strCompare(s1
, -1, s2
, -1, FALSE
, TRUE
);
938 U_CAPI
int32_t U_EXPORT2
939 u_strncmp(const UChar
*s1
,
946 rc
= (int32_t)*s1
- (int32_t)*s2
;
947 if(rc
!= 0 || *s1
== 0 || --n
== 0) {
958 U_CAPI
int32_t U_EXPORT2
959 u_strncmpCodePointOrder(const UChar
*s1
, const UChar
*s2
, int32_t n
) {
960 return uprv_strCompare(s1
, n
, s2
, n
, TRUE
, TRUE
);
963 U_CAPI UChar
* U_EXPORT2
967 UChar
*anchor
= dst
; /* save a pointer to start of dst */
969 while((*(dst
++) = *(src
++)) != 0) { /* copy string 2 over */
975 U_CAPI UChar
* U_EXPORT2
976 u_strncpy(UChar
*dst
,
980 UChar
*anchor
= dst
; /* save a pointer to start of dst */
982 /* copy string 2 over */
983 while(n
> 0 && (*(dst
++) = *(src
++)) != 0) {
990 U_CAPI
int32_t U_EXPORT2
991 u_strlen(const UChar
*s
)
993 #if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
994 return (int32_t)uprv_wcslen(s
);
1004 U_CAPI
int32_t U_EXPORT2
1005 u_countChar32(const UChar
*s
, int32_t length
) {
1008 if(s
==NULL
|| length
<-1) {
1016 if(U16_IS_LEAD(*s
) && length
>=2 && U16_IS_TRAIL(*(s
+1))) {
1024 } else /* length==-1 */ {
1034 * sufficient to look ahead one because of UTF-16;
1035 * safe to look ahead one because at worst that would be the terminating NUL
1037 if(U16_IS_LEAD(c
) && U16_IS_TRAIL(*s
)) {
1045 U_CAPI UBool U_EXPORT2
1046 u_strHasMoreChar32Than(const UChar
*s
, int32_t length
, int32_t number
) {
1051 if(s
==NULL
|| length
<-1) {
1056 /* s is NUL-terminated */
1059 /* count code points until they exceed */
1067 if(U16_IS_LEAD(c
) && U16_IS_TRAIL(*s
)) {
1073 /* length>=0 known */
1075 int32_t maxSupplementary
;
1077 /* s contains at least (length+1)/2 code points: <=2 UChars per cp */
1078 if(((length
+1)/2)>number
) {
1082 /* check if s does not even contain enough UChars */
1083 maxSupplementary
=length
-number
;
1084 if(maxSupplementary
<=0) {
1087 /* there are maxSupplementary=length-number more UChars than asked-for code points */
1090 * count code points until they exceed and also check that there are
1091 * no more than maxSupplementary supplementary code points (UChar pairs)
1101 if(U16_IS_LEAD(*s
++) && s
!=limit
&& U16_IS_TRAIL(*s
)) {
1103 if(--maxSupplementary
<=0) {
1104 /* too many pairs - too few code points */
1113 U_CAPI UChar
* U_EXPORT2
1114 u_memcpy(UChar
*dest
, const UChar
*src
, int32_t count
) {
1116 uprv_memcpy(dest
, src
, (size_t)count
*U_SIZEOF_UCHAR
);
1121 U_CAPI UChar
* U_EXPORT2
1122 u_memmove(UChar
*dest
, const UChar
*src
, int32_t count
) {
1124 uprv_memmove(dest
, src
, (size_t)count
*U_SIZEOF_UCHAR
);
1129 U_CAPI UChar
* U_EXPORT2
1130 u_memset(UChar
*dest
, UChar c
, int32_t count
) {
1133 UChar
*limit
= dest
+ count
;
1135 while (ptr
< limit
) {
1142 U_CAPI
int32_t U_EXPORT2
1143 u_memcmp(const UChar
*buf1
, const UChar
*buf2
, int32_t count
) {
1145 const UChar
*limit
= buf1
+ count
;
1148 while (buf1
< limit
) {
1149 result
= (int32_t)(uint16_t)*buf1
- (int32_t)(uint16_t)*buf2
;
1160 U_CAPI
int32_t U_EXPORT2
1161 u_memcmpCodePointOrder(const UChar
*s1
, const UChar
*s2
, int32_t count
) {
1162 return uprv_strCompare(s1
, count
, s2
, count
, FALSE
, TRUE
);
1165 /* u_unescape & support fns ------------------------------------------------- */
1167 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
1168 static const UChar UNESCAPE_MAP
[] = {
1182 enum { UNESCAPE_MAP_LENGTH
= UPRV_LENGTHOF(UNESCAPE_MAP
) };
1184 /* Convert one octal digit to a numeric value 0..7, or -1 on failure */
1185 static int8_t _digit8(UChar c
) {
1186 if (c
>= 0x0030 && c
<= 0x0037) {
1187 return (int8_t)(c
- 0x0030);
1192 /* Convert one hex digit to a numeric value 0..F, or -1 on failure */
1193 static int8_t _digit16(UChar c
) {
1194 if (c
>= 0x0030 && c
<= 0x0039) {
1195 return (int8_t)(c
- 0x0030);
1197 if (c
>= 0x0041 && c
<= 0x0046) {
1198 return (int8_t)(c
- (0x0041 - 10));
1200 if (c
>= 0x0061 && c
<= 0x0066) {
1201 return (int8_t)(c
- (0x0061 - 10));
1206 /* Parse a single escape sequence. Although this method deals in
1207 * UChars, it does not use C++ or UnicodeString. This allows it to
1208 * be used from C contexts. */
1209 U_CAPI UChar32 U_EXPORT2
1210 u_unescapeAt(UNESCAPE_CHAR_AT charAt
,
1215 int32_t start
= *offset
;
1221 int8_t bitsPerDigit
= 4;
1224 UBool braces
= FALSE
;
1226 /* Check that offset is in range */
1227 if (*offset
< 0 || *offset
>= length
) {
1231 /* Fetch first UChar after '\\' */
1232 c
= charAt((*offset
)++, context
);
1234 /* Convert hexadecimal and octal escapes */
1236 case 0x0075 /*'u'*/:
1237 minDig
= maxDig
= 4;
1239 case 0x0055 /*'U'*/:
1240 minDig
= maxDig
= 8;
1242 case 0x0078 /*'x'*/:
1244 if (*offset
< length
&& charAt(*offset
, context
) == 0x7B /*{*/) {
1257 n
= 1; /* Already have first octal digit */
1264 while (*offset
< length
&& n
< maxDig
) {
1265 c
= charAt(*offset
, context
);
1266 dig
= (int8_t)((bitsPerDigit
== 3) ? _digit8(c
) : _digit16(c
));
1270 result
= (result
<< bitsPerDigit
) | dig
;
1278 if (c
!= 0x7D /*}*/) {
1283 if (result
< 0 || result
>= 0x110000) {
1286 /* If an escape sequence specifies a lead surrogate, see if
1287 * there is a trail surrogate after it, either as an escape or
1288 * as a literal. If so, join them up into a supplementary.
1290 if (*offset
< length
&& U16_IS_LEAD(result
)) {
1291 int32_t ahead
= *offset
+ 1;
1292 c
= charAt(*offset
, context
);
1293 if (c
== 0x5C /*'\\'*/ && ahead
< length
) {
1294 c
= (UChar
) u_unescapeAt(charAt
, &ahead
, length
, context
);
1296 if (U16_IS_TRAIL(c
)) {
1298 result
= U16_GET_SUPPLEMENTARY(result
, c
);
1304 /* Convert C-style escapes in table */
1305 for (i
=0; i
<UNESCAPE_MAP_LENGTH
; i
+=2) {
1306 if (c
== UNESCAPE_MAP
[i
]) {
1307 return UNESCAPE_MAP
[i
+1];
1308 } else if (c
< UNESCAPE_MAP
[i
]) {
1313 /* Map \cX to control-X: X & 0x1F */
1314 if (c
== 0x0063 /*'c'*/ && *offset
< length
) {
1315 c
= charAt((*offset
)++, context
);
1316 if (U16_IS_LEAD(c
) && *offset
< length
) {
1317 UChar c2
= charAt(*offset
, context
);
1318 if (U16_IS_TRAIL(c2
)) {
1320 c
= (UChar
) U16_GET_SUPPLEMENTARY(c
, c2
); /* [sic] */
1326 /* If no special forms are recognized, then consider
1327 * the backslash to generically escape the next character.
1328 * Deal with surrogate pairs. */
1329 if (U16_IS_LEAD(c
) && *offset
< length
) {
1330 UChar c2
= charAt(*offset
, context
);
1331 if (U16_IS_TRAIL(c2
)) {
1333 return U16_GET_SUPPLEMENTARY(c
, c2
);
1339 /* Invalid escape sequence */
1340 *offset
= start
; /* Reset to initial value */
1341 return (UChar32
)0xFFFFFFFF;
1344 /* u_unescapeAt() callback to return a UChar from a char* */
1345 static UChar U_CALLCONV
1346 _charPtr_charAt(int32_t offset
, void *context
) {
1348 /* It would be more efficient to access the invariant tables
1349 * directly but there is no API for that. */
1350 u_charsToUChars(((char*) context
) + offset
, &c16
, 1);
1354 /* Append an escape-free segment of the text; used by u_unescape() */
1355 static void _appendUChars(UChar
*dest
, int32_t destCapacity
,
1356 const char *src
, int32_t srcLen
) {
1357 if (destCapacity
< 0) {
1360 if (srcLen
> destCapacity
) {
1361 srcLen
= destCapacity
;
1363 u_charsToUChars(src
, dest
, srcLen
);
1366 /* Do an invariant conversion of char* -> UChar*, with escape parsing */
1367 U_CAPI
int32_t U_EXPORT2
1368 u_unescape(const char *src
, UChar
*dest
, int32_t destCapacity
) {
1369 const char *segment
= src
;
1373 while ((c
=*src
) != 0) {
1374 /* '\\' intentionally written as compiler-specific
1375 * character constant to correspond to compiler-specific
1376 * char* constants. */
1378 int32_t lenParsed
= 0;
1380 if (src
!= segment
) {
1382 _appendUChars(dest
+ i
, destCapacity
- i
,
1383 segment
, (int32_t)(src
- segment
));
1385 i
+= (int32_t)(src
- segment
);
1387 ++src
; /* advance past '\\' */
1388 c32
= (UChar32
)u_unescapeAt(_charPtr_charAt
, &lenParsed
, (int32_t)uprv_strlen(src
), (void*)src
);
1389 if (lenParsed
== 0) {
1392 src
+= lenParsed
; /* advance past escape seq. */
1393 if (dest
!= NULL
&& U16_LENGTH(c32
) <= (destCapacity
- i
)) {
1394 U16_APPEND_UNSAFE(dest
, i
, c32
);
1396 i
+= U16_LENGTH(c32
);
1403 if (src
!= segment
) {
1405 _appendUChars(dest
+ i
, destCapacity
- i
,
1406 segment
, (int32_t)(src
- segment
));
1408 i
+= (int32_t)(src
- segment
);
1410 if (dest
!= NULL
&& i
< destCapacity
) {
1416 if (dest
!= NULL
&& destCapacity
> 0) {
1422 /* NUL-termination of strings ----------------------------------------------- */
1425 * NUL-terminate a string no matter what its type.
1426 * Set warning and error codes accordingly.
1428 #define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) \
1429 if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) { \
1430 /* not a public function, so no complete argument checking */ \
1433 /* assume that the caller handles this */ \
1434 } else if(length<destCapacity) { \
1435 /* NUL-terminate the string, the NUL fits */ \
1437 /* unset the not-terminated warning but leave all others */ \
1438 if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) { \
1439 *pErrorCode=U_ZERO_ERROR; \
1441 } else if(length==destCapacity) { \
1442 /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
1443 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING; \
1444 } else /* length>destCapacity */ { \
1445 /* even the string itself did not fit - set an error code */ \
1446 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; \
1450 U_CAPI
int32_t U_EXPORT2
1451 u_terminateUChars(UChar
*dest
, int32_t destCapacity
, int32_t length
, UErrorCode
*pErrorCode
) {
1452 __TERMINATE_STRING(dest
, destCapacity
, length
, pErrorCode
);
1456 U_CAPI
int32_t U_EXPORT2
1457 u_terminateChars(char *dest
, int32_t destCapacity
, int32_t length
, UErrorCode
*pErrorCode
) {
1458 __TERMINATE_STRING(dest
, destCapacity
, length
, pErrorCode
);
1462 U_CAPI
int32_t U_EXPORT2
1463 u_terminateUChar32s(UChar32
*dest
, int32_t destCapacity
, int32_t length
, UErrorCode
*pErrorCode
) {
1464 __TERMINATE_STRING(dest
, destCapacity
, length
, pErrorCode
);
1468 U_CAPI
int32_t U_EXPORT2
1469 u_terminateWChars(wchar_t *dest
, int32_t destCapacity
, int32_t length
, UErrorCode
*pErrorCode
) {
1470 __TERMINATE_STRING(dest
, destCapacity
, length
, pErrorCode
);
1474 // Compute the hash code for a string -------------------------------------- ***
1476 // Moved here from uhash.c so that UnicodeString::hashCode() does not depend
1477 // on UHashtable code.
1480 Compute the hash by iterating sparsely over about 32 (up to 63)
1481 characters spaced evenly through the string. For each character,
1482 multiply the previous hash value by a prime number and add the new
1483 character in, like a linear congruential random number generator,
1484 producing a pseudorandom deterministic value well distributed over
1485 the output range. [LIU]
1488 #define STRING_HASH(TYPE, STR, STRLEN, DEREF) \
1489 uint32_t hash = 0; \
1490 const TYPE *p = (const TYPE*) STR; \
1492 int32_t len = (int32_t)(STRLEN); \
1493 int32_t inc = ((len - 32) / 32) + 1; \
1494 const TYPE *limit = p + len; \
1496 hash = (hash * 37) + DEREF; \
1500 return static_cast<int32_t>(hash)
1502 /* Used by UnicodeString to compute its hashcode - Not public API. */
1503 U_CAPI
int32_t U_EXPORT2
1504 ustr_hashUCharsN(const UChar
*str
, int32_t length
) {
1505 STRING_HASH(UChar
, str
, length
, *p
);
1508 U_CAPI
int32_t U_EXPORT2
1509 ustr_hashCharsN(const char *str
, int32_t length
) {
1510 STRING_HASH(uint8_t, str
, length
, *p
);
1513 U_CAPI
int32_t U_EXPORT2
1514 ustr_hashICharsN(const char *str
, int32_t length
) {
1515 STRING_HASH(char, str
, length
, (uint8_t)uprv_tolower(*p
));