]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/ustring.cpp
2 ******************************************************************************
4 * Copyright (C) 1998-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
11 * Modification History:
13 * Date Name Description
14 * 12/07/98 bertrand Creation.
15 ******************************************************************************
18 #include "unicode/utypes.h"
19 #include "unicode/putil.h"
20 #include "unicode/ustring.h"
21 #include "unicode/utf16.h"
27 /* ANSI string.h - style functions ------------------------------------------ */
29 /* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */
30 #define U_BMP_MAX 0xffff
32 /* Forward binary string search functions ----------------------------------- */
35 * Test if a substring match inside a string is at code point boundaries.
36 * All pointers refer to the same buffer.
37 * The limit pointer may be NULL, all others must be real pointers.
40 isMatchAtCPBoundary(const UChar
*start
, const UChar
*match
, const UChar
*matchLimit
, const UChar
*limit
) {
41 if(U16_IS_TRAIL(*match
) && start
!=match
&& U16_IS_LEAD(*(match
-1))) {
42 /* the leading edge of the match is in the middle of a surrogate pair */
45 if(U16_IS_LEAD(*(matchLimit
-1)) && match
!=limit
&& U16_IS_TRAIL(*matchLimit
)) {
46 /* the trailing edge of the match is in the middle of a surrogate pair */
52 U_CAPI UChar
* U_EXPORT2
53 u_strFindFirst(const UChar
*s
, int32_t length
,
54 const UChar
*sub
, int32_t subLength
) {
55 const UChar
*start
, *p
, *q
, *subLimit
;
58 if(sub
==NULL
|| subLength
<-1) {
61 if(s
==NULL
|| length
<-1) {
67 if(length
<0 && subLength
<0) {
68 /* both strings are NUL-terminated */
72 if(*sub
==0 && !U16_IS_SURROGATE(cs
)) {
73 /* the substring consists of a single, non-surrogate BMP code point */
74 return u_strchr(s
, cs
);
79 /* found first substring UChar, compare rest */
84 if(isMatchAtCPBoundary(start
, s
-1, p
, NULL
)) {
85 return (UChar
*)(s
-1); /* well-formed match */
87 break; /* no match because surrogate pair is split */
91 return NULL
; /* no match, and none possible after s */
107 subLength
=u_strlen(sub
);
113 /* get sub[0] to search for it fast */
116 subLimit
=sub
+subLength
;
118 if(subLength
==0 && !U16_IS_SURROGATE(cs
)) {
119 /* the substring consists of a single, non-surrogate BMP code point */
120 return length
<0 ? u_strchr(s
, cs
) : u_memchr(s
, cs
, length
);
124 /* s is NUL-terminated */
127 /* found first substring UChar, compare rest */
132 if(isMatchAtCPBoundary(start
, s
-1, p
, NULL
)) {
133 return (UChar
*)(s
-1); /* well-formed match */
135 break; /* no match because surrogate pair is split */
139 return NULL
; /* no match, and none possible after s */
142 break; /* no match */
150 const UChar
*limit
, *preLimit
;
152 /* subLength was decremented above */
153 if(length
<=subLength
) {
154 return NULL
; /* s is shorter than sub */
159 /* the substring must start before preLimit */
160 preLimit
=limit
-subLength
;
165 /* found first substring UChar, compare rest */
170 if(isMatchAtCPBoundary(start
, s
-1, p
, limit
)) {
171 return (UChar
*)(s
-1); /* well-formed match */
173 break; /* no match because surrogate pair is split */
177 break; /* no match */
190 U_CAPI UChar
* U_EXPORT2
191 u_strstr(const UChar
*s
, const UChar
*substring
) {
192 return u_strFindFirst(s
, -1, substring
, -1);
195 U_CAPI UChar
* U_EXPORT2
196 u_strchr(const UChar
*s
, UChar c
) {
197 if(U16_IS_SURROGATE(c
)) {
198 /* make sure to not find half of a surrogate pair */
199 return u_strFindFirst(s
, -1, &c
, 1);
203 /* trivial search for a BMP code point */
216 U_CAPI UChar
* U_EXPORT2
217 u_strchr32(const UChar
*s
, UChar32 c
) {
218 if((uint32_t)c
<=U_BMP_MAX
) {
219 /* find BMP code point */
220 return u_strchr(s
, (UChar
)c
);
221 } else if((uint32_t)c
<=UCHAR_MAX_VALUE
) {
222 /* find supplementary code point as surrogate pair */
223 UChar cs
, lead
=U16_LEAD(c
), trail
=U16_TRAIL(c
);
225 while((cs
=*s
++)!=0) {
226 if(cs
==lead
&& *s
==trail
) {
227 return (UChar
*)(s
-1);
232 /* not a Unicode code point, not findable */
237 U_CAPI UChar
* U_EXPORT2
238 u_memchr(const UChar
*s
, UChar c
, int32_t count
) {
240 return NULL
; /* no string */
241 } else if(U16_IS_SURROGATE(c
)) {
242 /* make sure to not find half of a surrogate pair */
243 return u_strFindFirst(s
, count
, &c
, 1);
245 /* trivial search for a BMP code point */
246 const UChar
*limit
=s
+count
;
256 U_CAPI UChar
* U_EXPORT2
257 u_memchr32(const UChar
*s
, UChar32 c
, int32_t count
) {
258 if((uint32_t)c
<=U_BMP_MAX
) {
259 /* find BMP code point */
260 return u_memchr(s
, (UChar
)c
, count
);
262 /* too short for a surrogate pair */
264 } else if((uint32_t)c
<=UCHAR_MAX_VALUE
) {
265 /* find supplementary code point as surrogate pair */
266 const UChar
*limit
=s
+count
-1; /* -1 so that we do not need a separate check for the trail unit */
267 UChar lead
=U16_LEAD(c
), trail
=U16_TRAIL(c
);
270 if(*s
==lead
&& *(s
+1)==trail
) {
276 /* not a Unicode code point, not findable */
281 /* Backward binary string search functions ---------------------------------- */
283 U_CAPI UChar
* U_EXPORT2
284 u_strFindLast(const UChar
*s
, int32_t length
,
285 const UChar
*sub
, int32_t subLength
) {
286 const UChar
*start
, *limit
, *p
, *q
, *subLimit
;
289 if(sub
==NULL
|| subLength
<-1) {
292 if(s
==NULL
|| length
<-1) {
297 * This implementation is more lazy than the one for u_strFindFirst():
298 * There is no special search code for NUL-terminated strings.
299 * It does not seem to be worth it for searching substrings to
300 * search forward and find all matches like in u_strrchr() and similar.
301 * Therefore, we simply get both string lengths and search backward.
307 subLength
=u_strlen(sub
);
313 /* get sub[subLength-1] to search for it fast */
314 subLimit
=sub
+subLength
;
318 if(subLength
==0 && !U16_IS_SURROGATE(cs
)) {
319 /* the substring consists of a single, non-surrogate BMP code point */
320 return length
<0 ? u_strrchr(s
, cs
) : u_memrchr(s
, cs
, length
);
327 /* subLength was decremented above */
328 if(length
<=subLength
) {
329 return NULL
; /* s is shorter than sub */
335 /* the substring must start no later than s+subLength */
341 /* found last substring UChar, compare rest */
346 if(isMatchAtCPBoundary(start
, p
, limit
+1, start
+length
)) {
347 return (UChar
*)p
; /* well-formed match */
349 break; /* no match because surrogate pair is split */
353 break; /* no match */
363 U_CAPI UChar
* U_EXPORT2
364 u_strrstr(const UChar
*s
, const UChar
*substring
) {
365 return u_strFindLast(s
, -1, substring
, -1);
368 U_CAPI UChar
* U_EXPORT2
369 u_strrchr(const UChar
*s
, UChar c
) {
370 if(U16_IS_SURROGATE(c
)) {
371 /* make sure to not find half of a surrogate pair */
372 return u_strFindLast(s
, -1, &c
, 1);
374 const UChar
*result
=NULL
;
377 /* trivial search for a BMP code point */
383 return (UChar
*)result
;
390 U_CAPI UChar
* U_EXPORT2
391 u_strrchr32(const UChar
*s
, UChar32 c
) {
392 if((uint32_t)c
<=U_BMP_MAX
) {
393 /* find BMP code point */
394 return u_strrchr(s
, (UChar
)c
);
395 } else if((uint32_t)c
<=UCHAR_MAX_VALUE
) {
396 /* find supplementary code point as surrogate pair */
397 const UChar
*result
=NULL
;
398 UChar cs
, lead
=U16_LEAD(c
), trail
=U16_TRAIL(c
);
400 while((cs
=*s
++)!=0) {
401 if(cs
==lead
&& *s
==trail
) {
405 return (UChar
*)result
;
407 /* not a Unicode code point, not findable */
412 U_CAPI UChar
* U_EXPORT2
413 u_memrchr(const UChar
*s
, UChar c
, int32_t count
) {
415 return NULL
; /* no string */
416 } else if(U16_IS_SURROGATE(c
)) {
417 /* make sure to not find half of a surrogate pair */
418 return u_strFindLast(s
, count
, &c
, 1);
420 /* trivial search for a BMP code point */
421 const UChar
*limit
=s
+count
;
424 return (UChar
*)limit
;
431 U_CAPI UChar
* U_EXPORT2
432 u_memrchr32(const UChar
*s
, UChar32 c
, int32_t count
) {
433 if((uint32_t)c
<=U_BMP_MAX
) {
434 /* find BMP code point */
435 return u_memrchr(s
, (UChar
)c
, count
);
437 /* too short for a surrogate pair */
439 } else if((uint32_t)c
<=UCHAR_MAX_VALUE
) {
440 /* find supplementary code point as surrogate pair */
441 const UChar
*limit
=s
+count
-1;
442 UChar lead
=U16_LEAD(c
), trail
=U16_TRAIL(c
);
445 if(*limit
==trail
&& *(limit
-1)==lead
) {
446 return (UChar
*)(limit
-1);
451 /* not a Unicode code point, not findable */
456 /* Tokenization functions --------------------------------------------------- */
459 * Match each code point in a string against each code point in the matchSet.
460 * Return the index of the first string code point that
461 * is (polarity==TRUE) or is not (FALSE) contained in the matchSet.
462 * Return -(string length)-1 if there is no such code point.
465 _matchFromSet(const UChar
*string
, const UChar
*matchSet
, UBool polarity
) {
466 int32_t matchLen
, matchBMPLen
, strItr
, matchItr
;
467 UChar32 stringCh
, matchCh
;
470 /* first part of matchSet contains only BMP code points */
472 while((c
= matchSet
[matchBMPLen
]) != 0 && U16_IS_SINGLE(c
)) {
476 /* second part of matchSet contains BMP and supplementary code points */
477 matchLen
= matchBMPLen
;
478 while(matchSet
[matchLen
] != 0) {
482 for(strItr
= 0; (c
= string
[strItr
]) != 0;) {
484 if(U16_IS_SINGLE(c
)) {
486 for(matchItr
= 0; matchItr
< matchLen
; ++matchItr
) {
487 if(c
== matchSet
[matchItr
]) {
488 return strItr
- 1; /* one matches */
492 for(matchItr
= 0; matchItr
< matchLen
; ++matchItr
) {
493 if(c
== matchSet
[matchItr
]) {
497 return strItr
- 1; /* none matches */
501 * No need to check for string length before U16_IS_TRAIL
502 * because c2 could at worst be the terminating NUL.
504 if(U16_IS_SURROGATE_LEAD(c
) && U16_IS_TRAIL(c2
= string
[strItr
])) {
506 stringCh
= U16_GET_SUPPLEMENTARY(c
, c2
);
508 stringCh
= c
; /* unpaired trail surrogate */
512 for(matchItr
= matchBMPLen
; matchItr
< matchLen
;) {
513 U16_NEXT(matchSet
, matchItr
, matchLen
, matchCh
);
514 if(stringCh
== matchCh
) {
515 return strItr
- U16_LENGTH(stringCh
); /* one matches */
519 for(matchItr
= matchBMPLen
; matchItr
< matchLen
;) {
520 U16_NEXT(matchSet
, matchItr
, matchLen
, matchCh
);
521 if(stringCh
== matchCh
) {
525 return strItr
- U16_LENGTH(stringCh
); /* none matches */
529 /* wish C had continue with labels like Java... */;
532 /* Didn't find it. */
536 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
537 U_CAPI UChar
* U_EXPORT2
538 u_strpbrk(const UChar
*string
, const UChar
*matchSet
)
540 int32_t idx
= _matchFromSet(string
, matchSet
, TRUE
);
542 return (UChar
*)string
+ idx
;
548 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
549 U_CAPI
int32_t U_EXPORT2
550 u_strcspn(const UChar
*string
, const UChar
*matchSet
)
552 int32_t idx
= _matchFromSet(string
, matchSet
, TRUE
);
556 return -idx
- 1; /* == u_strlen(string) */
560 /* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
561 U_CAPI
int32_t U_EXPORT2
562 u_strspn(const UChar
*string
, const UChar
*matchSet
)
564 int32_t idx
= _matchFromSet(string
, matchSet
, FALSE
);
568 return -idx
- 1; /* == u_strlen(string) */
572 /* ----- Text manipulation functions --- */
574 U_CAPI UChar
* U_EXPORT2
575 u_strtok_r(UChar
*src
,
581 uint32_t nonDelimIdx
;
583 /* If saveState is NULL, the user messed up. */
586 *saveState
= src
; /* Set to "src" in case there are no delimiters */
588 else if (*saveState
) {
589 tokSource
= *saveState
;
592 /* src == NULL && *saveState == NULL */
593 /* This shouldn't happen. We already finished tokenizing. */
597 /* Skip initial delimiters */
598 nonDelimIdx
= u_strspn(tokSource
, delim
);
599 tokSource
= &tokSource
[nonDelimIdx
];
602 nextToken
= u_strpbrk(tokSource
, delim
);
603 if (nextToken
!= NULL
) {
606 *saveState
= nextToken
;
609 else if (*saveState
) {
610 /* Return the last token */
616 /* No tokens were found. Only delimiters were left. */
622 /* Miscellaneous functions -------------------------------------------------- */
624 U_CAPI UChar
* U_EXPORT2
628 UChar
*anchor
= dst
; /* save a pointer to start of dst */
630 while(*dst
!= 0) { /* To end of first string */
633 while((*(dst
++) = *(src
++)) != 0) { /* copy string 2 over */
639 U_CAPI UChar
* U_EXPORT2
640 u_strncat(UChar
*dst
,
645 UChar
*anchor
= dst
; /* save a pointer to start of dst */
647 while(*dst
!= 0) { /* To end of first string */
650 while((*dst
= *src
) != 0) { /* copy string 2 over */
665 /* ----- Text property functions --- */
667 U_CAPI
int32_t U_EXPORT2
668 u_strcmp(const UChar
*s1
,
676 if (c1
!= c2
|| c1
== 0) {
680 return (int32_t)c1
- (int32_t)c2
;
683 U_CFUNC
int32_t U_EXPORT2
684 uprv_strCompare(const UChar
*s1
, int32_t length1
,
685 const UChar
*s2
, int32_t length2
,
686 UBool strncmpStyle
, UBool codePointOrder
) {
687 const UChar
*start1
, *start2
, *limit1
, *limit2
;
690 /* setup for fix-up */
694 /* compare identical prefixes - they do not need to be fixed up */
695 if(length1
<0 && length2
<0) {
696 /* strcmp style, both NUL-terminated */
714 /* setup for fix-up */
716 } else if(strncmpStyle
) {
717 /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
722 limit1
=start1
+length1
;
725 /* both lengths are same, check only one limit */
742 /* setup for fix-up */
743 limit2
=start2
+length1
; /* use length1 here, too, to enforce assumption */
745 /* memcmp/UnicodeString style, both length-specified */
746 int32_t lengthResult
;
749 length1
=u_strlen(s1
);
752 length2
=u_strlen(s2
);
755 /* limit1=start1+min(length1, length2) */
756 if(length1
<length2
) {
758 limit1
=start1
+length1
;
759 } else if(length1
==length2
) {
761 limit1
=start1
+length1
;
762 } else /* length1>length2 */ {
764 limit1
=start1
+length2
;
772 /* check pseudo-limit */
786 /* setup for fix-up */
787 limit1
=start1
+length1
;
788 limit2
=start2
+length2
;
791 /* if both values are in or above the surrogate range, fix them up */
792 if(c1
>=0xd800 && c2
>=0xd800 && codePointOrder
) {
793 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
795 (c1
<=0xdbff && (s1
+1)!=limit1
&& U16_IS_TRAIL(*(s1
+1))) ||
796 (U16_IS_TRAIL(c1
) && start1
!=s1
&& U16_IS_LEAD(*(s1
-1)))
798 /* part of a surrogate pair, leave >=d800 */
800 /* BMP code point - may be surrogate code point - make <d800 */
805 (c2
<=0xdbff && (s2
+1)!=limit2
&& U16_IS_TRAIL(*(s2
+1))) ||
806 (U16_IS_TRAIL(c2
) && start2
!=s2
&& U16_IS_LEAD(*(s2
-1)))
808 /* part of a surrogate pair, leave >=d800 */
810 /* BMP code point - may be surrogate code point - make <d800 */
815 /* now c1 and c2 are in the requested (code unit or code point) order */
816 return (int32_t)c1
-(int32_t)c2
;
820 * Compare two strings as presented by UCharIterators.
821 * Use code unit or code point order.
822 * When the function returns, it is undefined where the iterators
825 U_CAPI
int32_t U_EXPORT2
826 u_strCompareIter(UCharIterator
*iter1
, UCharIterator
*iter2
, UBool codePointOrder
) {
829 /* argument checking */
830 if(iter1
==NULL
|| iter2
==NULL
) {
831 return 0; /* bad arguments */
834 return 0; /* identical iterators */
837 /* reset iterators to start? */
838 iter1
->move(iter1
, 0, UITER_START
);
839 iter2
->move(iter2
, 0, UITER_START
);
841 /* compare identical prefixes - they do not need to be fixed up */
843 c1
=iter1
->next(iter1
);
844 c2
=iter2
->next(iter2
);
853 /* if both values are in or above the surrogate range, fix them up */
854 if(c1
>=0xd800 && c2
>=0xd800 && codePointOrder
) {
855 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
857 (c1
<=0xdbff && U16_IS_TRAIL(iter1
->current(iter1
))) ||
858 (U16_IS_TRAIL(c1
) && (iter1
->previous(iter1
), U16_IS_LEAD(iter1
->previous(iter1
))))
860 /* part of a surrogate pair, leave >=d800 */
862 /* BMP code point - may be surrogate code point - make <d800 */
867 (c2
<=0xdbff && U16_IS_TRAIL(iter2
->current(iter2
))) ||
868 (U16_IS_TRAIL(c2
) && (iter2
->previous(iter2
), U16_IS_LEAD(iter2
->previous(iter2
))))
870 /* part of a surrogate pair, leave >=d800 */
872 /* BMP code point - may be surrogate code point - make <d800 */
877 /* now c1 and c2 are in the requested (code unit or code point) order */
878 return (int32_t)c1
-(int32_t)c2
;
883 * u_strCompareIter() does not leave the iterators _on_ the different units.
884 * This is possible but would cost a few extra indirect function calls to back
885 * up if the last unit (c1 or c2 respectively) was >=0.
887 * Consistently leaving them _behind_ the different units is not an option
888 * because the current "unit" is the end of the string if that is reached,
889 * and in such a case the iterator does not move.
890 * For example, when comparing "ab" with "abc", both iterators rest _on_ the end
891 * of their strings. Calling previous() on each does not move them to where
892 * the comparison fails.
894 * So the simplest semantics is to not define where the iterators end up.
896 * The following fragment is part of what would need to be done for backing up.
899 /* iff a surrogate is part of a surrogate pair, leave >=d800 */
901 if(!U16_IS_TRAIL(iter1
->current(iter1
))) {
902 /* lead surrogate code point - make <d800 */
905 } else if(c1
<=0xdfff) {
906 int32_t idx
=iter1
->getIndex(iter1
, UITER_CURRENT
);
907 iter1
->previous(iter1
); /* ==c1 */
908 if(!U16_IS_LEAD(iter1
->previous(iter1
))) {
909 /* trail surrogate code point - make <d800 */
912 /* go back to behind where the difference is */
913 iter1
->move(iter1
, idx
, UITER_ZERO
);
914 } else /* 0xe000<=c1<=0xffff */ {
915 /* BMP code point - make <d800 */
921 U_CAPI
int32_t U_EXPORT2
922 u_strCompare(const UChar
*s1
, int32_t length1
,
923 const UChar
*s2
, int32_t length2
,
924 UBool codePointOrder
) {
925 /* argument checking */
926 if(s1
==NULL
|| length1
<-1 || s2
==NULL
|| length2
<-1) {
929 return uprv_strCompare(s1
, length1
, s2
, length2
, FALSE
, codePointOrder
);
932 /* String compare in code point order - u_strcmp() compares in code unit order. */
933 U_CAPI
int32_t U_EXPORT2
934 u_strcmpCodePointOrder(const UChar
*s1
, const UChar
*s2
) {
935 return uprv_strCompare(s1
, -1, s2
, -1, FALSE
, TRUE
);
938 U_CAPI
int32_t U_EXPORT2
939 u_strncmp(const UChar
*s1
,
946 rc
= (int32_t)*s1
- (int32_t)*s2
;
947 if(rc
!= 0 || *s1
== 0 || --n
== 0) {
958 U_CAPI
int32_t U_EXPORT2
959 u_strncmpCodePointOrder(const UChar
*s1
, const UChar
*s2
, int32_t n
) {
960 return uprv_strCompare(s1
, n
, s2
, n
, TRUE
, TRUE
);
963 U_CAPI UChar
* U_EXPORT2
967 UChar
*anchor
= dst
; /* save a pointer to start of dst */
969 while((*(dst
++) = *(src
++)) != 0) { /* copy string 2 over */
975 U_CAPI UChar
* U_EXPORT2
976 u_strncpy(UChar
*dst
,
980 UChar
*anchor
= dst
; /* save a pointer to start of dst */
982 /* copy string 2 over */
983 while(n
> 0 && (*(dst
++) = *(src
++)) != 0) {
990 U_CAPI
int32_t U_EXPORT2
991 u_strlen(const UChar
*s
)
993 #if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
994 return (int32_t)uprv_wcslen(s
);
1004 U_CAPI
int32_t U_EXPORT2
1005 u_countChar32(const UChar
*s
, int32_t length
) {
1008 if(s
==NULL
|| length
<-1) {
1016 if(U16_IS_LEAD(*s
) && length
>=2 && U16_IS_TRAIL(*(s
+1))) {
1024 } else /* length==-1 */ {
1034 * sufficient to look ahead one because of UTF-16;
1035 * safe to look ahead one because at worst that would be the terminating NUL
1037 if(U16_IS_LEAD(c
) && U16_IS_TRAIL(*s
)) {
1045 U_CAPI UBool U_EXPORT2
1046 u_strHasMoreChar32Than(const UChar
*s
, int32_t length
, int32_t number
) {
1051 if(s
==NULL
|| length
<-1) {
1056 /* s is NUL-terminated */
1059 /* count code points until they exceed */
1067 if(U16_IS_LEAD(c
) && U16_IS_TRAIL(*s
)) {
1073 /* length>=0 known */
1075 int32_t maxSupplementary
;
1077 /* s contains at least (length+1)/2 code points: <=2 UChars per cp */
1078 if(((length
+1)/2)>number
) {
1082 /* check if s does not even contain enough UChars */
1083 maxSupplementary
=length
-number
;
1084 if(maxSupplementary
<=0) {
1087 /* there are maxSupplementary=length-number more UChars than asked-for code points */
1090 * count code points until they exceed and also check that there are
1091 * no more than maxSupplementary supplementary code points (UChar pairs)
1101 if(U16_IS_LEAD(*s
++) && s
!=limit
&& U16_IS_TRAIL(*s
)) {
1103 if(--maxSupplementary
<=0) {
1104 /* too many pairs - too few code points */
1113 U_CAPI UChar
* U_EXPORT2
1114 u_memcpy(UChar
*dest
, const UChar
*src
, int32_t count
) {
1115 return (UChar
*)uprv_memcpy(dest
, src
, count
*U_SIZEOF_UCHAR
);
1118 U_CAPI UChar
* U_EXPORT2
1119 u_memmove(UChar
*dest
, const UChar
*src
, int32_t count
) {
1120 return (UChar
*)uprv_memmove(dest
, src
, count
*U_SIZEOF_UCHAR
);
1123 U_CAPI UChar
* U_EXPORT2
1124 u_memset(UChar
*dest
, UChar c
, int32_t count
) {
1127 UChar
*limit
= dest
+ count
;
1129 while (ptr
< limit
) {
1136 U_CAPI
int32_t U_EXPORT2
1137 u_memcmp(const UChar
*buf1
, const UChar
*buf2
, int32_t count
) {
1139 const UChar
*limit
= buf1
+ count
;
1142 while (buf1
< limit
) {
1143 result
= (int32_t)(uint16_t)*buf1
- (int32_t)(uint16_t)*buf2
;
1154 U_CAPI
int32_t U_EXPORT2
1155 u_memcmpCodePointOrder(const UChar
*s1
, const UChar
*s2
, int32_t count
) {
1156 return uprv_strCompare(s1
, count
, s2
, count
, FALSE
, TRUE
);
1159 /* u_unescape & support fns ------------------------------------------------- */
1161 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
1162 static const UChar UNESCAPE_MAP
[] = {
1176 enum { UNESCAPE_MAP_LENGTH
= sizeof(UNESCAPE_MAP
) / sizeof(UNESCAPE_MAP
[0]) };
1178 /* Convert one octal digit to a numeric value 0..7, or -1 on failure */
1179 static int8_t _digit8(UChar c
) {
1180 if (c
>= 0x0030 && c
<= 0x0037) {
1181 return (int8_t)(c
- 0x0030);
1186 /* Convert one hex digit to a numeric value 0..F, or -1 on failure */
1187 static int8_t _digit16(UChar c
) {
1188 if (c
>= 0x0030 && c
<= 0x0039) {
1189 return (int8_t)(c
- 0x0030);
1191 if (c
>= 0x0041 && c
<= 0x0046) {
1192 return (int8_t)(c
- (0x0041 - 10));
1194 if (c
>= 0x0061 && c
<= 0x0066) {
1195 return (int8_t)(c
- (0x0061 - 10));
1200 /* Parse a single escape sequence. Although this method deals in
1201 * UChars, it does not use C++ or UnicodeString. This allows it to
1202 * be used from C contexts. */
1203 U_CAPI UChar32 U_EXPORT2
1204 u_unescapeAt(UNESCAPE_CHAR_AT charAt
,
1209 int32_t start
= *offset
;
1215 int8_t bitsPerDigit
= 4;
1218 UBool braces
= FALSE
;
1220 /* Check that offset is in range */
1221 if (*offset
< 0 || *offset
>= length
) {
1225 /* Fetch first UChar after '\\' */
1226 c
= charAt((*offset
)++, context
);
1228 /* Convert hexadecimal and octal escapes */
1230 case 0x0075 /*'u'*/:
1231 minDig
= maxDig
= 4;
1233 case 0x0055 /*'U'*/:
1234 minDig
= maxDig
= 8;
1236 case 0x0078 /*'x'*/:
1238 if (*offset
< length
&& charAt(*offset
, context
) == 0x7B /*{*/) {
1251 n
= 1; /* Already have first octal digit */
1258 while (*offset
< length
&& n
< maxDig
) {
1259 c
= charAt(*offset
, context
);
1260 dig
= (int8_t)((bitsPerDigit
== 3) ? _digit8(c
) : _digit16(c
));
1264 result
= (result
<< bitsPerDigit
) | dig
;
1272 if (c
!= 0x7D /*}*/) {
1277 if (result
< 0 || result
>= 0x110000) {
1280 /* If an escape sequence specifies a lead surrogate, see if
1281 * there is a trail surrogate after it, either as an escape or
1282 * as a literal. If so, join them up into a supplementary.
1284 if (*offset
< length
&& U16_IS_LEAD(result
)) {
1285 int32_t ahead
= *offset
+ 1;
1286 c
= charAt(*offset
, context
);
1287 if (c
== 0x5C /*'\\'*/ && ahead
< length
) {
1288 c
= (UChar
) u_unescapeAt(charAt
, &ahead
, length
, context
);
1290 if (U16_IS_TRAIL(c
)) {
1292 result
= U16_GET_SUPPLEMENTARY(result
, c
);
1298 /* Convert C-style escapes in table */
1299 for (i
=0; i
<UNESCAPE_MAP_LENGTH
; i
+=2) {
1300 if (c
== UNESCAPE_MAP
[i
]) {
1301 return UNESCAPE_MAP
[i
+1];
1302 } else if (c
< UNESCAPE_MAP
[i
]) {
1307 /* Map \cX to control-X: X & 0x1F */
1308 if (c
== 0x0063 /*'c'*/ && *offset
< length
) {
1309 c
= charAt((*offset
)++, context
);
1310 if (U16_IS_LEAD(c
) && *offset
< length
) {
1311 UChar c2
= charAt(*offset
, context
);
1312 if (U16_IS_TRAIL(c2
)) {
1314 c
= (UChar
) U16_GET_SUPPLEMENTARY(c
, c2
); /* [sic] */
1320 /* If no special forms are recognized, then consider
1321 * the backslash to generically escape the next character.
1322 * Deal with surrogate pairs. */
1323 if (U16_IS_LEAD(c
) && *offset
< length
) {
1324 UChar c2
= charAt(*offset
, context
);
1325 if (U16_IS_TRAIL(c2
)) {
1327 return U16_GET_SUPPLEMENTARY(c
, c2
);
1333 /* Invalid escape sequence */
1334 *offset
= start
; /* Reset to initial value */
1335 return (UChar32
)0xFFFFFFFF;
1338 /* u_unescapeAt() callback to return a UChar from a char* */
1339 static UChar U_CALLCONV
1340 _charPtr_charAt(int32_t offset
, void *context
) {
1342 /* It would be more efficient to access the invariant tables
1343 * directly but there is no API for that. */
1344 u_charsToUChars(((char*) context
) + offset
, &c16
, 1);
1348 /* Append an escape-free segment of the text; used by u_unescape() */
1349 static void _appendUChars(UChar
*dest
, int32_t destCapacity
,
1350 const char *src
, int32_t srcLen
) {
1351 if (destCapacity
< 0) {
1354 if (srcLen
> destCapacity
) {
1355 srcLen
= destCapacity
;
1357 u_charsToUChars(src
, dest
, srcLen
);
1360 /* Do an invariant conversion of char* -> UChar*, with escape parsing */
1361 U_CAPI
int32_t U_EXPORT2
1362 u_unescape(const char *src
, UChar
*dest
, int32_t destCapacity
) {
1363 const char *segment
= src
;
1367 while ((c
=*src
) != 0) {
1368 /* '\\' intentionally written as compiler-specific
1369 * character constant to correspond to compiler-specific
1370 * char* constants. */
1372 int32_t lenParsed
= 0;
1374 if (src
!= segment
) {
1376 _appendUChars(dest
+ i
, destCapacity
- i
,
1377 segment
, (int32_t)(src
- segment
));
1379 i
+= (int32_t)(src
- segment
);
1381 ++src
; /* advance past '\\' */
1382 c32
= (UChar32
)u_unescapeAt(_charPtr_charAt
, &lenParsed
, (int32_t)uprv_strlen(src
), (void*)src
);
1383 if (lenParsed
== 0) {
1386 src
+= lenParsed
; /* advance past escape seq. */
1387 if (dest
!= NULL
&& U16_LENGTH(c32
) <= (destCapacity
- i
)) {
1388 U16_APPEND_UNSAFE(dest
, i
, c32
);
1390 i
+= U16_LENGTH(c32
);
1397 if (src
!= segment
) {
1399 _appendUChars(dest
+ i
, destCapacity
- i
,
1400 segment
, (int32_t)(src
- segment
));
1402 i
+= (int32_t)(src
- segment
);
1404 if (dest
!= NULL
&& i
< destCapacity
) {
1410 if (dest
!= NULL
&& destCapacity
> 0) {
1416 /* NUL-termination of strings ----------------------------------------------- */
1419 * NUL-terminate a string no matter what its type.
1420 * Set warning and error codes accordingly.
1422 #define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) \
1423 if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) { \
1424 /* not a public function, so no complete argument checking */ \
1427 /* assume that the caller handles this */ \
1428 } else if(length<destCapacity) { \
1429 /* NUL-terminate the string, the NUL fits */ \
1431 /* unset the not-terminated warning but leave all others */ \
1432 if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) { \
1433 *pErrorCode=U_ZERO_ERROR; \
1435 } else if(length==destCapacity) { \
1436 /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
1437 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING; \
1438 } else /* length>destCapacity */ { \
1439 /* even the string itself did not fit - set an error code */ \
1440 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; \
1444 U_CAPI
int32_t U_EXPORT2
1445 u_terminateUChars(UChar
*dest
, int32_t destCapacity
, int32_t length
, UErrorCode
*pErrorCode
) {
1446 __TERMINATE_STRING(dest
, destCapacity
, length
, pErrorCode
);
1450 U_CAPI
int32_t U_EXPORT2
1451 u_terminateChars(char *dest
, int32_t destCapacity
, int32_t length
, UErrorCode
*pErrorCode
) {
1452 __TERMINATE_STRING(dest
, destCapacity
, length
, pErrorCode
);
1456 U_CAPI
int32_t U_EXPORT2
1457 u_terminateUChar32s(UChar32
*dest
, int32_t destCapacity
, int32_t length
, UErrorCode
*pErrorCode
) {
1458 __TERMINATE_STRING(dest
, destCapacity
, length
, pErrorCode
);
1462 U_CAPI
int32_t U_EXPORT2
1463 u_terminateWChars(wchar_t *dest
, int32_t destCapacity
, int32_t length
, UErrorCode
*pErrorCode
) {
1464 __TERMINATE_STRING(dest
, destCapacity
, length
, pErrorCode
);
1468 // Compute the hash code for a string -------------------------------------- ***
1470 // Moved here from uhash.c so that UnicodeString::hashCode() does not depend
1471 // on UHashtable code.
1474 Compute the hash by iterating sparsely over about 32 (up to 63)
1475 characters spaced evenly through the string. For each character,
1476 multiply the previous hash value by a prime number and add the new
1477 character in, like a linear congruential random number generator,
1478 producing a pseudorandom deterministic value well distributed over
1479 the output range. [LIU]
1482 #define STRING_HASH(TYPE, STR, STRLEN, DEREF) \
1484 const TYPE *p = (const TYPE*) STR; \
1486 int32_t len = (int32_t)(STRLEN); \
1487 int32_t inc = ((len - 32) / 32) + 1; \
1488 const TYPE *limit = p + len; \
1490 hash = (hash * 37) + DEREF; \
1496 /* Used by UnicodeString to compute its hashcode - Not public API. */
1497 U_CAPI
int32_t U_EXPORT2
1498 ustr_hashUCharsN(const UChar
*str
, int32_t length
) {
1499 STRING_HASH(UChar
, str
, length
, *p
);
1502 U_CAPI
int32_t U_EXPORT2
1503 ustr_hashCharsN(const char *str
, int32_t length
) {
1504 STRING_HASH(uint8_t, str
, length
, *p
);
1507 U_CAPI
int32_t U_EXPORT2
1508 ustr_hashICharsN(const char *str
, int32_t length
) {
1509 STRING_HASH(char, str
, length
, (uint8_t)uprv_tolower(*p
));