]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/ustring.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ******************************************************************************
6 * Copyright (C) 1998-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 ******************************************************************************
13 * Modification History:
15 * Date Name Description
16 * 12/07/98 bertrand Creation.
17 ******************************************************************************
20 #include "unicode/utypes.h"
21 #include "unicode/putil.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ustring.h"
24 #include "unicode/utf16.h"
30 /* ANSI string.h - style functions ------------------------------------------ */
32 /* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */
33 #define U_BMP_MAX 0xffff
35 /* Forward binary string search functions ----------------------------------- */
38 * Test if a substring match inside a string is at code point boundaries.
39 * All pointers refer to the same buffer.
40 * The limit pointer may be NULL, all others must be real pointers.
43 isMatchAtCPBoundary(const UChar
*start
, const UChar
*match
, const UChar
*matchLimit
, const UChar
*limit
) {
44 if(U16_IS_TRAIL(*match
) && start
!=match
&& U16_IS_LEAD(*(match
-1))) {
45 /* the leading edge of the match is in the middle of a surrogate pair */
48 if(U16_IS_LEAD(*(matchLimit
-1)) && match
!=limit
&& U16_IS_TRAIL(*matchLimit
)) {
49 /* the trailing edge of the match is in the middle of a surrogate pair */
55 U_CAPI UChar
* U_EXPORT2
56 u_strFindFirst(const UChar
*s
, int32_t length
,
57 const UChar
*sub
, int32_t subLength
) {
58 const UChar
*start
, *p
, *q
, *subLimit
;
61 if(sub
==NULL
|| subLength
<-1) {
64 if(s
==NULL
|| length
<-1) {
70 if(length
<0 && subLength
<0) {
71 /* both strings are NUL-terminated */
75 if(*sub
==0 && !U16_IS_SURROGATE(cs
)) {
76 /* the substring consists of a single, non-surrogate BMP code point */
77 return u_strchr(s
, cs
);
82 /* found first substring UChar, compare rest */
87 if(isMatchAtCPBoundary(start
, s
-1, p
, NULL
)) {
88 return (UChar
*)(s
-1); /* well-formed match */
90 break; /* no match because surrogate pair is split */
94 return NULL
; /* no match, and none possible after s */
110 subLength
=u_strlen(sub
);
116 /* get sub[0] to search for it fast */
119 subLimit
=sub
+subLength
;
121 if(subLength
==0 && !U16_IS_SURROGATE(cs
)) {
122 /* the substring consists of a single, non-surrogate BMP code point */
123 return length
<0 ? u_strchr(s
, cs
) : u_memchr(s
, cs
, length
);
127 /* s is NUL-terminated */
130 /* found first substring UChar, compare rest */
135 if(isMatchAtCPBoundary(start
, s
-1, p
, NULL
)) {
136 return (UChar
*)(s
-1); /* well-formed match */
138 break; /* no match because surrogate pair is split */
142 return NULL
; /* no match, and none possible after s */
145 break; /* no match */
153 const UChar
*limit
, *preLimit
;
155 /* subLength was decremented above */
156 if(length
<=subLength
) {
157 return NULL
; /* s is shorter than sub */
162 /* the substring must start before preLimit */
163 preLimit
=limit
-subLength
;
168 /* found first substring UChar, compare rest */
173 if(isMatchAtCPBoundary(start
, s
-1, p
, limit
)) {
174 return (UChar
*)(s
-1); /* well-formed match */
176 break; /* no match because surrogate pair is split */
180 break; /* no match */
193 U_CAPI UChar
* U_EXPORT2
194 u_strstr(const UChar
*s
, const UChar
*substring
) {
195 return u_strFindFirst(s
, -1, substring
, -1);
198 U_CAPI UChar
* U_EXPORT2
199 u_strchr(const UChar
*s
, UChar c
) {
200 if(U16_IS_SURROGATE(c
)) {
201 /* make sure to not find half of a surrogate pair */
202 return u_strFindFirst(s
, -1, &c
, 1);
206 /* trivial search for a BMP code point */
219 U_CAPI UChar
* U_EXPORT2
220 u_strchr32(const UChar
*s
, UChar32 c
) {
221 if((uint32_t)c
<=U_BMP_MAX
) {
222 /* find BMP code point */
223 return u_strchr(s
, (UChar
)c
);
224 } else if((uint32_t)c
<=UCHAR_MAX_VALUE
) {
225 /* find supplementary code point as surrogate pair */
226 UChar cs
, lead
=U16_LEAD(c
), trail
=U16_TRAIL(c
);
228 while((cs
=*s
++)!=0) {
229 if(cs
==lead
&& *s
==trail
) {
230 return (UChar
*)(s
-1);
235 /* not a Unicode code point, not findable */
240 U_CAPI UChar
* U_EXPORT2
241 u_memchr(const UChar
*s
, UChar c
, int32_t count
) {
243 return NULL
; /* no string */
244 } else if(U16_IS_SURROGATE(c
)) {
245 /* make sure to not find half of a surrogate pair */
246 return u_strFindFirst(s
, count
, &c
, 1);
248 /* trivial search for a BMP code point */
249 const UChar
*limit
=s
+count
;
259 U_CAPI UChar
* U_EXPORT2
260 u_memchr32(const UChar
*s
, UChar32 c
, int32_t count
) {
261 if((uint32_t)c
<=U_BMP_MAX
) {
262 /* find BMP code point */
263 return u_memchr(s
, (UChar
)c
, count
);
265 /* too short for a surrogate pair */
267 } else if((uint32_t)c
<=UCHAR_MAX_VALUE
) {
268 /* find supplementary code point as surrogate pair */
269 const UChar
*limit
=s
+count
-1; /* -1 so that we do not need a separate check for the trail unit */
270 UChar lead
=U16_LEAD(c
), trail
=U16_TRAIL(c
);
273 if(*s
==lead
&& *(s
+1)==trail
) {
279 /* not a Unicode code point, not findable */
284 /* Backward binary string search functions ---------------------------------- */
286 U_CAPI UChar
* U_EXPORT2
287 u_strFindLast(const UChar
*s
, int32_t length
,
288 const UChar
*sub
, int32_t subLength
) {
289 const UChar
*start
, *limit
, *p
, *q
, *subLimit
;
292 if(sub
==NULL
|| subLength
<-1) {
295 if(s
==NULL
|| length
<-1) {
300 * This implementation is more lazy than the one for u_strFindFirst():
301 * There is no special search code for NUL-terminated strings.
302 * It does not seem to be worth it for searching substrings to
303 * search forward and find all matches like in u_strrchr() and similar.
304 * Therefore, we simply get both string lengths and search backward.
310 subLength
=u_strlen(sub
);
316 /* get sub[subLength-1] to search for it fast */
317 subLimit
=sub
+subLength
;
321 if(subLength
==0 && !U16_IS_SURROGATE(cs
)) {
322 /* the substring consists of a single, non-surrogate BMP code point */
323 return length
<0 ? u_strrchr(s
, cs
) : u_memrchr(s
, cs
, length
);
330 /* subLength was decremented above */
331 if(length
<=subLength
) {
332 return NULL
; /* s is shorter than sub */
338 /* the substring must start no later than s+subLength */
344 /* found last substring UChar, compare rest */
349 if(isMatchAtCPBoundary(start
, p
, limit
+1, start
+length
)) {
350 return (UChar
*)p
; /* well-formed match */
352 break; /* no match because surrogate pair is split */
356 break; /* no match */
366 U_CAPI UChar
* U_EXPORT2
367 u_strrstr(const UChar
*s
, const UChar
*substring
) {
368 return u_strFindLast(s
, -1, substring
, -1);
371 U_CAPI UChar
* U_EXPORT2
372 u_strrchr(const UChar
*s
, UChar c
) {
373 if(U16_IS_SURROGATE(c
)) {
374 /* make sure to not find half of a surrogate pair */
375 return u_strFindLast(s
, -1, &c
, 1);
377 const UChar
*result
=NULL
;
380 /* trivial search for a BMP code point */
386 return (UChar
*)result
;
393 U_CAPI UChar
* U_EXPORT2
394 u_strrchr32(const UChar
*s
, UChar32 c
) {
395 if((uint32_t)c
<=U_BMP_MAX
) {
396 /* find BMP code point */
397 return u_strrchr(s
, (UChar
)c
);
398 } else if((uint32_t)c
<=UCHAR_MAX_VALUE
) {
399 /* find supplementary code point as surrogate pair */
400 const UChar
*result
=NULL
;
401 UChar cs
, lead
=U16_LEAD(c
), trail
=U16_TRAIL(c
);
403 while((cs
=*s
++)!=0) {
404 if(cs
==lead
&& *s
==trail
) {
408 return (UChar
*)result
;
410 /* not a Unicode code point, not findable */
415 U_CAPI UChar
* U_EXPORT2
416 u_memrchr(const UChar
*s
, UChar c
, int32_t count
) {
418 return NULL
; /* no string */
419 } else if(U16_IS_SURROGATE(c
)) {
420 /* make sure to not find half of a surrogate pair */
421 return u_strFindLast(s
, count
, &c
, 1);
423 /* trivial search for a BMP code point */
424 const UChar
*limit
=s
+count
;
427 return (UChar
*)limit
;
434 U_CAPI UChar
* U_EXPORT2
435 u_memrchr32(const UChar
*s
, UChar32 c
, int32_t count
) {
436 if((uint32_t)c
<=U_BMP_MAX
) {
437 /* find BMP code point */
438 return u_memrchr(s
, (UChar
)c
, count
);
440 /* too short for a surrogate pair */
442 } else if((uint32_t)c
<=UCHAR_MAX_VALUE
) {
443 /* find supplementary code point as surrogate pair */
444 const UChar
*limit
=s
+count
-1;
445 UChar lead
=U16_LEAD(c
), trail
=U16_TRAIL(c
);
448 if(*limit
==trail
&& *(limit
-1)==lead
) {
449 return (UChar
*)(limit
-1);
454 /* not a Unicode code point, not findable */
459 /* Tokenization functions --------------------------------------------------- */
462 * Match each code point in a string against each code point in the matchSet.
463 * Return the index of the first string code point that
464 * is (polarity==TRUE) or is not (FALSE) contained in the matchSet.
465 * Return -(string length)-1 if there is no such code point.
468 _matchFromSet(const UChar
*string
, const UChar
*matchSet
, UBool polarity
) {
469 int32_t matchLen
, matchBMPLen
, strItr
, matchItr
;
470 UChar32 stringCh
, matchCh
;
473 /* first part of matchSet contains only BMP code points */
475 while((c
= matchSet
[matchBMPLen
]) != 0 && U16_IS_SINGLE(c
)) {
479 /* second part of matchSet contains BMP and supplementary code points */
480 matchLen
= matchBMPLen
;
481 while(matchSet
[matchLen
] != 0) {
485 for(strItr
= 0; (c
= string
[strItr
]) != 0;) {
487 if(U16_IS_SINGLE(c
)) {
489 for(matchItr
= 0; matchItr
< matchLen
; ++matchItr
) {
490 if(c
== matchSet
[matchItr
]) {
491 return strItr
- 1; /* one matches */
495 for(matchItr
= 0; matchItr
< matchLen
; ++matchItr
) {
496 if(c
== matchSet
[matchItr
]) {
500 return strItr
- 1; /* none matches */
504 * No need to check for string length before U16_IS_TRAIL
505 * because c2 could at worst be the terminating NUL.
507 if(U16_IS_SURROGATE_LEAD(c
) && U16_IS_TRAIL(c2
= string
[strItr
])) {
509 stringCh
= U16_GET_SUPPLEMENTARY(c
, c2
);
511 stringCh
= c
; /* unpaired trail surrogate */
515 for(matchItr
= matchBMPLen
; matchItr
< matchLen
;) {
516 U16_NEXT(matchSet
, matchItr
, matchLen
, matchCh
);
517 if(stringCh
== matchCh
) {
518 return strItr
- U16_LENGTH(stringCh
); /* one matches */
522 for(matchItr
= matchBMPLen
; matchItr
< matchLen
;) {
523 U16_NEXT(matchSet
, matchItr
, matchLen
, matchCh
);
524 if(stringCh
== matchCh
) {
528 return strItr
- U16_LENGTH(stringCh
); /* none matches */
532 /* wish C had continue with labels like Java... */;
535 /* Didn't find it. */
539 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
540 U_CAPI UChar
* U_EXPORT2
541 u_strpbrk(const UChar
*string
, const UChar
*matchSet
)
543 int32_t idx
= _matchFromSet(string
, matchSet
, TRUE
);
545 return (UChar
*)string
+ idx
;
551 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
552 U_CAPI
int32_t U_EXPORT2
553 u_strcspn(const UChar
*string
, const UChar
*matchSet
)
555 int32_t idx
= _matchFromSet(string
, matchSet
, TRUE
);
559 return -idx
- 1; /* == u_strlen(string) */
563 /* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
564 U_CAPI
int32_t U_EXPORT2
565 u_strspn(const UChar
*string
, const UChar
*matchSet
)
567 int32_t idx
= _matchFromSet(string
, matchSet
, FALSE
);
571 return -idx
- 1; /* == u_strlen(string) */
575 /* ----- Text manipulation functions --- */
577 U_CAPI UChar
* U_EXPORT2
578 u_strtok_r(UChar
*src
,
584 uint32_t nonDelimIdx
;
586 /* If saveState is NULL, the user messed up. */
589 *saveState
= src
; /* Set to "src" in case there are no delimiters */
591 else if (*saveState
) {
592 tokSource
= *saveState
;
595 /* src == NULL && *saveState == NULL */
596 /* This shouldn't happen. We already finished tokenizing. */
600 /* Skip initial delimiters */
601 nonDelimIdx
= u_strspn(tokSource
, delim
);
602 tokSource
= &tokSource
[nonDelimIdx
];
605 nextToken
= u_strpbrk(tokSource
, delim
);
606 if (nextToken
!= NULL
) {
609 *saveState
= nextToken
;
612 else if (*saveState
) {
613 /* Return the last token */
619 /* No tokens were found. Only delimiters were left. */
625 /* Miscellaneous functions -------------------------------------------------- */
627 U_CAPI UChar
* U_EXPORT2
631 UChar
*anchor
= dst
; /* save a pointer to start of dst */
633 while(*dst
!= 0) { /* To end of first string */
636 while((*(dst
++) = *(src
++)) != 0) { /* copy string 2 over */
642 U_CAPI UChar
* U_EXPORT2
643 u_strncat(UChar
*dst
,
648 UChar
*anchor
= dst
; /* save a pointer to start of dst */
650 while(*dst
!= 0) { /* To end of first string */
653 while((*dst
= *src
) != 0) { /* copy string 2 over */
668 /* ----- Text property functions --- */
670 U_CAPI
int32_t U_EXPORT2
671 u_strcmp(const UChar
*s1
,
679 if (c1
!= c2
|| c1
== 0) {
683 return (int32_t)c1
- (int32_t)c2
;
686 U_CFUNC
int32_t U_EXPORT2
687 uprv_strCompare(const UChar
*s1
, int32_t length1
,
688 const UChar
*s2
, int32_t length2
,
689 UBool strncmpStyle
, UBool codePointOrder
) {
690 const UChar
*start1
, *start2
, *limit1
, *limit2
;
693 /* setup for fix-up */
697 /* compare identical prefixes - they do not need to be fixed up */
698 if(length1
<0 && length2
<0) {
699 /* strcmp style, both NUL-terminated */
717 /* setup for fix-up */
719 } else if(strncmpStyle
) {
720 /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
725 limit1
=start1
+length1
;
728 /* both lengths are same, check only one limit */
745 /* setup for fix-up */
746 limit2
=start2
+length1
; /* use length1 here, too, to enforce assumption */
748 /* memcmp/UnicodeString style, both length-specified */
749 int32_t lengthResult
;
752 length1
=u_strlen(s1
);
755 length2
=u_strlen(s2
);
758 /* limit1=start1+min(length1, length2) */
759 if(length1
<length2
) {
761 limit1
=start1
+length1
;
762 } else if(length1
==length2
) {
764 limit1
=start1
+length1
;
765 } else /* length1>length2 */ {
767 limit1
=start1
+length2
;
775 /* check pseudo-limit */
789 /* setup for fix-up */
790 limit1
=start1
+length1
;
791 limit2
=start2
+length2
;
794 /* if both values are in or above the surrogate range, fix them up */
795 if(c1
>=0xd800 && c2
>=0xd800 && codePointOrder
) {
796 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
798 (c1
<=0xdbff && (s1
+1)!=limit1
&& U16_IS_TRAIL(*(s1
+1))) ||
799 (U16_IS_TRAIL(c1
) && start1
!=s1
&& U16_IS_LEAD(*(s1
-1)))
801 /* part of a surrogate pair, leave >=d800 */
803 /* BMP code point - may be surrogate code point - make <d800 */
808 (c2
<=0xdbff && (s2
+1)!=limit2
&& U16_IS_TRAIL(*(s2
+1))) ||
809 (U16_IS_TRAIL(c2
) && start2
!=s2
&& U16_IS_LEAD(*(s2
-1)))
811 /* part of a surrogate pair, leave >=d800 */
813 /* BMP code point - may be surrogate code point - make <d800 */
818 /* now c1 and c2 are in the requested (code unit or code point) order */
819 return (int32_t)c1
-(int32_t)c2
;
823 * Compare two strings as presented by UCharIterators.
824 * Use code unit or code point order.
825 * When the function returns, it is undefined where the iterators
828 U_CAPI
int32_t U_EXPORT2
829 u_strCompareIter(UCharIterator
*iter1
, UCharIterator
*iter2
, UBool codePointOrder
) {
832 /* argument checking */
833 if(iter1
==NULL
|| iter2
==NULL
) {
834 return 0; /* bad arguments */
837 return 0; /* identical iterators */
840 /* reset iterators to start? */
841 iter1
->move(iter1
, 0, UITER_START
);
842 iter2
->move(iter2
, 0, UITER_START
);
844 /* compare identical prefixes - they do not need to be fixed up */
846 c1
=iter1
->next(iter1
);
847 c2
=iter2
->next(iter2
);
856 /* if both values are in or above the surrogate range, fix them up */
857 if(c1
>=0xd800 && c2
>=0xd800 && codePointOrder
) {
858 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
860 (c1
<=0xdbff && U16_IS_TRAIL(iter1
->current(iter1
))) ||
861 (U16_IS_TRAIL(c1
) && (iter1
->previous(iter1
), U16_IS_LEAD(iter1
->previous(iter1
))))
863 /* part of a surrogate pair, leave >=d800 */
865 /* BMP code point - may be surrogate code point - make <d800 */
870 (c2
<=0xdbff && U16_IS_TRAIL(iter2
->current(iter2
))) ||
871 (U16_IS_TRAIL(c2
) && (iter2
->previous(iter2
), U16_IS_LEAD(iter2
->previous(iter2
))))
873 /* part of a surrogate pair, leave >=d800 */
875 /* BMP code point - may be surrogate code point - make <d800 */
880 /* now c1 and c2 are in the requested (code unit or code point) order */
881 return (int32_t)c1
-(int32_t)c2
;
886 * u_strCompareIter() does not leave the iterators _on_ the different units.
887 * This is possible but would cost a few extra indirect function calls to back
888 * up if the last unit (c1 or c2 respectively) was >=0.
890 * Consistently leaving them _behind_ the different units is not an option
891 * because the current "unit" is the end of the string if that is reached,
892 * and in such a case the iterator does not move.
893 * For example, when comparing "ab" with "abc", both iterators rest _on_ the end
894 * of their strings. Calling previous() on each does not move them to where
895 * the comparison fails.
897 * So the simplest semantics is to not define where the iterators end up.
899 * The following fragment is part of what would need to be done for backing up.
902 /* iff a surrogate is part of a surrogate pair, leave >=d800 */
904 if(!U16_IS_TRAIL(iter1
->current(iter1
))) {
905 /* lead surrogate code point - make <d800 */
908 } else if(c1
<=0xdfff) {
909 int32_t idx
=iter1
->getIndex(iter1
, UITER_CURRENT
);
910 iter1
->previous(iter1
); /* ==c1 */
911 if(!U16_IS_LEAD(iter1
->previous(iter1
))) {
912 /* trail surrogate code point - make <d800 */
915 /* go back to behind where the difference is */
916 iter1
->move(iter1
, idx
, UITER_ZERO
);
917 } else /* 0xe000<=c1<=0xffff */ {
918 /* BMP code point - make <d800 */
924 U_CAPI
int32_t U_EXPORT2
925 u_strCompare(const UChar
*s1
, int32_t length1
,
926 const UChar
*s2
, int32_t length2
,
927 UBool codePointOrder
) {
928 /* argument checking */
929 if(s1
==NULL
|| length1
<-1 || s2
==NULL
|| length2
<-1) {
932 return uprv_strCompare(s1
, length1
, s2
, length2
, FALSE
, codePointOrder
);
935 /* String compare in code point order - u_strcmp() compares in code unit order. */
936 U_CAPI
int32_t U_EXPORT2
937 u_strcmpCodePointOrder(const UChar
*s1
, const UChar
*s2
) {
938 return uprv_strCompare(s1
, -1, s2
, -1, FALSE
, TRUE
);
941 U_CAPI
int32_t U_EXPORT2
942 u_strncmp(const UChar
*s1
,
949 rc
= (int32_t)*s1
- (int32_t)*s2
;
950 if(rc
!= 0 || *s1
== 0 || --n
== 0) {
961 U_CAPI
int32_t U_EXPORT2
962 u_strncmpCodePointOrder(const UChar
*s1
, const UChar
*s2
, int32_t n
) {
963 return uprv_strCompare(s1
, n
, s2
, n
, TRUE
, TRUE
);
966 U_CAPI UChar
* U_EXPORT2
970 UChar
*anchor
= dst
; /* save a pointer to start of dst */
972 while((*(dst
++) = *(src
++)) != 0) { /* copy string 2 over */
978 U_CAPI UChar
* U_EXPORT2
979 u_strncpy(UChar
*dst
,
983 UChar
*anchor
= dst
; /* save a pointer to start of dst */
985 /* copy string 2 over */
986 while(n
> 0 && (*(dst
++) = *(src
++)) != 0) {
993 U_CAPI
int32_t U_EXPORT2
994 u_strlen(const UChar
*s
)
996 #if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
997 return (int32_t)uprv_wcslen((const wchar_t *)s
);
1007 U_CAPI
int32_t U_EXPORT2
1008 u_countChar32(const UChar
*s
, int32_t length
) {
1011 if(s
==NULL
|| length
<-1) {
1019 if(U16_IS_LEAD(*s
) && length
>=2 && U16_IS_TRAIL(*(s
+1))) {
1027 } else /* length==-1 */ {
1037 * sufficient to look ahead one because of UTF-16;
1038 * safe to look ahead one because at worst that would be the terminating NUL
1040 if(U16_IS_LEAD(c
) && U16_IS_TRAIL(*s
)) {
1048 U_CAPI UBool U_EXPORT2
1049 u_strHasMoreChar32Than(const UChar
*s
, int32_t length
, int32_t number
) {
1054 if(s
==NULL
|| length
<-1) {
1059 /* s is NUL-terminated */
1062 /* count code points until they exceed */
1070 if(U16_IS_LEAD(c
) && U16_IS_TRAIL(*s
)) {
1076 /* length>=0 known */
1078 int32_t maxSupplementary
;
1080 /* s contains at least (length+1)/2 code points: <=2 UChars per cp */
1081 if(((length
+1)/2)>number
) {
1085 /* check if s does not even contain enough UChars */
1086 maxSupplementary
=length
-number
;
1087 if(maxSupplementary
<=0) {
1090 /* there are maxSupplementary=length-number more UChars than asked-for code points */
1093 * count code points until they exceed and also check that there are
1094 * no more than maxSupplementary supplementary code points (UChar pairs)
1104 if(U16_IS_LEAD(*s
++) && s
!=limit
&& U16_IS_TRAIL(*s
)) {
1106 if(--maxSupplementary
<=0) {
1107 /* too many pairs - too few code points */
1116 U_CAPI UChar
* U_EXPORT2
1117 u_memcpy(UChar
*dest
, const UChar
*src
, int32_t count
) {
1119 uprv_memcpy(dest
, src
, (size_t)count
*U_SIZEOF_UCHAR
);
1124 U_CAPI UChar
* U_EXPORT2
1125 u_memmove(UChar
*dest
, const UChar
*src
, int32_t count
) {
1127 uprv_memmove(dest
, src
, (size_t)count
*U_SIZEOF_UCHAR
);
1132 U_CAPI UChar
* U_EXPORT2
1133 u_memset(UChar
*dest
, UChar c
, int32_t count
) {
1136 UChar
*limit
= dest
+ count
;
1138 while (ptr
< limit
) {
1145 U_CAPI
int32_t U_EXPORT2
1146 u_memcmp(const UChar
*buf1
, const UChar
*buf2
, int32_t count
) {
1148 const UChar
*limit
= buf1
+ count
;
1151 while (buf1
< limit
) {
1152 result
= (int32_t)(uint16_t)*buf1
- (int32_t)(uint16_t)*buf2
;
1163 U_CAPI
int32_t U_EXPORT2
1164 u_memcmpCodePointOrder(const UChar
*s1
, const UChar
*s2
, int32_t count
) {
1165 return uprv_strCompare(s1
, count
, s2
, count
, FALSE
, TRUE
);
1168 /* u_unescape & support fns ------------------------------------------------- */
1170 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
1171 static const UChar UNESCAPE_MAP
[] = {
1185 enum { UNESCAPE_MAP_LENGTH
= UPRV_LENGTHOF(UNESCAPE_MAP
) };
1187 /* Convert one octal digit to a numeric value 0..7, or -1 on failure */
1188 static int8_t _digit8(UChar c
) {
1189 if (c
>= 0x0030 && c
<= 0x0037) {
1190 return (int8_t)(c
- 0x0030);
1195 /* Convert one hex digit to a numeric value 0..F, or -1 on failure */
1196 static int8_t _digit16(UChar c
) {
1197 if (c
>= 0x0030 && c
<= 0x0039) {
1198 return (int8_t)(c
- 0x0030);
1200 if (c
>= 0x0041 && c
<= 0x0046) {
1201 return (int8_t)(c
- (0x0041 - 10));
1203 if (c
>= 0x0061 && c
<= 0x0066) {
1204 return (int8_t)(c
- (0x0061 - 10));
1209 /* Parse a single escape sequence. Although this method deals in
1210 * UChars, it does not use C++ or UnicodeString. This allows it to
1211 * be used from C contexts. */
1212 U_CAPI UChar32 U_EXPORT2
1213 u_unescapeAt(UNESCAPE_CHAR_AT charAt
,
1218 int32_t start
= *offset
;
1224 int8_t bitsPerDigit
= 4;
1227 UBool braces
= FALSE
;
1229 /* Check that offset is in range */
1230 if (*offset
< 0 || *offset
>= length
) {
1234 /* Fetch first UChar after '\\' */
1235 c
= charAt((*offset
)++, context
);
1237 /* Convert hexadecimal and octal escapes */
1239 case 0x0075 /*'u'*/:
1240 minDig
= maxDig
= 4;
1242 case 0x0055 /*'U'*/:
1243 minDig
= maxDig
= 8;
1245 case 0x0078 /*'x'*/:
1247 if (*offset
< length
&& charAt(*offset
, context
) == 0x7B /*{*/) {
1260 n
= 1; /* Already have first octal digit */
1267 while (*offset
< length
&& n
< maxDig
) {
1268 c
= charAt(*offset
, context
);
1269 dig
= (int8_t)((bitsPerDigit
== 3) ? _digit8(c
) : _digit16(c
));
1273 result
= (result
<< bitsPerDigit
) | dig
;
1281 if (c
!= 0x7D /*}*/) {
1286 if (result
< 0 || result
>= 0x110000) {
1289 /* If an escape sequence specifies a lead surrogate, see if
1290 * there is a trail surrogate after it, either as an escape or
1291 * as a literal. If so, join them up into a supplementary.
1293 if (*offset
< length
&& U16_IS_LEAD(result
)) {
1294 int32_t ahead
= *offset
+ 1;
1295 c
= charAt(*offset
, context
);
1296 if (c
== 0x5C /*'\\'*/ && ahead
< length
) {
1297 c
= (UChar
) u_unescapeAt(charAt
, &ahead
, length
, context
);
1299 if (U16_IS_TRAIL(c
)) {
1301 result
= U16_GET_SUPPLEMENTARY(result
, c
);
1307 /* Convert C-style escapes in table */
1308 for (i
=0; i
<UNESCAPE_MAP_LENGTH
; i
+=2) {
1309 if (c
== UNESCAPE_MAP
[i
]) {
1310 return UNESCAPE_MAP
[i
+1];
1311 } else if (c
< UNESCAPE_MAP
[i
]) {
1316 /* Map \cX to control-X: X & 0x1F */
1317 if (c
== 0x0063 /*'c'*/ && *offset
< length
) {
1318 c
= charAt((*offset
)++, context
);
1319 if (U16_IS_LEAD(c
) && *offset
< length
) {
1320 UChar c2
= charAt(*offset
, context
);
1321 if (U16_IS_TRAIL(c2
)) {
1323 c
= (UChar
) U16_GET_SUPPLEMENTARY(c
, c2
); /* [sic] */
1329 /* If no special forms are recognized, then consider
1330 * the backslash to generically escape the next character.
1331 * Deal with surrogate pairs. */
1332 if (U16_IS_LEAD(c
) && *offset
< length
) {
1333 UChar c2
= charAt(*offset
, context
);
1334 if (U16_IS_TRAIL(c2
)) {
1336 return U16_GET_SUPPLEMENTARY(c
, c2
);
1342 /* Invalid escape sequence */
1343 *offset
= start
; /* Reset to initial value */
1344 return (UChar32
)0xFFFFFFFF;
1347 /* u_unescapeAt() callback to return a UChar from a char* */
1348 static UChar U_CALLCONV
1349 _charPtr_charAt(int32_t offset
, void *context
) {
1351 /* It would be more efficient to access the invariant tables
1352 * directly but there is no API for that. */
1353 u_charsToUChars(((char*) context
) + offset
, &c16
, 1);
1357 /* Append an escape-free segment of the text; used by u_unescape() */
1358 static void _appendUChars(UChar
*dest
, int32_t destCapacity
,
1359 const char *src
, int32_t srcLen
) {
1360 if (destCapacity
< 0) {
1363 if (srcLen
> destCapacity
) {
1364 srcLen
= destCapacity
;
1366 u_charsToUChars(src
, dest
, srcLen
);
1369 /* Do an invariant conversion of char* -> UChar*, with escape parsing */
1370 U_CAPI
int32_t U_EXPORT2
1371 u_unescape(const char *src
, UChar
*dest
, int32_t destCapacity
) {
1372 const char *segment
= src
;
1376 while ((c
=*src
) != 0) {
1377 /* '\\' intentionally written as compiler-specific
1378 * character constant to correspond to compiler-specific
1379 * char* constants. */
1381 int32_t lenParsed
= 0;
1383 if (src
!= segment
) {
1385 _appendUChars(dest
+ i
, destCapacity
- i
,
1386 segment
, (int32_t)(src
- segment
));
1388 i
+= (int32_t)(src
- segment
);
1390 ++src
; /* advance past '\\' */
1391 c32
= (UChar32
)u_unescapeAt(_charPtr_charAt
, &lenParsed
, (int32_t)uprv_strlen(src
), (void*)src
);
1392 if (lenParsed
== 0) {
1395 src
+= lenParsed
; /* advance past escape seq. */
1396 if (dest
!= NULL
&& U16_LENGTH(c32
) <= (destCapacity
- i
)) {
1397 U16_APPEND_UNSAFE(dest
, i
, c32
);
1399 i
+= U16_LENGTH(c32
);
1406 if (src
!= segment
) {
1408 _appendUChars(dest
+ i
, destCapacity
- i
,
1409 segment
, (int32_t)(src
- segment
));
1411 i
+= (int32_t)(src
- segment
);
1413 if (dest
!= NULL
&& i
< destCapacity
) {
1419 if (dest
!= NULL
&& destCapacity
> 0) {
1425 /* NUL-termination of strings ----------------------------------------------- */
1428 * NUL-terminate a string no matter what its type.
1429 * Set warning and error codes accordingly.
1431 #define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) \
1432 if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) { \
1433 /* not a public function, so no complete argument checking */ \
1436 /* assume that the caller handles this */ \
1437 } else if(length<destCapacity) { \
1438 /* NUL-terminate the string, the NUL fits */ \
1440 /* unset the not-terminated warning but leave all others */ \
1441 if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) { \
1442 *pErrorCode=U_ZERO_ERROR; \
1444 } else if(length==destCapacity) { \
1445 /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
1446 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING; \
1447 } else /* length>destCapacity */ { \
1448 /* even the string itself did not fit - set an error code */ \
1449 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; \
1453 U_CAPI
int32_t U_EXPORT2
1454 u_terminateUChars(UChar
*dest
, int32_t destCapacity
, int32_t length
, UErrorCode
*pErrorCode
) {
1455 __TERMINATE_STRING(dest
, destCapacity
, length
, pErrorCode
);
1459 U_CAPI
int32_t U_EXPORT2
1460 u_terminateChars(char *dest
, int32_t destCapacity
, int32_t length
, UErrorCode
*pErrorCode
) {
1461 __TERMINATE_STRING(dest
, destCapacity
, length
, pErrorCode
);
1465 U_CAPI
int32_t U_EXPORT2
1466 u_terminateUChar32s(UChar32
*dest
, int32_t destCapacity
, int32_t length
, UErrorCode
*pErrorCode
) {
1467 __TERMINATE_STRING(dest
, destCapacity
, length
, pErrorCode
);
1471 U_CAPI
int32_t U_EXPORT2
1472 u_terminateWChars(wchar_t *dest
, int32_t destCapacity
, int32_t length
, UErrorCode
*pErrorCode
) {
1473 __TERMINATE_STRING(dest
, destCapacity
, length
, pErrorCode
);
1477 // Compute the hash code for a string -------------------------------------- ***
1479 // Moved here from uhash.c so that UnicodeString::hashCode() does not depend
1480 // on UHashtable code.
1483 Compute the hash by iterating sparsely over about 32 (up to 63)
1484 characters spaced evenly through the string. For each character,
1485 multiply the previous hash value by a prime number and add the new
1486 character in, like a linear congruential random number generator,
1487 producing a pseudorandom deterministic value well distributed over
1488 the output range. [LIU]
1491 #define STRING_HASH(TYPE, STR, STRLEN, DEREF) \
1492 uint32_t hash = 0; \
1493 const TYPE *p = (const TYPE*) STR; \
1495 int32_t len = (int32_t)(STRLEN); \
1496 int32_t inc = ((len - 32) / 32) + 1; \
1497 const TYPE *limit = p + len; \
1499 hash = (hash * 37) + DEREF; \
1503 return static_cast<int32_t>(hash)
1505 /* Used by UnicodeString to compute its hashcode - Not public API. */
1506 U_CAPI
int32_t U_EXPORT2
1507 ustr_hashUCharsN(const UChar
*str
, int32_t length
) {
1508 STRING_HASH(UChar
, str
, length
, *p
);
1511 U_CAPI
int32_t U_EXPORT2
1512 ustr_hashCharsN(const char *str
, int32_t length
) {
1513 STRING_HASH(uint8_t, str
, length
, *p
);
1516 U_CAPI
int32_t U_EXPORT2
1517 ustr_hashICharsN(const char *str
, int32_t length
) {
1518 STRING_HASH(char, str
, length
, (uint8_t)uprv_tolower(*p
));