]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ****************************************************************************** | |
3 | * | |
4 | * Copyright (C) 1998-2003, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | * | |
7 | ****************************************************************************** | |
8 | * | |
9 | * File ustring.h | |
10 | * | |
11 | * Modification History: | |
12 | * | |
13 | * Date Name Description | |
14 | * 12/07/98 bertrand Creation. | |
15 | ****************************************************************************** | |
16 | */ | |
17 | ||
18 | #include "unicode/utypes.h" | |
19 | #include "unicode/uchar.h" | |
20 | #include "unicode/uiter.h" | |
21 | #include "unicode/ustring.h" | |
22 | #include "unicode/putil.h" | |
23 | #include "unicode/ucnv.h" | |
24 | #include "cstring.h" | |
25 | #include "cwchar.h" | |
26 | #include "cmemory.h" | |
27 | #include "umutex.h" | |
28 | #include "ustr_imp.h" | |
29 | ||
30 | /* forward declarations of definitions for the shared default converter */ | |
31 | ||
32 | static UConverter *gDefaultConverter = NULL; | |
33 | ||
34 | /* ANSI string.h - style functions ------------------------------------------ */ | |
35 | ||
36 | /* maximum string length for u_uastrcpy() and u_austrcpy() implementations */ | |
37 | #define MAX_STRLEN 0x0FFFFFFF | |
38 | ||
39 | /* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */ | |
40 | #define U_BMP_MAX 0xffff | |
41 | ||
42 | /* Forward binary string search functions ----------------------------------- */ | |
43 | ||
44 | /* | |
45 | * Test if a substring match inside a string is at code point boundaries. | |
46 | * All pointers refer to the same buffer. | |
47 | * The limit pointer may be NULL, all others must be real pointers. | |
48 | */ | |
49 | static U_INLINE UBool | |
50 | isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) { | |
51 | if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) { | |
52 | /* the leading edge of the match is in the middle of a surrogate pair */ | |
53 | return FALSE; | |
54 | } | |
55 | if(U16_IS_LEAD(*(matchLimit-1)) && match!=limit && U16_IS_TRAIL(*matchLimit)) { | |
56 | /* the trailing edge of the match is in the middle of a surrogate pair */ | |
57 | return FALSE; | |
58 | } | |
59 | return TRUE; | |
60 | } | |
61 | ||
62 | U_CAPI UChar * U_EXPORT2 | |
63 | u_strFindFirst(const UChar *s, int32_t length, | |
64 | const UChar *sub, int32_t subLength) { | |
65 | const UChar *start, *p, *q, *subLimit; | |
66 | UChar c, cs, cq; | |
67 | ||
68 | if(sub==NULL || subLength<-1) { | |
69 | return (UChar *)s; | |
70 | } | |
71 | if(s==NULL || length<-1) { | |
72 | return NULL; | |
73 | } | |
74 | ||
75 | start=s; | |
76 | ||
77 | if(length<0 && subLength<0) { | |
78 | /* both strings are NUL-terminated */ | |
79 | if((cs=*sub++)==0) { | |
80 | return (UChar *)s; | |
81 | } | |
82 | if(*sub==0 && !U16_IS_SURROGATE(cs)) { | |
83 | /* the substring consists of a single, non-surrogate BMP code point */ | |
84 | return u_strchr(s, cs); | |
85 | } | |
86 | ||
87 | while((c=*s++)!=0) { | |
88 | if(c==cs) { | |
89 | /* found first substring UChar, compare rest */ | |
90 | p=s; | |
91 | q=sub; | |
92 | for(;;) { | |
93 | if((cq=*q)==0) { | |
94 | if(isMatchAtCPBoundary(start, s-1, p, NULL)) { | |
95 | return (UChar *)(s-1); /* well-formed match */ | |
96 | } else { | |
97 | break; /* no match because surrogate pair is split */ | |
98 | } | |
99 | } | |
100 | if((c=*p)==0) { | |
101 | return NULL; /* no match, and none possible after s */ | |
102 | } | |
103 | if(c!=cq) { | |
104 | break; /* no match */ | |
105 | } | |
106 | ++p; | |
107 | ++q; | |
108 | } | |
109 | } | |
110 | } | |
111 | ||
112 | /* not found */ | |
113 | return NULL; | |
114 | } | |
115 | ||
116 | if(subLength<0) { | |
117 | subLength=u_strlen(sub); | |
118 | } | |
119 | if(subLength==0) { | |
120 | return (UChar *)s; | |
121 | } | |
122 | ||
123 | /* get sub[0] to search for it fast */ | |
124 | cs=*sub++; | |
125 | --subLength; | |
126 | subLimit=sub+subLength; | |
127 | ||
128 | if(subLength==0 && !U16_IS_SURROGATE(cs)) { | |
129 | /* the substring consists of a single, non-surrogate BMP code point */ | |
130 | return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length); | |
131 | } | |
132 | ||
133 | if(length<0) { | |
134 | /* s is NUL-terminated */ | |
135 | while((c=*s++)!=0) { | |
136 | if(c==cs) { | |
137 | /* found first substring UChar, compare rest */ | |
138 | p=s; | |
139 | q=sub; | |
140 | for(;;) { | |
141 | if(q==subLimit) { | |
142 | if(isMatchAtCPBoundary(start, s-1, p, NULL)) { | |
143 | return (UChar *)(s-1); /* well-formed match */ | |
144 | } else { | |
145 | break; /* no match because surrogate pair is split */ | |
146 | } | |
147 | } | |
148 | if((c=*p)==0) { | |
149 | return NULL; /* no match, and none possible after s */ | |
150 | } | |
151 | if(c!=*q) { | |
152 | break; /* no match */ | |
153 | } | |
154 | ++p; | |
155 | ++q; | |
156 | } | |
157 | } | |
158 | } | |
159 | } else { | |
160 | const UChar *limit, *preLimit; | |
161 | ||
162 | /* subLength was decremented above */ | |
163 | if(length<=subLength) { | |
164 | return NULL; /* s is shorter than sub */ | |
165 | } | |
166 | ||
167 | limit=s+length; | |
168 | ||
169 | /* the substring must start before preLimit */ | |
170 | preLimit=limit-subLength; | |
171 | ||
172 | while(s!=preLimit) { | |
173 | c=*s++; | |
174 | if(c==cs) { | |
175 | /* found first substring UChar, compare rest */ | |
176 | p=s; | |
177 | q=sub; | |
178 | for(;;) { | |
179 | if(q==subLimit) { | |
180 | if(isMatchAtCPBoundary(start, s-1, p, limit)) { | |
181 | return (UChar *)(s-1); /* well-formed match */ | |
182 | } else { | |
183 | break; /* no match because surrogate pair is split */ | |
184 | } | |
185 | } | |
186 | if(*p!=*q) { | |
187 | break; /* no match */ | |
188 | } | |
189 | ++p; | |
190 | ++q; | |
191 | } | |
192 | } | |
193 | } | |
194 | } | |
195 | ||
196 | /* not found */ | |
197 | return NULL; | |
198 | } | |
199 | ||
200 | U_CAPI UChar * U_EXPORT2 | |
201 | u_strstr(const UChar *s, const UChar *substring) { | |
202 | return u_strFindFirst(s, -1, substring, -1); | |
203 | } | |
204 | ||
205 | U_CAPI UChar * U_EXPORT2 | |
206 | u_strchr(const UChar *s, UChar c) { | |
207 | if(U16_IS_SURROGATE(c)) { | |
208 | /* make sure to not find half of a surrogate pair */ | |
209 | return u_strFindFirst(s, -1, &c, 1); | |
210 | } else { | |
211 | UChar cs; | |
212 | ||
213 | /* trivial search for a BMP code point */ | |
214 | for(;;) { | |
215 | if((cs=*s)==c) { | |
216 | return (UChar *)s; | |
217 | } | |
218 | if(cs==0) { | |
219 | return NULL; | |
220 | } | |
221 | ++s; | |
222 | } | |
223 | } | |
224 | } | |
225 | ||
226 | U_CAPI UChar * U_EXPORT2 | |
227 | u_strchr32(const UChar *s, UChar32 c) { | |
228 | if((uint32_t)c<=U_BMP_MAX) { | |
229 | /* find BMP code point */ | |
230 | return u_strchr(s, (UChar)c); | |
231 | } else if((uint32_t)c<=UCHAR_MAX_VALUE) { | |
232 | /* find supplementary code point as surrogate pair */ | |
233 | UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); | |
234 | ||
235 | while((cs=*s++)!=0) { | |
236 | if(cs==lead && *s==trail) { | |
237 | return (UChar *)(s-1); | |
238 | } | |
239 | } | |
240 | return NULL; | |
241 | } else { | |
242 | /* not a Unicode code point, not findable */ | |
243 | return NULL; | |
244 | } | |
245 | } | |
246 | ||
247 | U_CAPI UChar * U_EXPORT2 | |
248 | u_memchr(const UChar *s, UChar c, int32_t count) { | |
249 | if(count<=0) { | |
250 | return NULL; /* no string */ | |
251 | } else if(U16_IS_SURROGATE(c)) { | |
252 | /* make sure to not find half of a surrogate pair */ | |
253 | return u_strFindFirst(s, count, &c, 1); | |
254 | } else { | |
255 | /* trivial search for a BMP code point */ | |
256 | const UChar *limit=s+count; | |
257 | do { | |
258 | if(*s==c) { | |
259 | return (UChar *)s; | |
260 | } | |
261 | } while(++s!=limit); | |
262 | return NULL; | |
263 | } | |
264 | } | |
265 | ||
266 | U_CAPI UChar * U_EXPORT2 | |
267 | u_memchr32(const UChar *s, UChar32 c, int32_t count) { | |
268 | if((uint32_t)c<=U_BMP_MAX) { | |
269 | /* find BMP code point */ | |
270 | return u_memchr(s, (UChar)c, count); | |
271 | } else if(count<2) { | |
272 | /* too short for a surrogate pair */ | |
273 | return NULL; | |
274 | } else if((uint32_t)c<=UCHAR_MAX_VALUE) { | |
275 | /* find supplementary code point as surrogate pair */ | |
276 | const UChar *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */ | |
277 | UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); | |
278 | ||
279 | do { | |
280 | if(*s==lead && *(s+1)==trail) { | |
281 | return (UChar *)s; | |
282 | } | |
283 | } while(++s!=limit); | |
284 | return NULL; | |
285 | } else { | |
286 | /* not a Unicode code point, not findable */ | |
287 | return NULL; | |
288 | } | |
289 | } | |
290 | ||
291 | /* Backward binary string search functions ---------------------------------- */ | |
292 | ||
293 | U_CAPI UChar * U_EXPORT2 | |
294 | u_strFindLast(const UChar *s, int32_t length, | |
295 | const UChar *sub, int32_t subLength) { | |
296 | const UChar *start, *limit, *p, *q, *subLimit; | |
297 | UChar c, cs; | |
298 | ||
299 | if(sub==NULL || subLength<-1) { | |
300 | return (UChar *)s; | |
301 | } | |
302 | if(s==NULL || length<-1) { | |
303 | return NULL; | |
304 | } | |
305 | ||
306 | /* | |
307 | * This implementation is more lazy than the one for u_strFindFirst(): | |
308 | * There is no special search code for NUL-terminated strings. | |
309 | * It does not seem to be worth it for searching substrings to | |
310 | * search forward and find all matches like in u_strrchr() and similar. | |
311 | * Therefore, we simply get both string lengths and search backward. | |
312 | * | |
313 | * markus 2002oct23 | |
314 | */ | |
315 | ||
316 | if(subLength<0) { | |
317 | subLength=u_strlen(sub); | |
318 | } | |
319 | if(subLength==0) { | |
320 | return (UChar *)s; | |
321 | } | |
322 | ||
323 | /* get sub[subLength-1] to search for it fast */ | |
324 | subLimit=sub+subLength; | |
325 | cs=*(--subLimit); | |
326 | --subLength; | |
327 | ||
328 | if(subLength==0 && !U16_IS_SURROGATE(cs)) { | |
329 | /* the substring consists of a single, non-surrogate BMP code point */ | |
330 | return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length); | |
331 | } | |
332 | ||
333 | if(length<0) { | |
334 | length=u_strlen(s); | |
335 | } | |
336 | ||
337 | /* subLength was decremented above */ | |
338 | if(length<=subLength) { | |
339 | return NULL; /* s is shorter than sub */ | |
340 | } | |
341 | ||
342 | start=s; | |
343 | limit=s+length; | |
344 | ||
345 | /* the substring must start no later than s+subLength */ | |
346 | s+=subLength; | |
347 | ||
348 | while(s!=limit) { | |
349 | c=*(--limit); | |
350 | if(c==cs) { | |
351 | /* found last substring UChar, compare rest */ | |
352 | p=limit; | |
353 | q=subLimit; | |
354 | for(;;) { | |
355 | if(q==sub) { | |
356 | if(isMatchAtCPBoundary(start, p, limit+1, start+length)) { | |
357 | return (UChar *)p; /* well-formed match */ | |
358 | } else { | |
359 | break; /* no match because surrogate pair is split */ | |
360 | } | |
361 | } | |
362 | if(*(--p)!=*(--q)) { | |
363 | break; /* no match */ | |
364 | } | |
365 | } | |
366 | } | |
367 | } | |
368 | ||
369 | /* not found */ | |
370 | return NULL; | |
371 | } | |
372 | ||
373 | U_CAPI UChar * U_EXPORT2 | |
374 | u_strrstr(const UChar *s, const UChar *substring) { | |
375 | return u_strFindLast(s, -1, substring, -1); | |
376 | } | |
377 | ||
378 | U_CAPI UChar * U_EXPORT2 | |
379 | u_strrchr(const UChar *s, UChar c) { | |
380 | if(U16_IS_SURROGATE(c)) { | |
381 | /* make sure to not find half of a surrogate pair */ | |
382 | return u_strFindLast(s, -1, &c, 1); | |
383 | } else { | |
384 | const UChar *result=NULL; | |
385 | UChar cs; | |
386 | ||
387 | /* trivial search for a BMP code point */ | |
388 | for(;;) { | |
389 | if((cs=*s)==c) { | |
390 | result=s; | |
391 | } | |
392 | if(cs==0) { | |
393 | return (UChar *)result; | |
394 | } | |
395 | ++s; | |
396 | } | |
397 | } | |
398 | } | |
399 | ||
400 | U_CAPI UChar * U_EXPORT2 | |
401 | u_strrchr32(const UChar *s, UChar32 c) { | |
402 | if((uint32_t)c<=U_BMP_MAX) { | |
403 | /* find BMP code point */ | |
404 | return u_strrchr(s, (UChar)c); | |
405 | } else if((uint32_t)c<=UCHAR_MAX_VALUE) { | |
406 | /* find supplementary code point as surrogate pair */ | |
407 | const UChar *result=NULL; | |
408 | UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); | |
409 | ||
410 | while((cs=*s++)!=0) { | |
411 | if(cs==lead && *s==trail) { | |
412 | result=s-1; | |
413 | } | |
414 | } | |
415 | return (UChar *)result; | |
416 | } else { | |
417 | /* not a Unicode code point, not findable */ | |
418 | return NULL; | |
419 | } | |
420 | } | |
421 | ||
422 | U_CAPI UChar * U_EXPORT2 | |
423 | u_memrchr(const UChar *s, UChar c, int32_t count) { | |
424 | if(count<=0) { | |
425 | return NULL; /* no string */ | |
426 | } else if(U16_IS_SURROGATE(c)) { | |
427 | /* make sure to not find half of a surrogate pair */ | |
428 | return u_strFindLast(s, count, &c, 1); | |
429 | } else { | |
430 | /* trivial search for a BMP code point */ | |
431 | const UChar *limit=s+count; | |
432 | do { | |
433 | if(*(--limit)==c) { | |
434 | return (UChar *)limit; | |
435 | } | |
436 | } while(s!=limit); | |
437 | return NULL; | |
438 | } | |
439 | } | |
440 | ||
441 | U_CAPI UChar * U_EXPORT2 | |
442 | u_memrchr32(const UChar *s, UChar32 c, int32_t count) { | |
443 | if((uint32_t)c<=U_BMP_MAX) { | |
444 | /* find BMP code point */ | |
445 | return u_memrchr(s, (UChar)c, count); | |
446 | } else if(count<2) { | |
447 | /* too short for a surrogate pair */ | |
448 | return NULL; | |
449 | } else if((uint32_t)c<=UCHAR_MAX_VALUE) { | |
450 | /* find supplementary code point as surrogate pair */ | |
451 | const UChar *limit=s+count-1; | |
452 | UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); | |
453 | ||
454 | do { | |
455 | if(*limit==trail && *(limit-1)==lead) { | |
456 | return (UChar *)(limit-1); | |
457 | } | |
458 | } while(s!=--limit); | |
459 | return NULL; | |
460 | } else { | |
461 | /* not a Unicode code point, not findable */ | |
462 | return NULL; | |
463 | } | |
464 | } | |
465 | ||
466 | /* Tokenization functions --------------------------------------------------- */ | |
467 | ||
468 | /* | |
469 | * Match each code point in a string against each code point in the matchSet. | |
470 | * Return the index of the first string code point that | |
471 | * is (polarity==TRUE) or is not (FALSE) contained in the matchSet. | |
472 | * Return -(string length)-1 if there is no such code point. | |
473 | */ | |
474 | static int32_t | |
475 | _matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) { | |
476 | int32_t matchLen, matchBMPLen, strItr, matchItr; | |
477 | UChar32 stringCh, matchCh; | |
478 | UChar c, c2; | |
479 | ||
480 | /* first part of matchSet contains only BMP code points */ | |
481 | matchBMPLen = 0; | |
482 | while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) { | |
483 | ++matchBMPLen; | |
484 | } | |
485 | ||
486 | /* second part of matchSet contains BMP and supplementary code points */ | |
487 | matchLen = matchBMPLen; | |
488 | while(matchSet[matchLen] != 0) { | |
489 | ++matchLen; | |
490 | } | |
491 | ||
492 | for(strItr = 0; (c = string[strItr]) != 0;) { | |
493 | ++strItr; | |
494 | if(U16_IS_SINGLE(c)) { | |
495 | if(polarity) { | |
496 | for(matchItr = 0; matchItr < matchLen; ++matchItr) { | |
497 | if(c == matchSet[matchItr]) { | |
498 | return strItr - 1; /* one matches */ | |
499 | } | |
500 | } | |
501 | } else { | |
502 | for(matchItr = 0; matchItr < matchLen; ++matchItr) { | |
503 | if(c == matchSet[matchItr]) { | |
504 | goto endloop; | |
505 | } | |
506 | } | |
507 | return strItr - 1; /* none matches */ | |
508 | } | |
509 | } else { | |
510 | /* | |
511 | * No need to check for string length before U16_IS_TRAIL | |
512 | * because c2 could at worst be the terminating NUL. | |
513 | */ | |
514 | if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) { | |
515 | ++strItr; | |
516 | stringCh = U16_GET_SUPPLEMENTARY(c, c2); | |
517 | } else { | |
518 | stringCh = c; /* unpaired trail surrogate */ | |
519 | } | |
520 | ||
521 | if(polarity) { | |
522 | for(matchItr = matchBMPLen; matchItr < matchLen;) { | |
523 | U16_NEXT(matchSet, matchItr, matchLen, matchCh); | |
524 | if(stringCh == matchCh) { | |
525 | return strItr - U16_LENGTH(stringCh); /* one matches */ | |
526 | } | |
527 | } | |
528 | } else { | |
529 | for(matchItr = matchBMPLen; matchItr < matchLen;) { | |
530 | U16_NEXT(matchSet, matchItr, matchLen, matchCh); | |
531 | if(stringCh == matchCh) { | |
532 | goto endloop; | |
533 | } | |
534 | } | |
535 | return strItr - U16_LENGTH(stringCh); /* none matches */ | |
536 | } | |
537 | } | |
538 | endloop: | |
539 | /* wish C had continue with labels like Java... */; | |
540 | } | |
541 | ||
542 | /* Didn't find it. */ | |
543 | return -strItr-1; | |
544 | } | |
545 | ||
546 | /* Search for a codepoint in a string that matches one of the matchSet codepoints. */ | |
547 | U_CAPI UChar * U_EXPORT2 | |
548 | u_strpbrk(const UChar *string, const UChar *matchSet) | |
549 | { | |
550 | int32_t index = _matchFromSet(string, matchSet, TRUE); | |
551 | if(index >= 0) { | |
552 | return (UChar *)string + index; | |
553 | } else { | |
554 | return NULL; | |
555 | } | |
556 | } | |
557 | ||
558 | /* Search for a codepoint in a string that matches one of the matchSet codepoints. */ | |
559 | U_CAPI int32_t U_EXPORT2 | |
560 | u_strcspn(const UChar *string, const UChar *matchSet) | |
561 | { | |
562 | int32_t index = _matchFromSet(string, matchSet, TRUE); | |
563 | if(index >= 0) { | |
564 | return index; | |
565 | } else { | |
566 | return -index - 1; /* == u_strlen(string) */ | |
567 | } | |
568 | } | |
569 | ||
570 | /* Search for a codepoint in a string that does not match one of the matchSet codepoints. */ | |
571 | U_CAPI int32_t U_EXPORT2 | |
572 | u_strspn(const UChar *string, const UChar *matchSet) | |
573 | { | |
574 | int32_t index = _matchFromSet(string, matchSet, FALSE); | |
575 | if(index >= 0) { | |
576 | return index; | |
577 | } else { | |
578 | return -index - 1; /* == u_strlen(string) */ | |
579 | } | |
580 | } | |
581 | ||
582 | /* ----- Text manipulation functions --- */ | |
583 | ||
584 | U_CAPI UChar* U_EXPORT2 | |
585 | u_strtok_r(UChar *src, | |
586 | const UChar *delim, | |
587 | UChar **saveState) | |
588 | { | |
589 | UChar *tokSource; | |
590 | UChar *nextToken; | |
591 | uint32_t nonDelimIdx; | |
592 | ||
593 | /* If saveState is NULL, the user messed up. */ | |
594 | if (src != NULL) { | |
595 | tokSource = src; | |
596 | *saveState = src; /* Set to "src" in case there are no delimiters */ | |
597 | } | |
598 | else if (*saveState) { | |
599 | tokSource = *saveState; | |
600 | } | |
601 | else { | |
602 | /* src == NULL && *saveState == NULL */ | |
603 | /* This shouldn't happen. We already finished tokenizing. */ | |
604 | return NULL; | |
605 | } | |
606 | ||
607 | /* Skip initial delimiters */ | |
608 | nonDelimIdx = u_strspn(tokSource, delim); | |
609 | tokSource = &tokSource[nonDelimIdx]; | |
610 | ||
611 | if (*tokSource) { | |
612 | nextToken = u_strpbrk(tokSource, delim); | |
613 | if (nextToken != NULL) { | |
614 | /* Create a token */ | |
615 | *(nextToken++) = 0; | |
616 | *saveState = nextToken; | |
617 | return tokSource; | |
618 | } | |
619 | else if (*saveState) { | |
620 | /* Return the last token */ | |
621 | *saveState = NULL; | |
622 | return tokSource; | |
623 | } | |
624 | } | |
625 | else { | |
626 | /* No tokens were found. Only delimiters were left. */ | |
627 | *saveState = NULL; | |
628 | } | |
629 | return NULL; | |
630 | } | |
631 | ||
632 | /* Miscellaneous functions -------------------------------------------------- */ | |
633 | ||
634 | U_CAPI UChar* U_EXPORT2 | |
635 | u_strcat(UChar *dst, | |
636 | const UChar *src) | |
637 | { | |
638 | UChar *anchor = dst; /* save a pointer to start of dst */ | |
639 | ||
640 | while(*dst != 0) { /* To end of first string */ | |
641 | ++dst; | |
642 | } | |
643 | while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ | |
644 | } | |
645 | ||
646 | return anchor; | |
647 | } | |
648 | ||
649 | U_CAPI UChar* U_EXPORT2 | |
650 | u_strncat(UChar *dst, | |
651 | const UChar *src, | |
652 | int32_t n ) | |
653 | { | |
654 | if(n > 0) { | |
655 | UChar *anchor = dst; /* save a pointer to start of dst */ | |
656 | ||
657 | while(*dst != 0) { /* To end of first string */ | |
658 | ++dst; | |
659 | } | |
660 | while((*dst = *src) != 0) { /* copy string 2 over */ | |
661 | ++dst; | |
662 | if(--n == 0) { | |
663 | *dst = 0; | |
664 | break; | |
665 | } | |
666 | ++src; | |
667 | } | |
668 | ||
669 | return anchor; | |
670 | } else { | |
671 | return dst; | |
672 | } | |
673 | } | |
674 | ||
675 | /* ----- Text property functions --- */ | |
676 | ||
677 | U_CAPI int32_t U_EXPORT2 | |
678 | u_strcmp(const UChar *s1, | |
679 | const UChar *s2) | |
680 | { | |
681 | UChar c1, c2; | |
682 | ||
683 | for(;;) { | |
684 | c1=*s1++; | |
685 | c2=*s2++; | |
686 | if (c1 != c2 || c1 == 0) { | |
687 | break; | |
688 | } | |
689 | } | |
690 | return (int32_t)c1 - (int32_t)c2; | |
691 | } | |
692 | ||
693 | U_CAPI int32_t U_EXPORT2 | |
694 | uprv_strCompare(const UChar *s1, int32_t length1, | |
695 | const UChar *s2, int32_t length2, | |
696 | UBool strncmpStyle, UBool codePointOrder) { | |
697 | const UChar *start1, *start2, *limit1, *limit2; | |
698 | UChar c1, c2; | |
699 | ||
700 | /* setup for fix-up */ | |
701 | start1=s1; | |
702 | start2=s2; | |
703 | ||
704 | /* compare identical prefixes - they do not need to be fixed up */ | |
705 | if(length1<0 && length2<0) { | |
706 | /* strcmp style, both NUL-terminated */ | |
707 | if(s1==s2) { | |
708 | return 0; | |
709 | } | |
710 | ||
711 | for(;;) { | |
712 | c1=*s1; | |
713 | c2=*s2; | |
714 | if(c1!=c2) { | |
715 | break; | |
716 | } | |
717 | if(c1==0) { | |
718 | return 0; | |
719 | } | |
720 | ++s1; | |
721 | ++s2; | |
722 | } | |
723 | ||
724 | /* setup for fix-up */ | |
725 | limit1=limit2=NULL; | |
726 | } else if(strncmpStyle) { | |
727 | /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */ | |
728 | if(s1==s2) { | |
729 | return 0; | |
730 | } | |
731 | ||
732 | limit1=start1+length1; | |
733 | ||
734 | for(;;) { | |
735 | /* both lengths are same, check only one limit */ | |
736 | if(s1==limit1) { | |
737 | return 0; | |
738 | } | |
739 | ||
740 | c1=*s1; | |
741 | c2=*s2; | |
742 | if(c1!=c2) { | |
743 | break; | |
744 | } | |
745 | if(c1==0) { | |
746 | return 0; | |
747 | } | |
748 | ++s1; | |
749 | ++s2; | |
750 | } | |
751 | ||
752 | /* setup for fix-up */ | |
753 | limit2=start2+length1; /* use length1 here, too, to enforce assumption */ | |
754 | } else { | |
755 | /* memcmp/UnicodeString style, both length-specified */ | |
756 | int32_t lengthResult; | |
757 | ||
758 | if(length1<0) { | |
759 | length1=u_strlen(s1); | |
760 | } | |
761 | if(length2<0) { | |
762 | length2=u_strlen(s2); | |
763 | } | |
764 | ||
765 | /* limit1=start1+min(lenght1, length2) */ | |
766 | if(length1<length2) { | |
767 | lengthResult=-1; | |
768 | limit1=start1+length1; | |
769 | } else if(length1==length2) { | |
770 | lengthResult=0; | |
771 | limit1=start1+length1; | |
772 | } else /* length1>length2 */ { | |
773 | lengthResult=1; | |
774 | limit1=start1+length2; | |
775 | } | |
776 | ||
777 | if(s1==s2) { | |
778 | return lengthResult; | |
779 | } | |
780 | ||
781 | for(;;) { | |
782 | /* check pseudo-limit */ | |
783 | if(s1==limit1) { | |
784 | return lengthResult; | |
785 | } | |
786 | ||
787 | c1=*s1; | |
788 | c2=*s2; | |
789 | if(c1!=c2) { | |
790 | break; | |
791 | } | |
792 | ++s1; | |
793 | ++s2; | |
794 | } | |
795 | ||
796 | /* setup for fix-up */ | |
797 | limit1=start1+length1; | |
798 | limit2=start2+length2; | |
799 | } | |
800 | ||
801 | /* if both values are in or above the surrogate range, fix them up */ | |
802 | if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { | |
803 | /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ | |
804 | if( | |
805 | (c1<=0xdbff && (s1+1)!=limit1 && UTF_IS_TRAIL(*(s1+1))) || | |
806 | (UTF_IS_TRAIL(c1) && start1!=s1 && UTF_IS_LEAD(*(s1-1))) | |
807 | ) { | |
808 | /* part of a surrogate pair, leave >=d800 */ | |
809 | } else { | |
810 | /* BMP code point - may be surrogate code point - make <d800 */ | |
811 | c1-=0x2800; | |
812 | } | |
813 | ||
814 | if( | |
815 | (c2<=0xdbff && (s2+1)!=limit2 && UTF_IS_TRAIL(*(s2+1))) || | |
816 | (UTF_IS_TRAIL(c2) && start2!=s2 && UTF_IS_LEAD(*(s2-1))) | |
817 | ) { | |
818 | /* part of a surrogate pair, leave >=d800 */ | |
819 | } else { | |
820 | /* BMP code point - may be surrogate code point - make <d800 */ | |
821 | c2-=0x2800; | |
822 | } | |
823 | } | |
824 | ||
825 | /* now c1 and c2 are in the requested (code unit or code point) order */ | |
826 | return (int32_t)c1-(int32_t)c2; | |
827 | } | |
828 | ||
829 | /* | |
830 | * Compare two strings as presented by UCharIterators. | |
831 | * Use code unit or code point order. | |
832 | * When the function returns, it is undefined where the iterators | |
833 | * have stopped. | |
834 | */ | |
835 | U_CAPI int32_t U_EXPORT2 | |
836 | u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) { | |
837 | UChar32 c1, c2; | |
838 | ||
839 | /* argument checking */ | |
840 | if(iter1==NULL || iter2==NULL) { | |
841 | return 0; /* bad arguments */ | |
842 | } | |
843 | if(iter1==iter2) { | |
844 | return 0; /* identical iterators */ | |
845 | } | |
846 | ||
847 | /* reset iterators to start? */ | |
848 | iter1->move(iter1, 0, UITER_START); | |
849 | iter2->move(iter2, 0, UITER_START); | |
850 | ||
851 | /* compare identical prefixes - they do not need to be fixed up */ | |
852 | for(;;) { | |
853 | c1=iter1->next(iter1); | |
854 | c2=iter2->next(iter2); | |
855 | if(c1!=c2) { | |
856 | break; | |
857 | } | |
858 | if(c1==-1) { | |
859 | return 0; | |
860 | } | |
861 | } | |
862 | ||
863 | /* if both values are in or above the surrogate range, fix them up */ | |
864 | if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { | |
865 | /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ | |
866 | if( | |
867 | (c1<=0xdbff && UTF_IS_TRAIL(iter1->current(iter1))) || | |
868 | (UTF_IS_TRAIL(c1) && (iter1->previous(iter1), UTF_IS_LEAD(iter1->previous(iter1)))) | |
869 | ) { | |
870 | /* part of a surrogate pair, leave >=d800 */ | |
871 | } else { | |
872 | /* BMP code point - may be surrogate code point - make <d800 */ | |
873 | c1-=0x2800; | |
874 | } | |
875 | ||
876 | if( | |
877 | (c2<=0xdbff && UTF_IS_TRAIL(iter2->current(iter2))) || | |
878 | (UTF_IS_TRAIL(c2) && (iter2->previous(iter2), UTF_IS_LEAD(iter2->previous(iter2)))) | |
879 | ) { | |
880 | /* part of a surrogate pair, leave >=d800 */ | |
881 | } else { | |
882 | /* BMP code point - may be surrogate code point - make <d800 */ | |
883 | c2-=0x2800; | |
884 | } | |
885 | } | |
886 | ||
887 | /* now c1 and c2 are in the requested (code unit or code point) order */ | |
888 | return (int32_t)c1-(int32_t)c2; | |
889 | } | |
890 | ||
891 | #if 0 | |
892 | /* | |
893 | * u_strCompareIter() does not leave the iterators _on_ the different units. | |
894 | * This is possible but would cost a few extra indirect function calls to back | |
895 | * up if the last unit (c1 or c2 respectively) was >=0. | |
896 | * | |
897 | * Consistently leaving them _behind_ the different units is not an option | |
898 | * because the current "unit" is the end of the string if that is reached, | |
899 | * and in such a case the iterator does not move. | |
900 | * For example, when comparing "ab" with "abc", both iterators rest _on_ the end | |
901 | * of their strings. Calling previous() on each does not move them to where | |
902 | * the comparison fails. | |
903 | * | |
904 | * So the simplest semantics is to not define where the iterators end up. | |
905 | * | |
906 | * The following fragment is part of what would need to be done for backing up. | |
907 | */ | |
908 | void fragment { | |
909 | /* iff a surrogate is part of a surrogate pair, leave >=d800 */ | |
910 | if(c1<=0xdbff) { | |
911 | if(!UTF_IS_TRAIL(iter1->current(iter1))) { | |
912 | /* lead surrogate code point - make <d800 */ | |
913 | c1-=0x2800; | |
914 | } | |
915 | } else if(c1<=0xdfff) { | |
916 | int32_t index=iter1->getIndex(iter1, UITER_CURRENT); | |
917 | iter1->previous(iter1); /* ==c1 */ | |
918 | if(!UTF_IS_LEAD(iter1->previous(iter1))) { | |
919 | /* trail surrogate code point - make <d800 */ | |
920 | c1-=0x2800; | |
921 | } | |
922 | /* go back to behind where the difference is */ | |
923 | iter1->move(iter1, index, UITER_ZERO); | |
924 | } else /* 0xe000<=c1<=0xffff */ { | |
925 | /* BMP code point - make <d800 */ | |
926 | c1-=0x2800; | |
927 | } | |
928 | } | |
929 | #endif | |
930 | ||
931 | U_CAPI int32_t U_EXPORT2 | |
932 | u_strCompare(const UChar *s1, int32_t length1, | |
933 | const UChar *s2, int32_t length2, | |
934 | UBool codePointOrder) { | |
935 | /* argument checking */ | |
936 | if(s1==NULL || length1<-1 || s2==NULL || length2<-1) { | |
937 | return 0; | |
938 | } | |
939 | return uprv_strCompare(s1, length1, s2, length2, FALSE, codePointOrder); | |
940 | } | |
941 | ||
942 | /* String compare in code point order - u_strcmp() compares in code unit order. */ | |
943 | U_CAPI int32_t U_EXPORT2 | |
944 | u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) { | |
945 | return uprv_strCompare(s1, -1, s2, -1, FALSE, TRUE); | |
946 | } | |
947 | ||
948 | U_CAPI int32_t U_EXPORT2 | |
949 | u_strncmp(const UChar *s1, | |
950 | const UChar *s2, | |
951 | int32_t n) | |
952 | { | |
953 | if(n > 0) { | |
954 | int32_t rc; | |
955 | for(;;) { | |
956 | rc = (int32_t)*s1 - (int32_t)*s2; | |
957 | if(rc != 0 || *s1 == 0 || --n == 0) { | |
958 | return rc; | |
959 | } | |
960 | ++s1; | |
961 | ++s2; | |
962 | } | |
963 | } else { | |
964 | return 0; | |
965 | } | |
966 | } | |
967 | ||
968 | U_CAPI int32_t U_EXPORT2 | |
969 | u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) { | |
970 | return uprv_strCompare(s1, n, s2, n, TRUE, TRUE); | |
971 | } | |
972 | ||
973 | U_CAPI UChar* U_EXPORT2 | |
974 | u_strcpy(UChar *dst, | |
975 | const UChar *src) | |
976 | { | |
977 | UChar *anchor = dst; /* save a pointer to start of dst */ | |
978 | ||
979 | while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ | |
980 | } | |
981 | ||
982 | return anchor; | |
983 | } | |
984 | ||
985 | U_CAPI UChar* U_EXPORT2 | |
986 | u_strncpy(UChar *dst, | |
987 | const UChar *src, | |
988 | int32_t n) | |
989 | { | |
990 | UChar *anchor = dst; /* save a pointer to start of dst */ | |
991 | ||
992 | /* copy string 2 over */ | |
993 | while(n > 0 && (*(dst++) = *(src++)) != 0) { | |
994 | --n; | |
995 | } | |
996 | ||
997 | return anchor; | |
998 | } | |
999 | ||
1000 | U_CAPI int32_t U_EXPORT2 | |
1001 | u_strlen(const UChar *s) | |
1002 | { | |
1003 | #if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR | |
1004 | return uprv_wcslen(s); | |
1005 | #else | |
1006 | const UChar *t = s; | |
1007 | while(*t != 0) { | |
1008 | ++t; | |
1009 | } | |
1010 | return t - s; | |
1011 | #endif | |
1012 | } | |
1013 | ||
1014 | U_CAPI int32_t U_EXPORT2 | |
1015 | u_countChar32(const UChar *s, int32_t length) { | |
1016 | int32_t count; | |
1017 | ||
1018 | if(s==NULL || length<-1) { | |
1019 | return 0; | |
1020 | } | |
1021 | ||
1022 | count=0; | |
1023 | if(length>=0) { | |
1024 | while(length>0) { | |
1025 | ++count; | |
1026 | if(UTF_IS_LEAD(*s) && length>=2 && UTF_IS_TRAIL(*(s+1))) { | |
1027 | s+=2; | |
1028 | length-=2; | |
1029 | } else { | |
1030 | ++s; | |
1031 | --length; | |
1032 | } | |
1033 | } | |
1034 | } else /* length==-1 */ { | |
1035 | UChar c; | |
1036 | ||
1037 | for(;;) { | |
1038 | if((c=*s++)==0) { | |
1039 | break; | |
1040 | } | |
1041 | ++count; | |
1042 | ||
1043 | /* | |
1044 | * sufficient to look ahead one because of UTF-16; | |
1045 | * safe to look ahead one because at worst that would be the terminating NUL | |
1046 | */ | |
1047 | if(UTF_IS_LEAD(c) && UTF_IS_TRAIL(*s)) { | |
1048 | ++s; | |
1049 | } | |
1050 | } | |
1051 | } | |
1052 | return count; | |
1053 | } | |
1054 | ||
1055 | U_CAPI UBool U_EXPORT2 | |
1056 | u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) { | |
1057 | ||
1058 | if(number<0) { | |
1059 | return TRUE; | |
1060 | } | |
1061 | if(s==NULL || length<-1) { | |
1062 | return FALSE; | |
1063 | } | |
1064 | ||
1065 | if(length==-1) { | |
1066 | /* s is NUL-terminated */ | |
1067 | UChar c; | |
1068 | ||
1069 | /* count code points until they exceed */ | |
1070 | for(;;) { | |
1071 | if((c=*s++)==0) { | |
1072 | return FALSE; | |
1073 | } | |
1074 | if(number==0) { | |
1075 | return TRUE; | |
1076 | } | |
1077 | if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) { | |
1078 | ++s; | |
1079 | } | |
1080 | --number; | |
1081 | } | |
1082 | } else { | |
1083 | /* length>=0 known */ | |
1084 | const UChar *limit; | |
1085 | int32_t maxSupplementary; | |
1086 | ||
1087 | /* s contains at least (length+1)/2 code points: <=2 UChars per cp */ | |
1088 | if(((length+1)/2)>number) { | |
1089 | return TRUE; | |
1090 | } | |
1091 | ||
1092 | /* check if s does not even contain enough UChars */ | |
1093 | maxSupplementary=length-number; | |
1094 | if(maxSupplementary<=0) { | |
1095 | return FALSE; | |
1096 | } | |
1097 | /* there are maxSupplementary=length-number more UChars than asked-for code points */ | |
1098 | ||
1099 | /* | |
1100 | * count code points until they exceed and also check that there are | |
1101 | * no more than maxSupplementary supplementary code points (UChar pairs) | |
1102 | */ | |
1103 | limit=s+length; | |
1104 | for(;;) { | |
1105 | if(s==limit) { | |
1106 | return FALSE; | |
1107 | } | |
1108 | if(number==0) { | |
1109 | return TRUE; | |
1110 | } | |
1111 | if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) { | |
1112 | ++s; | |
1113 | if(--maxSupplementary<=0) { | |
1114 | /* too many pairs - too few code points */ | |
1115 | return FALSE; | |
1116 | } | |
1117 | } | |
1118 | --number; | |
1119 | } | |
1120 | } | |
1121 | } | |
1122 | ||
1123 | U_CAPI UChar * U_EXPORT2 | |
1124 | u_memcpy(UChar *dest, const UChar *src, int32_t count) { | |
1125 | return (UChar *)uprv_memcpy(dest, src, count*U_SIZEOF_UCHAR); | |
1126 | } | |
1127 | ||
1128 | U_CAPI UChar * U_EXPORT2 | |
1129 | u_memmove(UChar *dest, const UChar *src, int32_t count) { | |
1130 | return (UChar *)uprv_memmove(dest, src, count*U_SIZEOF_UCHAR); | |
1131 | } | |
1132 | ||
1133 | U_CAPI UChar * U_EXPORT2 | |
1134 | u_memset(UChar *dest, UChar c, int32_t count) { | |
1135 | if(count > 0) { | |
1136 | UChar *ptr = dest; | |
1137 | UChar *limit = dest + count; | |
1138 | ||
1139 | while (ptr < limit) { | |
1140 | *(ptr++) = c; | |
1141 | } | |
1142 | } | |
1143 | return dest; | |
1144 | } | |
1145 | ||
1146 | U_CAPI int32_t U_EXPORT2 | |
1147 | u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) { | |
1148 | if(count > 0) { | |
1149 | const UChar *limit = buf1 + count; | |
1150 | int32_t result; | |
1151 | ||
1152 | while (buf1 < limit) { | |
1153 | result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2; | |
1154 | if (result != 0) { | |
1155 | return result; | |
1156 | } | |
1157 | buf1++; | |
1158 | buf2++; | |
1159 | } | |
1160 | } | |
1161 | return 0; | |
1162 | } | |
1163 | ||
1164 | U_CAPI int32_t U_EXPORT2 | |
1165 | u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) { | |
1166 | return uprv_strCompare(s1, count, s2, count, FALSE, TRUE); | |
1167 | } | |
1168 | ||
1169 | /* conversions between char* and UChar* ------------------------------------- */ | |
1170 | ||
/*
 * Length of the NUL-terminated char string s1, but never more than n:
 * scanning stops after n chars or at the first NUL, whichever comes first.
 * A NULL s1 yields 0. A negative n does not act as a limit; the scan then
 * simply runs to the terminating NUL.
 */
static int32_t u_astrnlen(const char *s1, int32_t n)
{
    const char *p = s1;

    if (s1 == NULL) {
        return 0;
    }
    for (; n != 0 && *p != 0; --n) {
        ++p;
    }
    return (int32_t)(p - s1);
}
1187 | ||
1188 | U_CAPI UChar* U_EXPORT2 | |
1189 | u_uastrncpy(UChar *ucs1, | |
1190 | const char *s2, | |
1191 | int32_t n) | |
1192 | { | |
1193 | UChar *target = ucs1; | |
1194 | UErrorCode err = U_ZERO_ERROR; | |
1195 | UConverter *cnv = u_getDefaultConverter(&err); | |
1196 | if(U_SUCCESS(err) && cnv != NULL) { | |
1197 | ucnv_reset(cnv); | |
1198 | ucnv_toUnicode(cnv, | |
1199 | &target, | |
1200 | ucs1+n, | |
1201 | &s2, | |
1202 | s2+u_astrnlen(s2, n), | |
1203 | NULL, | |
1204 | TRUE, | |
1205 | &err); | |
1206 | ucnv_reset(cnv); /* be good citizens */ | |
1207 | u_releaseDefaultConverter(cnv); | |
1208 | if(U_FAILURE(err) && (err != U_BUFFER_OVERFLOW_ERROR) ) { | |
1209 | *ucs1 = 0; /* failure */ | |
1210 | } | |
1211 | if(target < (ucs1+n)) { /* U_BUFFER_OVERFLOW_ERROR isn't an err, just means no termination will happen. */ | |
1212 | *target = 0; /* terminate */ | |
1213 | } | |
1214 | } else { | |
1215 | *ucs1 = 0; | |
1216 | } | |
1217 | return ucs1; | |
1218 | } | |
1219 | ||
1220 | U_CAPI UChar* U_EXPORT2 | |
1221 | u_uastrcpy(UChar *ucs1, | |
1222 | const char *s2 ) | |
1223 | { | |
1224 | UErrorCode err = U_ZERO_ERROR; | |
1225 | UConverter *cnv = u_getDefaultConverter(&err); | |
1226 | if(U_SUCCESS(err) && cnv != NULL) { | |
1227 | ucnv_toUChars(cnv, | |
1228 | ucs1, | |
1229 | MAX_STRLEN, | |
1230 | s2, | |
1231 | uprv_strlen(s2), | |
1232 | &err); | |
1233 | u_releaseDefaultConverter(cnv); | |
1234 | if(U_FAILURE(err)) { | |
1235 | *ucs1 = 0; | |
1236 | } | |
1237 | } else { | |
1238 | *ucs1 = 0; | |
1239 | } | |
1240 | return ucs1; | |
1241 | } | |
1242 | ||
1243 | /* | |
1244 | returns the minimum of (the length of the null-terminated string) and n. | |
1245 | */ | |
1246 | static int32_t u_ustrnlen(const UChar *ucs1, int32_t n) | |
1247 | { | |
1248 | int32_t len = 0; | |
1249 | ||
1250 | if (ucs1) | |
1251 | { | |
1252 | while (n-- && *(ucs1++)) | |
1253 | { | |
1254 | len++; | |
1255 | } | |
1256 | } | |
1257 | return len; | |
1258 | } | |
1259 | ||
1260 | U_CAPI char* U_EXPORT2 | |
1261 | u_austrncpy(char *s1, | |
1262 | const UChar *ucs2, | |
1263 | int32_t n) | |
1264 | { | |
1265 | char *target = s1; | |
1266 | UErrorCode err = U_ZERO_ERROR; | |
1267 | UConverter *cnv = u_getDefaultConverter(&err); | |
1268 | if(U_SUCCESS(err) && cnv != NULL) { | |
1269 | ucnv_reset(cnv); | |
1270 | ucnv_fromUnicode(cnv, | |
1271 | &target, | |
1272 | s1+n, | |
1273 | &ucs2, | |
1274 | ucs2+u_ustrnlen(ucs2, n), | |
1275 | NULL, | |
1276 | TRUE, | |
1277 | &err); | |
1278 | ucnv_reset(cnv); /* be good citizens */ | |
1279 | u_releaseDefaultConverter(cnv); | |
1280 | if(U_FAILURE(err) && (err != U_BUFFER_OVERFLOW_ERROR) ) { | |
1281 | *s1 = 0; /* failure */ | |
1282 | } | |
1283 | if(target < (s1+n)) { /* U_BUFFER_OVERFLOW_ERROR isn't an err, just means no termination will happen. */ | |
1284 | *target = 0; /* terminate */ | |
1285 | } | |
1286 | } else { | |
1287 | *s1 = 0; | |
1288 | } | |
1289 | return s1; | |
1290 | } | |
1291 | ||
1292 | U_CAPI char* U_EXPORT2 | |
1293 | u_austrcpy(char *s1, | |
1294 | const UChar *ucs2 ) | |
1295 | { | |
1296 | UErrorCode err = U_ZERO_ERROR; | |
1297 | UConverter *cnv = u_getDefaultConverter(&err); | |
1298 | if(U_SUCCESS(err) && cnv != NULL) { | |
1299 | int32_t len = ucnv_fromUChars(cnv, | |
1300 | s1, | |
1301 | MAX_STRLEN, | |
1302 | ucs2, | |
1303 | -1, | |
1304 | &err); | |
1305 | u_releaseDefaultConverter(cnv); | |
1306 | s1[len] = 0; | |
1307 | } else { | |
1308 | *s1 = 0; | |
1309 | } | |
1310 | return s1; | |
1311 | } | |
1312 | ||
1313 | /* mutexed access to a shared default converter ----------------------------- */ | |
1314 | ||
1315 | U_CAPI UConverter* U_EXPORT2 | |
1316 | u_getDefaultConverter(UErrorCode *status) | |
1317 | { | |
1318 | UConverter *converter = NULL; | |
1319 | ||
1320 | if (gDefaultConverter != NULL) { | |
1321 | umtx_lock(NULL); | |
1322 | ||
1323 | /* need to check to make sure it wasn't taken out from under us */ | |
1324 | if (gDefaultConverter != NULL) { | |
1325 | converter = gDefaultConverter; | |
1326 | gDefaultConverter = NULL; | |
1327 | } | |
1328 | umtx_unlock(NULL); | |
1329 | } | |
1330 | ||
1331 | /* if the cache was empty, create a converter */ | |
1332 | if(converter == NULL) { | |
1333 | converter = ucnv_open(NULL, status); | |
1334 | if(U_FAILURE(*status)) { | |
1335 | return NULL; | |
1336 | } | |
1337 | } | |
1338 | ||
1339 | return converter; | |
1340 | } | |
1341 | ||
1342 | U_CAPI void U_EXPORT2 | |
1343 | u_releaseDefaultConverter(UConverter *converter) | |
1344 | { | |
1345 | if(gDefaultConverter == NULL) { | |
1346 | if (converter != NULL) { | |
1347 | ucnv_reset(converter); | |
1348 | } | |
1349 | umtx_lock(NULL); | |
1350 | ||
1351 | if(gDefaultConverter == NULL) { | |
1352 | gDefaultConverter = converter; | |
1353 | converter = NULL; | |
1354 | } | |
1355 | umtx_unlock(NULL); | |
1356 | } | |
1357 | ||
1358 | if(converter != NULL) { | |
1359 | ucnv_close(converter); | |
1360 | } | |
1361 | } | |
1362 | ||
1363 | /* u_unescape & support fns ------------------------------------------------- */ | |
1364 | ||
1365 | /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ | |
1366 | static const UChar UNESCAPE_MAP[] = { | |
1367 | /*" 0x22, 0x22 */ | |
1368 | /*' 0x27, 0x27 */ | |
1369 | /*? 0x3F, 0x3F */ | |
1370 | /*\ 0x5C, 0x5C */ | |
1371 | /*a*/ 0x61, 0x07, | |
1372 | /*b*/ 0x62, 0x08, | |
1373 | /*e*/ 0x65, 0x1b, | |
1374 | /*f*/ 0x66, 0x0c, | |
1375 | /*n*/ 0x6E, 0x0a, | |
1376 | /*r*/ 0x72, 0x0d, | |
1377 | /*t*/ 0x74, 0x09, | |
1378 | /*v*/ 0x76, 0x0b | |
1379 | }; | |
1380 | enum { UNESCAPE_MAP_LENGTH = sizeof(UNESCAPE_MAP) / sizeof(UNESCAPE_MAP[0]) }; | |
1381 | ||
1382 | /* Convert one octal digit to a numeric value 0..7, or -1 on failure */ | |
1383 | static int8_t _digit8(UChar c) { | |
1384 | if (c >= 0x0030 && c <= 0x0037) { | |
1385 | return (int8_t)(c - 0x0030); | |
1386 | } | |
1387 | return -1; | |
1388 | } | |
1389 | ||
1390 | /* Convert one hex digit to a numeric value 0..F, or -1 on failure */ | |
1391 | static int8_t _digit16(UChar c) { | |
1392 | if (c >= 0x0030 && c <= 0x0039) { | |
1393 | return (int8_t)(c - 0x0030); | |
1394 | } | |
1395 | if (c >= 0x0041 && c <= 0x0046) { | |
1396 | return (int8_t)(c - (0x0041 - 10)); | |
1397 | } | |
1398 | if (c >= 0x0061 && c <= 0x0066) { | |
1399 | return (int8_t)(c - (0x0061 - 10)); | |
1400 | } | |
1401 | return -1; | |
1402 | } | |
1403 | ||
1404 | /* Parse a single escape sequence. Although this method deals in | |
1405 | * UChars, it does not use C++ or UnicodeString. This allows it to | |
1406 | * be used from C contexts. */ | |
1407 | U_CAPI UChar32 U_EXPORT2 | |
1408 | u_unescapeAt(UNESCAPE_CHAR_AT charAt, | |
1409 | int32_t *offset, | |
1410 | int32_t length, | |
1411 | void *context) { | |
1412 | ||
1413 | int32_t start = *offset; | |
1414 | UChar c; | |
1415 | UChar32 result = 0; | |
1416 | int8_t n = 0; | |
1417 | int8_t minDig = 0; | |
1418 | int8_t maxDig = 0; | |
1419 | int8_t bitsPerDigit = 4; | |
1420 | int8_t dig; | |
1421 | int32_t i; | |
1422 | UBool braces = FALSE; | |
1423 | ||
1424 | /* Check that offset is in range */ | |
1425 | if (*offset < 0 || *offset >= length) { | |
1426 | goto err; | |
1427 | } | |
1428 | ||
1429 | /* Fetch first UChar after '\\' */ | |
1430 | c = charAt((*offset)++, context); | |
1431 | ||
1432 | /* Convert hexadecimal and octal escapes */ | |
1433 | switch (c) { | |
1434 | case 0x0075 /*'u'*/: | |
1435 | minDig = maxDig = 4; | |
1436 | break; | |
1437 | case 0x0055 /*'U'*/: | |
1438 | minDig = maxDig = 8; | |
1439 | break; | |
1440 | case 0x0078 /*'x'*/: | |
1441 | minDig = 1; | |
1442 | if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) { | |
1443 | ++(*offset); | |
1444 | braces = TRUE; | |
1445 | maxDig = 8; | |
1446 | } else { | |
1447 | maxDig = 2; | |
1448 | } | |
1449 | break; | |
1450 | default: | |
1451 | dig = _digit8(c); | |
1452 | if (dig >= 0) { | |
1453 | minDig = 1; | |
1454 | maxDig = 3; | |
1455 | n = 1; /* Already have first octal digit */ | |
1456 | bitsPerDigit = 3; | |
1457 | result = dig; | |
1458 | } | |
1459 | break; | |
1460 | } | |
1461 | if (minDig != 0) { | |
1462 | while (*offset < length && n < maxDig) { | |
1463 | c = charAt(*offset, context); | |
1464 | dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c)); | |
1465 | if (dig < 0) { | |
1466 | break; | |
1467 | } | |
1468 | result = (result << bitsPerDigit) | dig; | |
1469 | ++(*offset); | |
1470 | ++n; | |
1471 | } | |
1472 | if (n < minDig) { | |
1473 | goto err; | |
1474 | } | |
1475 | if (braces) { | |
1476 | if (c != 0x7D /*}*/) { | |
1477 | goto err; | |
1478 | } | |
1479 | ++(*offset); | |
1480 | } | |
1481 | return result; | |
1482 | } | |
1483 | ||
1484 | /* Convert C-style escapes in table */ | |
1485 | for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) { | |
1486 | if (c == UNESCAPE_MAP[i]) { | |
1487 | return UNESCAPE_MAP[i+1]; | |
1488 | } else if (c < UNESCAPE_MAP[i]) { | |
1489 | break; | |
1490 | } | |
1491 | } | |
1492 | ||
1493 | /* Map \cX to control-X: X & 0x1F */ | |
1494 | if (c == 0x0063 /*'c'*/ && *offset < length) { | |
1495 | c = charAt((*offset)++, context); | |
1496 | if (UTF_IS_FIRST_SURROGATE(c) && *offset < length) { | |
1497 | UChar c2 = charAt(*offset, context); | |
1498 | if (UTF_IS_SECOND_SURROGATE(c2)) { | |
1499 | ++(*offset); | |
1500 | c = (UChar) UTF16_GET_PAIR_VALUE(c, c2); /* [sic] */ | |
1501 | } | |
1502 | } | |
1503 | return 0x1F & c; | |
1504 | } | |
1505 | ||
1506 | /* If no special forms are recognized, then consider | |
1507 | * the backslash to generically escape the next character. | |
1508 | * Deal with surrogate pairs. */ | |
1509 | if (UTF_IS_FIRST_SURROGATE(c) && *offset < length) { | |
1510 | UChar c2 = charAt(*offset, context); | |
1511 | if (UTF_IS_SECOND_SURROGATE(c2)) { | |
1512 | ++(*offset); | |
1513 | return UTF16_GET_PAIR_VALUE(c, c2); | |
1514 | } | |
1515 | } | |
1516 | return c; | |
1517 | ||
1518 | err: | |
1519 | /* Invalid escape sequence */ | |
1520 | *offset = start; /* Reset to initial value */ | |
1521 | return (UChar32)0xFFFFFFFF; | |
1522 | } | |
1523 | ||
1524 | /* u_unescapeAt() callback to return a UChar from a char* */ | |
1525 | static UChar U_CALLCONV | |
1526 | _charPtr_charAt(int32_t offset, void *context) { | |
1527 | UChar c16; | |
1528 | /* It would be more efficient to access the invariant tables | |
1529 | * directly but there is no API for that. */ | |
1530 | u_charsToUChars(((char*) context) + offset, &c16, 1); | |
1531 | return c16; | |
1532 | } | |
1533 | ||
1534 | /* Append an escape-free segment of the text; used by u_unescape() */ | |
1535 | static void _appendUChars(UChar *dest, int32_t destCapacity, | |
1536 | const char *src, int32_t srcLen) { | |
1537 | if (destCapacity < 0) { | |
1538 | destCapacity = 0; | |
1539 | } | |
1540 | if (srcLen > destCapacity) { | |
1541 | srcLen = destCapacity; | |
1542 | } | |
1543 | u_charsToUChars(src, dest, srcLen); | |
1544 | } | |
1545 | ||
1546 | /* Do an invariant conversion of char* -> UChar*, with escape parsing */ | |
1547 | U_CAPI int32_t U_EXPORT2 | |
1548 | u_unescape(const char *src, UChar *dest, int32_t destCapacity) { | |
1549 | const char *segment = src; | |
1550 | int32_t i = 0; | |
1551 | char c; | |
1552 | ||
1553 | while ((c=*src) != 0) { | |
1554 | /* '\\' intentionally written as compiler-specific | |
1555 | * character constant to correspond to compiler-specific | |
1556 | * char* constants. */ | |
1557 | if (c == '\\') { | |
1558 | int32_t lenParsed = 0; | |
1559 | UChar32 c32; | |
1560 | if (src != segment) { | |
1561 | if (dest != NULL) { | |
1562 | _appendUChars(dest + i, destCapacity - i, | |
1563 | segment, src - segment); | |
1564 | } | |
1565 | i += src - segment; | |
1566 | } | |
1567 | ++src; /* advance past '\\' */ | |
1568 | c32 = u_unescapeAt(_charPtr_charAt, &lenParsed, uprv_strlen(src), (void*)src); | |
1569 | if (lenParsed == 0) { | |
1570 | goto err; | |
1571 | } | |
1572 | src += lenParsed; /* advance past escape seq. */ | |
1573 | if (dest != NULL && UTF_CHAR_LENGTH(c32) <= (destCapacity - i)) { | |
1574 | UTF_APPEND_CHAR_UNSAFE(dest, i, c32); | |
1575 | } else { | |
1576 | i += UTF_CHAR_LENGTH(c32); | |
1577 | } | |
1578 | segment = src; | |
1579 | } else { | |
1580 | ++src; | |
1581 | } | |
1582 | } | |
1583 | if (src != segment) { | |
1584 | if (dest != NULL) { | |
1585 | _appendUChars(dest + i, destCapacity - i, | |
1586 | segment, src - segment); | |
1587 | } | |
1588 | i += src - segment; | |
1589 | } | |
1590 | if (dest != NULL && i < destCapacity) { | |
1591 | dest[i] = 0; | |
1592 | } | |
1593 | return i; | |
1594 | ||
1595 | err: | |
1596 | if (dest != NULL && destCapacity > 0) { | |
1597 | *dest = 0; | |
1598 | } | |
1599 | return 0; | |
1600 | } | |
1601 | ||
1602 | /* C UGrowBuffer implementation --------------------------------------------- */ | |
1603 | ||
1604 | U_CAPI UBool /* U_CALLCONV U_EXPORT2 */ | |
1605 | u_growBufferFromStatic(void *context, | |
1606 | UChar **pBuffer, int32_t *pCapacity, int32_t reqCapacity, | |
1607 | int32_t length) { | |
1608 | UChar *newBuffer=(UChar *)uprv_malloc(reqCapacity*U_SIZEOF_UCHAR); | |
1609 | if(newBuffer!=NULL) { | |
1610 | if(length>0) { | |
1611 | uprv_memcpy(newBuffer, *pBuffer, length*U_SIZEOF_UCHAR); | |
1612 | } | |
1613 | *pCapacity=reqCapacity; | |
1614 | } else { | |
1615 | *pCapacity=0; | |
1616 | } | |
1617 | ||
1618 | /* release the old pBuffer if it was not statically allocated */ | |
1619 | if(*pBuffer!=(UChar *)context) { | |
1620 | uprv_free(*pBuffer); | |
1621 | } | |
1622 | ||
1623 | *pBuffer=newBuffer; | |
1624 | return (UBool)(newBuffer!=NULL); | |
1625 | } | |
1626 | ||
1627 | /* NUL-termination of strings ----------------------------------------------- */ | |
1628 | ||
/**
 * NUL-terminate a string no matter what its type.
 * Set warning and error codes accordingly.
 *
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 * The expansion is a single do { } while(0) statement, so an invocation must
 * be followed by a semicolon and is safe inside an unbraced if/else.
 * The arguments are expanded more than once: pass side-effect-free expressions.
 */
#define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) \
    do { \
        if((pErrorCode)!=NULL && U_SUCCESS(*(pErrorCode))) { \
            /* not a public function, so no complete argument checking */ \
            \
            if((length)<0) { \
                /* assume that the caller handles this */ \
            } else if((length)<(destCapacity)) { \
                /* NUL-terminate the string, the NUL fits */ \
                (dest)[(length)]=0; \
                /* unset the not-terminated warning but leave all others */ \
                if(*(pErrorCode)==U_STRING_NOT_TERMINATED_WARNING) { \
                    *(pErrorCode)=U_ZERO_ERROR; \
                } \
            } else if((length)==(destCapacity)) { \
                /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
                *(pErrorCode)=U_STRING_NOT_TERMINATED_WARNING; \
            } else /* length>destCapacity */ { \
                /* even the string itself did not fit - set an error code */ \
                *(pErrorCode)=U_BUFFER_OVERFLOW_ERROR; \
            } \
        } \
    } while(0)
1654 | ||
1655 | U_CAPI int32_t U_EXPORT2 | |
1656 | u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { | |
1657 | __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); | |
1658 | return length; | |
1659 | } | |
1660 | ||
1661 | U_CAPI int32_t U_EXPORT2 | |
1662 | u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { | |
1663 | __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); | |
1664 | return length; | |
1665 | } | |
1666 | ||
1667 | U_CAPI int32_t U_EXPORT2 | |
1668 | u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { | |
1669 | __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); | |
1670 | return length; | |
1671 | } | |
1672 | ||
1673 | U_CAPI int32_t U_EXPORT2 | |
1674 | u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { | |
1675 | __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); | |
1676 | return length; | |
1677 | } |