]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ****************************************************************************** | |
3 | * | |
46f4442e | 4 | * Copyright (C) 1998-2008, International Business Machines |
b75a7d8f A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ****************************************************************************** | |
8 | * | |
9 | * File ustring.h | |
10 | * | |
11 | * Modification History: | |
12 | * | |
13 | * Date Name Description | |
14 | * 12/07/98 bertrand Creation. | |
15 | ****************************************************************************** | |
16 | */ | |
17 | ||
18 | #include "unicode/utypes.h" | |
b75a7d8f | 19 | #include "unicode/putil.h" |
374ca955 | 20 | #include "unicode/ustring.h" |
b75a7d8f A |
21 | #include "cstring.h" |
22 | #include "cwchar.h" | |
23 | #include "cmemory.h" | |
b75a7d8f A |
24 | #include "ustr_imp.h" |
25 | ||
b75a7d8f A |
26 | /* ANSI string.h - style functions ------------------------------------------ */ |
27 | ||
b75a7d8f A |
28 | /* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */ |
29 | #define U_BMP_MAX 0xffff | |
30 | ||
31 | /* Forward binary string search functions ----------------------------------- */ | |
32 | ||
33 | /* | |
34 | * Test if a substring match inside a string is at code point boundaries. | |
35 | * All pointers refer to the same buffer. | |
36 | * The limit pointer may be NULL, all others must be real pointers. | |
37 | */ | |
38 | static U_INLINE UBool | |
39 | isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) { | |
40 | if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) { | |
41 | /* the leading edge of the match is in the middle of a surrogate pair */ | |
42 | return FALSE; | |
43 | } | |
44 | if(U16_IS_LEAD(*(matchLimit-1)) && match!=limit && U16_IS_TRAIL(*matchLimit)) { | |
45 | /* the trailing edge of the match is in the middle of a surrogate pair */ | |
46 | return FALSE; | |
47 | } | |
48 | return TRUE; | |
49 | } | |
50 | ||
51 | U_CAPI UChar * U_EXPORT2 | |
52 | u_strFindFirst(const UChar *s, int32_t length, | |
53 | const UChar *sub, int32_t subLength) { | |
54 | const UChar *start, *p, *q, *subLimit; | |
55 | UChar c, cs, cq; | |
56 | ||
57 | if(sub==NULL || subLength<-1) { | |
58 | return (UChar *)s; | |
59 | } | |
60 | if(s==NULL || length<-1) { | |
61 | return NULL; | |
62 | } | |
63 | ||
64 | start=s; | |
65 | ||
66 | if(length<0 && subLength<0) { | |
67 | /* both strings are NUL-terminated */ | |
68 | if((cs=*sub++)==0) { | |
69 | return (UChar *)s; | |
70 | } | |
71 | if(*sub==0 && !U16_IS_SURROGATE(cs)) { | |
72 | /* the substring consists of a single, non-surrogate BMP code point */ | |
73 | return u_strchr(s, cs); | |
74 | } | |
75 | ||
76 | while((c=*s++)!=0) { | |
77 | if(c==cs) { | |
78 | /* found first substring UChar, compare rest */ | |
79 | p=s; | |
80 | q=sub; | |
81 | for(;;) { | |
82 | if((cq=*q)==0) { | |
83 | if(isMatchAtCPBoundary(start, s-1, p, NULL)) { | |
84 | return (UChar *)(s-1); /* well-formed match */ | |
85 | } else { | |
86 | break; /* no match because surrogate pair is split */ | |
87 | } | |
88 | } | |
89 | if((c=*p)==0) { | |
90 | return NULL; /* no match, and none possible after s */ | |
91 | } | |
92 | if(c!=cq) { | |
93 | break; /* no match */ | |
94 | } | |
95 | ++p; | |
96 | ++q; | |
97 | } | |
98 | } | |
99 | } | |
100 | ||
101 | /* not found */ | |
102 | return NULL; | |
103 | } | |
104 | ||
105 | if(subLength<0) { | |
106 | subLength=u_strlen(sub); | |
107 | } | |
108 | if(subLength==0) { | |
109 | return (UChar *)s; | |
110 | } | |
111 | ||
112 | /* get sub[0] to search for it fast */ | |
113 | cs=*sub++; | |
114 | --subLength; | |
115 | subLimit=sub+subLength; | |
116 | ||
117 | if(subLength==0 && !U16_IS_SURROGATE(cs)) { | |
118 | /* the substring consists of a single, non-surrogate BMP code point */ | |
119 | return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length); | |
120 | } | |
121 | ||
122 | if(length<0) { | |
123 | /* s is NUL-terminated */ | |
124 | while((c=*s++)!=0) { | |
125 | if(c==cs) { | |
126 | /* found first substring UChar, compare rest */ | |
127 | p=s; | |
128 | q=sub; | |
129 | for(;;) { | |
130 | if(q==subLimit) { | |
131 | if(isMatchAtCPBoundary(start, s-1, p, NULL)) { | |
132 | return (UChar *)(s-1); /* well-formed match */ | |
133 | } else { | |
134 | break; /* no match because surrogate pair is split */ | |
135 | } | |
136 | } | |
137 | if((c=*p)==0) { | |
138 | return NULL; /* no match, and none possible after s */ | |
139 | } | |
140 | if(c!=*q) { | |
141 | break; /* no match */ | |
142 | } | |
143 | ++p; | |
144 | ++q; | |
145 | } | |
146 | } | |
147 | } | |
148 | } else { | |
149 | const UChar *limit, *preLimit; | |
150 | ||
151 | /* subLength was decremented above */ | |
152 | if(length<=subLength) { | |
153 | return NULL; /* s is shorter than sub */ | |
154 | } | |
155 | ||
156 | limit=s+length; | |
157 | ||
158 | /* the substring must start before preLimit */ | |
159 | preLimit=limit-subLength; | |
160 | ||
161 | while(s!=preLimit) { | |
162 | c=*s++; | |
163 | if(c==cs) { | |
164 | /* found first substring UChar, compare rest */ | |
165 | p=s; | |
166 | q=sub; | |
167 | for(;;) { | |
168 | if(q==subLimit) { | |
169 | if(isMatchAtCPBoundary(start, s-1, p, limit)) { | |
170 | return (UChar *)(s-1); /* well-formed match */ | |
171 | } else { | |
172 | break; /* no match because surrogate pair is split */ | |
173 | } | |
174 | } | |
175 | if(*p!=*q) { | |
176 | break; /* no match */ | |
177 | } | |
178 | ++p; | |
179 | ++q; | |
180 | } | |
181 | } | |
182 | } | |
183 | } | |
184 | ||
185 | /* not found */ | |
186 | return NULL; | |
187 | } | |
188 | ||
189 | U_CAPI UChar * U_EXPORT2 | |
190 | u_strstr(const UChar *s, const UChar *substring) { | |
191 | return u_strFindFirst(s, -1, substring, -1); | |
192 | } | |
193 | ||
194 | U_CAPI UChar * U_EXPORT2 | |
195 | u_strchr(const UChar *s, UChar c) { | |
196 | if(U16_IS_SURROGATE(c)) { | |
197 | /* make sure to not find half of a surrogate pair */ | |
198 | return u_strFindFirst(s, -1, &c, 1); | |
199 | } else { | |
200 | UChar cs; | |
201 | ||
202 | /* trivial search for a BMP code point */ | |
203 | for(;;) { | |
204 | if((cs=*s)==c) { | |
205 | return (UChar *)s; | |
206 | } | |
207 | if(cs==0) { | |
208 | return NULL; | |
209 | } | |
210 | ++s; | |
211 | } | |
212 | } | |
213 | } | |
214 | ||
215 | U_CAPI UChar * U_EXPORT2 | |
216 | u_strchr32(const UChar *s, UChar32 c) { | |
217 | if((uint32_t)c<=U_BMP_MAX) { | |
218 | /* find BMP code point */ | |
219 | return u_strchr(s, (UChar)c); | |
220 | } else if((uint32_t)c<=UCHAR_MAX_VALUE) { | |
221 | /* find supplementary code point as surrogate pair */ | |
222 | UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); | |
223 | ||
224 | while((cs=*s++)!=0) { | |
225 | if(cs==lead && *s==trail) { | |
226 | return (UChar *)(s-1); | |
227 | } | |
228 | } | |
229 | return NULL; | |
230 | } else { | |
231 | /* not a Unicode code point, not findable */ | |
232 | return NULL; | |
233 | } | |
234 | } | |
235 | ||
236 | U_CAPI UChar * U_EXPORT2 | |
237 | u_memchr(const UChar *s, UChar c, int32_t count) { | |
238 | if(count<=0) { | |
239 | return NULL; /* no string */ | |
240 | } else if(U16_IS_SURROGATE(c)) { | |
241 | /* make sure to not find half of a surrogate pair */ | |
242 | return u_strFindFirst(s, count, &c, 1); | |
243 | } else { | |
244 | /* trivial search for a BMP code point */ | |
245 | const UChar *limit=s+count; | |
246 | do { | |
247 | if(*s==c) { | |
248 | return (UChar *)s; | |
249 | } | |
250 | } while(++s!=limit); | |
251 | return NULL; | |
252 | } | |
253 | } | |
254 | ||
255 | U_CAPI UChar * U_EXPORT2 | |
256 | u_memchr32(const UChar *s, UChar32 c, int32_t count) { | |
257 | if((uint32_t)c<=U_BMP_MAX) { | |
258 | /* find BMP code point */ | |
259 | return u_memchr(s, (UChar)c, count); | |
260 | } else if(count<2) { | |
261 | /* too short for a surrogate pair */ | |
262 | return NULL; | |
263 | } else if((uint32_t)c<=UCHAR_MAX_VALUE) { | |
264 | /* find supplementary code point as surrogate pair */ | |
265 | const UChar *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */ | |
266 | UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); | |
267 | ||
268 | do { | |
269 | if(*s==lead && *(s+1)==trail) { | |
270 | return (UChar *)s; | |
271 | } | |
272 | } while(++s!=limit); | |
273 | return NULL; | |
274 | } else { | |
275 | /* not a Unicode code point, not findable */ | |
276 | return NULL; | |
277 | } | |
278 | } | |
279 | ||
280 | /* Backward binary string search functions ---------------------------------- */ | |
281 | ||
282 | U_CAPI UChar * U_EXPORT2 | |
283 | u_strFindLast(const UChar *s, int32_t length, | |
284 | const UChar *sub, int32_t subLength) { | |
285 | const UChar *start, *limit, *p, *q, *subLimit; | |
286 | UChar c, cs; | |
287 | ||
288 | if(sub==NULL || subLength<-1) { | |
289 | return (UChar *)s; | |
290 | } | |
291 | if(s==NULL || length<-1) { | |
292 | return NULL; | |
293 | } | |
294 | ||
295 | /* | |
296 | * This implementation is more lazy than the one for u_strFindFirst(): | |
297 | * There is no special search code for NUL-terminated strings. | |
298 | * It does not seem to be worth it for searching substrings to | |
299 | * search forward and find all matches like in u_strrchr() and similar. | |
300 | * Therefore, we simply get both string lengths and search backward. | |
301 | * | |
302 | * markus 2002oct23 | |
303 | */ | |
304 | ||
305 | if(subLength<0) { | |
306 | subLength=u_strlen(sub); | |
307 | } | |
308 | if(subLength==0) { | |
309 | return (UChar *)s; | |
310 | } | |
311 | ||
312 | /* get sub[subLength-1] to search for it fast */ | |
313 | subLimit=sub+subLength; | |
314 | cs=*(--subLimit); | |
315 | --subLength; | |
316 | ||
317 | if(subLength==0 && !U16_IS_SURROGATE(cs)) { | |
318 | /* the substring consists of a single, non-surrogate BMP code point */ | |
319 | return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length); | |
320 | } | |
321 | ||
322 | if(length<0) { | |
323 | length=u_strlen(s); | |
324 | } | |
325 | ||
326 | /* subLength was decremented above */ | |
327 | if(length<=subLength) { | |
328 | return NULL; /* s is shorter than sub */ | |
329 | } | |
330 | ||
331 | start=s; | |
332 | limit=s+length; | |
333 | ||
334 | /* the substring must start no later than s+subLength */ | |
335 | s+=subLength; | |
336 | ||
337 | while(s!=limit) { | |
338 | c=*(--limit); | |
339 | if(c==cs) { | |
340 | /* found last substring UChar, compare rest */ | |
341 | p=limit; | |
342 | q=subLimit; | |
343 | for(;;) { | |
344 | if(q==sub) { | |
345 | if(isMatchAtCPBoundary(start, p, limit+1, start+length)) { | |
346 | return (UChar *)p; /* well-formed match */ | |
347 | } else { | |
348 | break; /* no match because surrogate pair is split */ | |
349 | } | |
350 | } | |
351 | if(*(--p)!=*(--q)) { | |
352 | break; /* no match */ | |
353 | } | |
354 | } | |
355 | } | |
356 | } | |
357 | ||
358 | /* not found */ | |
359 | return NULL; | |
360 | } | |
361 | ||
362 | U_CAPI UChar * U_EXPORT2 | |
363 | u_strrstr(const UChar *s, const UChar *substring) { | |
364 | return u_strFindLast(s, -1, substring, -1); | |
365 | } | |
366 | ||
367 | U_CAPI UChar * U_EXPORT2 | |
368 | u_strrchr(const UChar *s, UChar c) { | |
369 | if(U16_IS_SURROGATE(c)) { | |
370 | /* make sure to not find half of a surrogate pair */ | |
371 | return u_strFindLast(s, -1, &c, 1); | |
372 | } else { | |
373 | const UChar *result=NULL; | |
374 | UChar cs; | |
375 | ||
376 | /* trivial search for a BMP code point */ | |
377 | for(;;) { | |
378 | if((cs=*s)==c) { | |
379 | result=s; | |
380 | } | |
381 | if(cs==0) { | |
382 | return (UChar *)result; | |
383 | } | |
384 | ++s; | |
385 | } | |
386 | } | |
387 | } | |
388 | ||
389 | U_CAPI UChar * U_EXPORT2 | |
390 | u_strrchr32(const UChar *s, UChar32 c) { | |
391 | if((uint32_t)c<=U_BMP_MAX) { | |
392 | /* find BMP code point */ | |
393 | return u_strrchr(s, (UChar)c); | |
394 | } else if((uint32_t)c<=UCHAR_MAX_VALUE) { | |
395 | /* find supplementary code point as surrogate pair */ | |
396 | const UChar *result=NULL; | |
397 | UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); | |
398 | ||
399 | while((cs=*s++)!=0) { | |
400 | if(cs==lead && *s==trail) { | |
401 | result=s-1; | |
402 | } | |
403 | } | |
404 | return (UChar *)result; | |
405 | } else { | |
406 | /* not a Unicode code point, not findable */ | |
407 | return NULL; | |
408 | } | |
409 | } | |
410 | ||
411 | U_CAPI UChar * U_EXPORT2 | |
412 | u_memrchr(const UChar *s, UChar c, int32_t count) { | |
413 | if(count<=0) { | |
414 | return NULL; /* no string */ | |
415 | } else if(U16_IS_SURROGATE(c)) { | |
416 | /* make sure to not find half of a surrogate pair */ | |
417 | return u_strFindLast(s, count, &c, 1); | |
418 | } else { | |
419 | /* trivial search for a BMP code point */ | |
420 | const UChar *limit=s+count; | |
421 | do { | |
422 | if(*(--limit)==c) { | |
423 | return (UChar *)limit; | |
424 | } | |
425 | } while(s!=limit); | |
426 | return NULL; | |
427 | } | |
428 | } | |
429 | ||
430 | U_CAPI UChar * U_EXPORT2 | |
431 | u_memrchr32(const UChar *s, UChar32 c, int32_t count) { | |
432 | if((uint32_t)c<=U_BMP_MAX) { | |
433 | /* find BMP code point */ | |
434 | return u_memrchr(s, (UChar)c, count); | |
435 | } else if(count<2) { | |
436 | /* too short for a surrogate pair */ | |
437 | return NULL; | |
438 | } else if((uint32_t)c<=UCHAR_MAX_VALUE) { | |
439 | /* find supplementary code point as surrogate pair */ | |
440 | const UChar *limit=s+count-1; | |
441 | UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); | |
442 | ||
443 | do { | |
444 | if(*limit==trail && *(limit-1)==lead) { | |
445 | return (UChar *)(limit-1); | |
446 | } | |
447 | } while(s!=--limit); | |
448 | return NULL; | |
449 | } else { | |
450 | /* not a Unicode code point, not findable */ | |
451 | return NULL; | |
452 | } | |
453 | } | |
454 | ||
455 | /* Tokenization functions --------------------------------------------------- */ | |
456 | ||
457 | /* | |
458 | * Match each code point in a string against each code point in the matchSet. | |
459 | * Return the index of the first string code point that | |
460 | * is (polarity==TRUE) or is not (FALSE) contained in the matchSet. | |
461 | * Return -(string length)-1 if there is no such code point. | |
462 | */ | |
463 | static int32_t | |
464 | _matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) { | |
465 | int32_t matchLen, matchBMPLen, strItr, matchItr; | |
466 | UChar32 stringCh, matchCh; | |
467 | UChar c, c2; | |
468 | ||
469 | /* first part of matchSet contains only BMP code points */ | |
470 | matchBMPLen = 0; | |
471 | while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) { | |
472 | ++matchBMPLen; | |
473 | } | |
474 | ||
475 | /* second part of matchSet contains BMP and supplementary code points */ | |
476 | matchLen = matchBMPLen; | |
477 | while(matchSet[matchLen] != 0) { | |
478 | ++matchLen; | |
479 | } | |
480 | ||
481 | for(strItr = 0; (c = string[strItr]) != 0;) { | |
482 | ++strItr; | |
483 | if(U16_IS_SINGLE(c)) { | |
484 | if(polarity) { | |
485 | for(matchItr = 0; matchItr < matchLen; ++matchItr) { | |
486 | if(c == matchSet[matchItr]) { | |
487 | return strItr - 1; /* one matches */ | |
488 | } | |
489 | } | |
490 | } else { | |
491 | for(matchItr = 0; matchItr < matchLen; ++matchItr) { | |
492 | if(c == matchSet[matchItr]) { | |
493 | goto endloop; | |
494 | } | |
495 | } | |
496 | return strItr - 1; /* none matches */ | |
497 | } | |
498 | } else { | |
499 | /* | |
500 | * No need to check for string length before U16_IS_TRAIL | |
501 | * because c2 could at worst be the terminating NUL. | |
502 | */ | |
503 | if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) { | |
504 | ++strItr; | |
505 | stringCh = U16_GET_SUPPLEMENTARY(c, c2); | |
506 | } else { | |
507 | stringCh = c; /* unpaired trail surrogate */ | |
508 | } | |
509 | ||
510 | if(polarity) { | |
511 | for(matchItr = matchBMPLen; matchItr < matchLen;) { | |
512 | U16_NEXT(matchSet, matchItr, matchLen, matchCh); | |
513 | if(stringCh == matchCh) { | |
514 | return strItr - U16_LENGTH(stringCh); /* one matches */ | |
515 | } | |
516 | } | |
517 | } else { | |
518 | for(matchItr = matchBMPLen; matchItr < matchLen;) { | |
519 | U16_NEXT(matchSet, matchItr, matchLen, matchCh); | |
520 | if(stringCh == matchCh) { | |
521 | goto endloop; | |
522 | } | |
523 | } | |
524 | return strItr - U16_LENGTH(stringCh); /* none matches */ | |
525 | } | |
526 | } | |
527 | endloop: | |
528 | /* wish C had continue with labels like Java... */; | |
529 | } | |
530 | ||
531 | /* Didn't find it. */ | |
532 | return -strItr-1; | |
533 | } | |
534 | ||
535 | /* Search for a codepoint in a string that matches one of the matchSet codepoints. */ | |
536 | U_CAPI UChar * U_EXPORT2 | |
537 | u_strpbrk(const UChar *string, const UChar *matchSet) | |
538 | { | |
539 | int32_t index = _matchFromSet(string, matchSet, TRUE); | |
540 | if(index >= 0) { | |
541 | return (UChar *)string + index; | |
542 | } else { | |
543 | return NULL; | |
544 | } | |
545 | } | |
546 | ||
547 | /* Search for a codepoint in a string that matches one of the matchSet codepoints. */ | |
548 | U_CAPI int32_t U_EXPORT2 | |
549 | u_strcspn(const UChar *string, const UChar *matchSet) | |
550 | { | |
551 | int32_t index = _matchFromSet(string, matchSet, TRUE); | |
552 | if(index >= 0) { | |
553 | return index; | |
554 | } else { | |
555 | return -index - 1; /* == u_strlen(string) */ | |
556 | } | |
557 | } | |
558 | ||
559 | /* Search for a codepoint in a string that does not match one of the matchSet codepoints. */ | |
560 | U_CAPI int32_t U_EXPORT2 | |
561 | u_strspn(const UChar *string, const UChar *matchSet) | |
562 | { | |
563 | int32_t index = _matchFromSet(string, matchSet, FALSE); | |
564 | if(index >= 0) { | |
565 | return index; | |
566 | } else { | |
567 | return -index - 1; /* == u_strlen(string) */ | |
568 | } | |
569 | } | |
570 | ||
571 | /* ----- Text manipulation functions --- */ | |
572 | ||
573 | U_CAPI UChar* U_EXPORT2 | |
574 | u_strtok_r(UChar *src, | |
575 | const UChar *delim, | |
576 | UChar **saveState) | |
577 | { | |
578 | UChar *tokSource; | |
579 | UChar *nextToken; | |
580 | uint32_t nonDelimIdx; | |
581 | ||
582 | /* If saveState is NULL, the user messed up. */ | |
583 | if (src != NULL) { | |
584 | tokSource = src; | |
585 | *saveState = src; /* Set to "src" in case there are no delimiters */ | |
586 | } | |
587 | else if (*saveState) { | |
588 | tokSource = *saveState; | |
589 | } | |
590 | else { | |
591 | /* src == NULL && *saveState == NULL */ | |
592 | /* This shouldn't happen. We already finished tokenizing. */ | |
593 | return NULL; | |
594 | } | |
595 | ||
596 | /* Skip initial delimiters */ | |
597 | nonDelimIdx = u_strspn(tokSource, delim); | |
598 | tokSource = &tokSource[nonDelimIdx]; | |
599 | ||
600 | if (*tokSource) { | |
601 | nextToken = u_strpbrk(tokSource, delim); | |
602 | if (nextToken != NULL) { | |
603 | /* Create a token */ | |
604 | *(nextToken++) = 0; | |
605 | *saveState = nextToken; | |
606 | return tokSource; | |
607 | } | |
608 | else if (*saveState) { | |
609 | /* Return the last token */ | |
610 | *saveState = NULL; | |
611 | return tokSource; | |
612 | } | |
613 | } | |
614 | else { | |
615 | /* No tokens were found. Only delimiters were left. */ | |
616 | *saveState = NULL; | |
617 | } | |
618 | return NULL; | |
619 | } | |
620 | ||
621 | /* Miscellaneous functions -------------------------------------------------- */ | |
622 | ||
623 | U_CAPI UChar* U_EXPORT2 | |
624 | u_strcat(UChar *dst, | |
625 | const UChar *src) | |
626 | { | |
627 | UChar *anchor = dst; /* save a pointer to start of dst */ | |
628 | ||
629 | while(*dst != 0) { /* To end of first string */ | |
630 | ++dst; | |
631 | } | |
632 | while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ | |
633 | } | |
634 | ||
635 | return anchor; | |
636 | } | |
637 | ||
638 | U_CAPI UChar* U_EXPORT2 | |
639 | u_strncat(UChar *dst, | |
640 | const UChar *src, | |
641 | int32_t n ) | |
642 | { | |
643 | if(n > 0) { | |
644 | UChar *anchor = dst; /* save a pointer to start of dst */ | |
645 | ||
646 | while(*dst != 0) { /* To end of first string */ | |
647 | ++dst; | |
648 | } | |
649 | while((*dst = *src) != 0) { /* copy string 2 over */ | |
650 | ++dst; | |
651 | if(--n == 0) { | |
652 | *dst = 0; | |
653 | break; | |
654 | } | |
655 | ++src; | |
656 | } | |
657 | ||
658 | return anchor; | |
659 | } else { | |
660 | return dst; | |
661 | } | |
662 | } | |
663 | ||
664 | /* ----- Text property functions --- */ | |
665 | ||
666 | U_CAPI int32_t U_EXPORT2 | |
667 | u_strcmp(const UChar *s1, | |
668 | const UChar *s2) | |
669 | { | |
670 | UChar c1, c2; | |
671 | ||
672 | for(;;) { | |
673 | c1=*s1++; | |
674 | c2=*s2++; | |
675 | if (c1 != c2 || c1 == 0) { | |
676 | break; | |
677 | } | |
678 | } | |
679 | return (int32_t)c1 - (int32_t)c2; | |
680 | } | |
681 | ||
46f4442e | 682 | U_CFUNC int32_t U_EXPORT2 |
b75a7d8f A |
683 | uprv_strCompare(const UChar *s1, int32_t length1, |
684 | const UChar *s2, int32_t length2, | |
685 | UBool strncmpStyle, UBool codePointOrder) { | |
686 | const UChar *start1, *start2, *limit1, *limit2; | |
687 | UChar c1, c2; | |
688 | ||
689 | /* setup for fix-up */ | |
690 | start1=s1; | |
691 | start2=s2; | |
692 | ||
693 | /* compare identical prefixes - they do not need to be fixed up */ | |
694 | if(length1<0 && length2<0) { | |
695 | /* strcmp style, both NUL-terminated */ | |
696 | if(s1==s2) { | |
697 | return 0; | |
698 | } | |
699 | ||
700 | for(;;) { | |
701 | c1=*s1; | |
702 | c2=*s2; | |
703 | if(c1!=c2) { | |
704 | break; | |
705 | } | |
706 | if(c1==0) { | |
707 | return 0; | |
708 | } | |
709 | ++s1; | |
710 | ++s2; | |
711 | } | |
712 | ||
713 | /* setup for fix-up */ | |
714 | limit1=limit2=NULL; | |
715 | } else if(strncmpStyle) { | |
716 | /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */ | |
717 | if(s1==s2) { | |
718 | return 0; | |
719 | } | |
720 | ||
721 | limit1=start1+length1; | |
722 | ||
723 | for(;;) { | |
724 | /* both lengths are same, check only one limit */ | |
725 | if(s1==limit1) { | |
726 | return 0; | |
727 | } | |
728 | ||
729 | c1=*s1; | |
730 | c2=*s2; | |
731 | if(c1!=c2) { | |
732 | break; | |
733 | } | |
734 | if(c1==0) { | |
735 | return 0; | |
736 | } | |
737 | ++s1; | |
738 | ++s2; | |
739 | } | |
740 | ||
741 | /* setup for fix-up */ | |
742 | limit2=start2+length1; /* use length1 here, too, to enforce assumption */ | |
743 | } else { | |
744 | /* memcmp/UnicodeString style, both length-specified */ | |
745 | int32_t lengthResult; | |
746 | ||
747 | if(length1<0) { | |
748 | length1=u_strlen(s1); | |
749 | } | |
750 | if(length2<0) { | |
751 | length2=u_strlen(s2); | |
752 | } | |
753 | ||
754 | /* limit1=start1+min(lenght1, length2) */ | |
755 | if(length1<length2) { | |
756 | lengthResult=-1; | |
757 | limit1=start1+length1; | |
758 | } else if(length1==length2) { | |
759 | lengthResult=0; | |
760 | limit1=start1+length1; | |
761 | } else /* length1>length2 */ { | |
762 | lengthResult=1; | |
763 | limit1=start1+length2; | |
764 | } | |
765 | ||
766 | if(s1==s2) { | |
767 | return lengthResult; | |
768 | } | |
769 | ||
770 | for(;;) { | |
771 | /* check pseudo-limit */ | |
772 | if(s1==limit1) { | |
773 | return lengthResult; | |
774 | } | |
775 | ||
776 | c1=*s1; | |
777 | c2=*s2; | |
778 | if(c1!=c2) { | |
779 | break; | |
780 | } | |
781 | ++s1; | |
782 | ++s2; | |
783 | } | |
784 | ||
785 | /* setup for fix-up */ | |
786 | limit1=start1+length1; | |
787 | limit2=start2+length2; | |
788 | } | |
789 | ||
790 | /* if both values are in or above the surrogate range, fix them up */ | |
791 | if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { | |
792 | /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ | |
793 | if( | |
794 | (c1<=0xdbff && (s1+1)!=limit1 && UTF_IS_TRAIL(*(s1+1))) || | |
795 | (UTF_IS_TRAIL(c1) && start1!=s1 && UTF_IS_LEAD(*(s1-1))) | |
796 | ) { | |
797 | /* part of a surrogate pair, leave >=d800 */ | |
798 | } else { | |
799 | /* BMP code point - may be surrogate code point - make <d800 */ | |
800 | c1-=0x2800; | |
801 | } | |
802 | ||
803 | if( | |
804 | (c2<=0xdbff && (s2+1)!=limit2 && UTF_IS_TRAIL(*(s2+1))) || | |
805 | (UTF_IS_TRAIL(c2) && start2!=s2 && UTF_IS_LEAD(*(s2-1))) | |
806 | ) { | |
807 | /* part of a surrogate pair, leave >=d800 */ | |
808 | } else { | |
809 | /* BMP code point - may be surrogate code point - make <d800 */ | |
810 | c2-=0x2800; | |
811 | } | |
812 | } | |
813 | ||
814 | /* now c1 and c2 are in the requested (code unit or code point) order */ | |
815 | return (int32_t)c1-(int32_t)c2; | |
816 | } | |
817 | ||
818 | /* | |
819 | * Compare two strings as presented by UCharIterators. | |
820 | * Use code unit or code point order. | |
821 | * When the function returns, it is undefined where the iterators | |
822 | * have stopped. | |
823 | */ | |
824 | U_CAPI int32_t U_EXPORT2 | |
825 | u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) { | |
826 | UChar32 c1, c2; | |
827 | ||
828 | /* argument checking */ | |
829 | if(iter1==NULL || iter2==NULL) { | |
830 | return 0; /* bad arguments */ | |
831 | } | |
832 | if(iter1==iter2) { | |
833 | return 0; /* identical iterators */ | |
834 | } | |
835 | ||
836 | /* reset iterators to start? */ | |
837 | iter1->move(iter1, 0, UITER_START); | |
838 | iter2->move(iter2, 0, UITER_START); | |
839 | ||
840 | /* compare identical prefixes - they do not need to be fixed up */ | |
841 | for(;;) { | |
842 | c1=iter1->next(iter1); | |
843 | c2=iter2->next(iter2); | |
844 | if(c1!=c2) { | |
845 | break; | |
846 | } | |
847 | if(c1==-1) { | |
848 | return 0; | |
849 | } | |
850 | } | |
851 | ||
852 | /* if both values are in or above the surrogate range, fix them up */ | |
853 | if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { | |
854 | /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ | |
855 | if( | |
856 | (c1<=0xdbff && UTF_IS_TRAIL(iter1->current(iter1))) || | |
857 | (UTF_IS_TRAIL(c1) && (iter1->previous(iter1), UTF_IS_LEAD(iter1->previous(iter1)))) | |
858 | ) { | |
859 | /* part of a surrogate pair, leave >=d800 */ | |
860 | } else { | |
861 | /* BMP code point - may be surrogate code point - make <d800 */ | |
862 | c1-=0x2800; | |
863 | } | |
864 | ||
865 | if( | |
866 | (c2<=0xdbff && UTF_IS_TRAIL(iter2->current(iter2))) || | |
867 | (UTF_IS_TRAIL(c2) && (iter2->previous(iter2), UTF_IS_LEAD(iter2->previous(iter2)))) | |
868 | ) { | |
869 | /* part of a surrogate pair, leave >=d800 */ | |
870 | } else { | |
871 | /* BMP code point - may be surrogate code point - make <d800 */ | |
872 | c2-=0x2800; | |
873 | } | |
874 | } | |
875 | ||
876 | /* now c1 and c2 are in the requested (code unit or code point) order */ | |
877 | return (int32_t)c1-(int32_t)c2; | |
878 | } | |
879 | ||
880 | #if 0 | |
881 | /* | |
882 | * u_strCompareIter() does not leave the iterators _on_ the different units. | |
883 | * This is possible but would cost a few extra indirect function calls to back | |
884 | * up if the last unit (c1 or c2 respectively) was >=0. | |
885 | * | |
886 | * Consistently leaving them _behind_ the different units is not an option | |
887 | * because the current "unit" is the end of the string if that is reached, | |
888 | * and in such a case the iterator does not move. | |
889 | * For example, when comparing "ab" with "abc", both iterators rest _on_ the end | |
890 | * of their strings. Calling previous() on each does not move them to where | |
891 | * the comparison fails. | |
892 | * | |
893 | * So the simplest semantics is to not define where the iterators end up. | |
894 | * | |
895 | * The following fragment is part of what would need to be done for backing up. | |
896 | */ | |
897 | void fragment { | |
898 | /* iff a surrogate is part of a surrogate pair, leave >=d800 */ | |
899 | if(c1<=0xdbff) { | |
900 | if(!UTF_IS_TRAIL(iter1->current(iter1))) { | |
901 | /* lead surrogate code point - make <d800 */ | |
902 | c1-=0x2800; | |
903 | } | |
904 | } else if(c1<=0xdfff) { | |
905 | int32_t index=iter1->getIndex(iter1, UITER_CURRENT); | |
906 | iter1->previous(iter1); /* ==c1 */ | |
907 | if(!UTF_IS_LEAD(iter1->previous(iter1))) { | |
908 | /* trail surrogate code point - make <d800 */ | |
909 | c1-=0x2800; | |
910 | } | |
911 | /* go back to behind where the difference is */ | |
912 | iter1->move(iter1, index, UITER_ZERO); | |
913 | } else /* 0xe000<=c1<=0xffff */ { | |
914 | /* BMP code point - make <d800 */ | |
915 | c1-=0x2800; | |
916 | } | |
917 | } | |
918 | #endif | |
919 | ||
920 | U_CAPI int32_t U_EXPORT2 | |
921 | u_strCompare(const UChar *s1, int32_t length1, | |
922 | const UChar *s2, int32_t length2, | |
923 | UBool codePointOrder) { | |
924 | /* argument checking */ | |
925 | if(s1==NULL || length1<-1 || s2==NULL || length2<-1) { | |
926 | return 0; | |
927 | } | |
928 | return uprv_strCompare(s1, length1, s2, length2, FALSE, codePointOrder); | |
929 | } | |
930 | ||
931 | /* String compare in code point order - u_strcmp() compares in code unit order. */ | |
932 | U_CAPI int32_t U_EXPORT2 | |
933 | u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) { | |
934 | return uprv_strCompare(s1, -1, s2, -1, FALSE, TRUE); | |
935 | } | |
936 | ||
937 | U_CAPI int32_t U_EXPORT2 | |
938 | u_strncmp(const UChar *s1, | |
939 | const UChar *s2, | |
940 | int32_t n) | |
941 | { | |
942 | if(n > 0) { | |
943 | int32_t rc; | |
944 | for(;;) { | |
945 | rc = (int32_t)*s1 - (int32_t)*s2; | |
946 | if(rc != 0 || *s1 == 0 || --n == 0) { | |
947 | return rc; | |
948 | } | |
949 | ++s1; | |
950 | ++s2; | |
951 | } | |
952 | } else { | |
953 | return 0; | |
954 | } | |
955 | } | |
956 | ||
957 | U_CAPI int32_t U_EXPORT2 | |
958 | u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) { | |
959 | return uprv_strCompare(s1, n, s2, n, TRUE, TRUE); | |
960 | } | |
961 | ||
962 | U_CAPI UChar* U_EXPORT2 | |
963 | u_strcpy(UChar *dst, | |
964 | const UChar *src) | |
965 | { | |
966 | UChar *anchor = dst; /* save a pointer to start of dst */ | |
967 | ||
968 | while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ | |
969 | } | |
970 | ||
971 | return anchor; | |
972 | } | |
973 | ||
974 | U_CAPI UChar* U_EXPORT2 | |
975 | u_strncpy(UChar *dst, | |
976 | const UChar *src, | |
977 | int32_t n) | |
978 | { | |
979 | UChar *anchor = dst; /* save a pointer to start of dst */ | |
980 | ||
981 | /* copy string 2 over */ | |
982 | while(n > 0 && (*(dst++) = *(src++)) != 0) { | |
983 | --n; | |
984 | } | |
985 | ||
986 | return anchor; | |
987 | } | |
988 | ||
989 | U_CAPI int32_t U_EXPORT2 | |
990 | u_strlen(const UChar *s) | |
991 | { | |
992 | #if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR | |
73c04bcf | 993 | return (int32_t)uprv_wcslen(s); |
b75a7d8f A |
994 | #else |
995 | const UChar *t = s; | |
996 | while(*t != 0) { | |
997 | ++t; | |
998 | } | |
999 | return t - s; | |
1000 | #endif | |
1001 | } | |
1002 | ||
1003 | U_CAPI int32_t U_EXPORT2 | |
1004 | u_countChar32(const UChar *s, int32_t length) { | |
1005 | int32_t count; | |
1006 | ||
1007 | if(s==NULL || length<-1) { | |
1008 | return 0; | |
1009 | } | |
1010 | ||
1011 | count=0; | |
1012 | if(length>=0) { | |
1013 | while(length>0) { | |
1014 | ++count; | |
1015 | if(UTF_IS_LEAD(*s) && length>=2 && UTF_IS_TRAIL(*(s+1))) { | |
1016 | s+=2; | |
1017 | length-=2; | |
1018 | } else { | |
1019 | ++s; | |
1020 | --length; | |
1021 | } | |
1022 | } | |
1023 | } else /* length==-1 */ { | |
1024 | UChar c; | |
1025 | ||
1026 | for(;;) { | |
1027 | if((c=*s++)==0) { | |
1028 | break; | |
1029 | } | |
1030 | ++count; | |
1031 | ||
1032 | /* | |
1033 | * sufficient to look ahead one because of UTF-16; | |
1034 | * safe to look ahead one because at worst that would be the terminating NUL | |
1035 | */ | |
1036 | if(UTF_IS_LEAD(c) && UTF_IS_TRAIL(*s)) { | |
1037 | ++s; | |
1038 | } | |
1039 | } | |
1040 | } | |
1041 | return count; | |
1042 | } | |
1043 | ||
1044 | U_CAPI UBool U_EXPORT2 | |
1045 | u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) { | |
1046 | ||
1047 | if(number<0) { | |
1048 | return TRUE; | |
1049 | } | |
1050 | if(s==NULL || length<-1) { | |
1051 | return FALSE; | |
1052 | } | |
1053 | ||
1054 | if(length==-1) { | |
1055 | /* s is NUL-terminated */ | |
1056 | UChar c; | |
1057 | ||
1058 | /* count code points until they exceed */ | |
1059 | for(;;) { | |
1060 | if((c=*s++)==0) { | |
1061 | return FALSE; | |
1062 | } | |
1063 | if(number==0) { | |
1064 | return TRUE; | |
1065 | } | |
1066 | if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) { | |
1067 | ++s; | |
1068 | } | |
1069 | --number; | |
1070 | } | |
1071 | } else { | |
1072 | /* length>=0 known */ | |
1073 | const UChar *limit; | |
1074 | int32_t maxSupplementary; | |
1075 | ||
1076 | /* s contains at least (length+1)/2 code points: <=2 UChars per cp */ | |
1077 | if(((length+1)/2)>number) { | |
1078 | return TRUE; | |
1079 | } | |
1080 | ||
1081 | /* check if s does not even contain enough UChars */ | |
1082 | maxSupplementary=length-number; | |
1083 | if(maxSupplementary<=0) { | |
1084 | return FALSE; | |
1085 | } | |
1086 | /* there are maxSupplementary=length-number more UChars than asked-for code points */ | |
1087 | ||
1088 | /* | |
1089 | * count code points until they exceed and also check that there are | |
1090 | * no more than maxSupplementary supplementary code points (UChar pairs) | |
1091 | */ | |
1092 | limit=s+length; | |
1093 | for(;;) { | |
1094 | if(s==limit) { | |
1095 | return FALSE; | |
1096 | } | |
1097 | if(number==0) { | |
1098 | return TRUE; | |
1099 | } | |
1100 | if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) { | |
1101 | ++s; | |
1102 | if(--maxSupplementary<=0) { | |
1103 | /* too many pairs - too few code points */ | |
1104 | return FALSE; | |
1105 | } | |
1106 | } | |
1107 | --number; | |
1108 | } | |
1109 | } | |
1110 | } | |
1111 | ||
1112 | U_CAPI UChar * U_EXPORT2 | |
1113 | u_memcpy(UChar *dest, const UChar *src, int32_t count) { | |
1114 | return (UChar *)uprv_memcpy(dest, src, count*U_SIZEOF_UCHAR); | |
1115 | } | |
1116 | ||
1117 | U_CAPI UChar * U_EXPORT2 | |
1118 | u_memmove(UChar *dest, const UChar *src, int32_t count) { | |
1119 | return (UChar *)uprv_memmove(dest, src, count*U_SIZEOF_UCHAR); | |
1120 | } | |
1121 | ||
1122 | U_CAPI UChar * U_EXPORT2 | |
1123 | u_memset(UChar *dest, UChar c, int32_t count) { | |
1124 | if(count > 0) { | |
1125 | UChar *ptr = dest; | |
1126 | UChar *limit = dest + count; | |
1127 | ||
1128 | while (ptr < limit) { | |
1129 | *(ptr++) = c; | |
1130 | } | |
1131 | } | |
1132 | return dest; | |
1133 | } | |
1134 | ||
1135 | U_CAPI int32_t U_EXPORT2 | |
1136 | u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) { | |
1137 | if(count > 0) { | |
1138 | const UChar *limit = buf1 + count; | |
1139 | int32_t result; | |
1140 | ||
1141 | while (buf1 < limit) { | |
1142 | result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2; | |
1143 | if (result != 0) { | |
1144 | return result; | |
1145 | } | |
1146 | buf1++; | |
1147 | buf2++; | |
1148 | } | |
1149 | } | |
1150 | return 0; | |
1151 | } | |
1152 | ||
1153 | U_CAPI int32_t U_EXPORT2 | |
1154 | u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) { | |
1155 | return uprv_strCompare(s1, count, s2, count, FALSE, TRUE); | |
1156 | } | |
1157 | ||
b75a7d8f A |
1158 | /* u_unescape & support fns ------------------------------------------------- */ |
1159 | ||
1160 | /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ | |
1161 | static const UChar UNESCAPE_MAP[] = { | |
1162 | /*" 0x22, 0x22 */ | |
1163 | /*' 0x27, 0x27 */ | |
1164 | /*? 0x3F, 0x3F */ | |
1165 | /*\ 0x5C, 0x5C */ | |
1166 | /*a*/ 0x61, 0x07, | |
1167 | /*b*/ 0x62, 0x08, | |
1168 | /*e*/ 0x65, 0x1b, | |
1169 | /*f*/ 0x66, 0x0c, | |
1170 | /*n*/ 0x6E, 0x0a, | |
1171 | /*r*/ 0x72, 0x0d, | |
1172 | /*t*/ 0x74, 0x09, | |
1173 | /*v*/ 0x76, 0x0b | |
1174 | }; | |
1175 | enum { UNESCAPE_MAP_LENGTH = sizeof(UNESCAPE_MAP) / sizeof(UNESCAPE_MAP[0]) }; | |
1176 | ||
1177 | /* Convert one octal digit to a numeric value 0..7, or -1 on failure */ | |
1178 | static int8_t _digit8(UChar c) { | |
1179 | if (c >= 0x0030 && c <= 0x0037) { | |
1180 | return (int8_t)(c - 0x0030); | |
1181 | } | |
1182 | return -1; | |
1183 | } | |
1184 | ||
1185 | /* Convert one hex digit to a numeric value 0..F, or -1 on failure */ | |
1186 | static int8_t _digit16(UChar c) { | |
1187 | if (c >= 0x0030 && c <= 0x0039) { | |
1188 | return (int8_t)(c - 0x0030); | |
1189 | } | |
1190 | if (c >= 0x0041 && c <= 0x0046) { | |
1191 | return (int8_t)(c - (0x0041 - 10)); | |
1192 | } | |
1193 | if (c >= 0x0061 && c <= 0x0066) { | |
1194 | return (int8_t)(c - (0x0061 - 10)); | |
1195 | } | |
1196 | return -1; | |
1197 | } | |
1198 | ||
1199 | /* Parse a single escape sequence. Although this method deals in | |
1200 | * UChars, it does not use C++ or UnicodeString. This allows it to | |
1201 | * be used from C contexts. */ | |
1202 | U_CAPI UChar32 U_EXPORT2 | |
1203 | u_unescapeAt(UNESCAPE_CHAR_AT charAt, | |
1204 | int32_t *offset, | |
1205 | int32_t length, | |
1206 | void *context) { | |
1207 | ||
1208 | int32_t start = *offset; | |
1209 | UChar c; | |
1210 | UChar32 result = 0; | |
1211 | int8_t n = 0; | |
1212 | int8_t minDig = 0; | |
1213 | int8_t maxDig = 0; | |
1214 | int8_t bitsPerDigit = 4; | |
1215 | int8_t dig; | |
1216 | int32_t i; | |
1217 | UBool braces = FALSE; | |
1218 | ||
1219 | /* Check that offset is in range */ | |
1220 | if (*offset < 0 || *offset >= length) { | |
1221 | goto err; | |
1222 | } | |
1223 | ||
1224 | /* Fetch first UChar after '\\' */ | |
1225 | c = charAt((*offset)++, context); | |
1226 | ||
1227 | /* Convert hexadecimal and octal escapes */ | |
1228 | switch (c) { | |
1229 | case 0x0075 /*'u'*/: | |
1230 | minDig = maxDig = 4; | |
1231 | break; | |
1232 | case 0x0055 /*'U'*/: | |
1233 | minDig = maxDig = 8; | |
1234 | break; | |
1235 | case 0x0078 /*'x'*/: | |
1236 | minDig = 1; | |
1237 | if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) { | |
1238 | ++(*offset); | |
1239 | braces = TRUE; | |
1240 | maxDig = 8; | |
1241 | } else { | |
1242 | maxDig = 2; | |
1243 | } | |
1244 | break; | |
1245 | default: | |
1246 | dig = _digit8(c); | |
1247 | if (dig >= 0) { | |
1248 | minDig = 1; | |
1249 | maxDig = 3; | |
1250 | n = 1; /* Already have first octal digit */ | |
1251 | bitsPerDigit = 3; | |
1252 | result = dig; | |
1253 | } | |
1254 | break; | |
1255 | } | |
1256 | if (minDig != 0) { | |
1257 | while (*offset < length && n < maxDig) { | |
1258 | c = charAt(*offset, context); | |
1259 | dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c)); | |
1260 | if (dig < 0) { | |
1261 | break; | |
1262 | } | |
1263 | result = (result << bitsPerDigit) | dig; | |
1264 | ++(*offset); | |
1265 | ++n; | |
1266 | } | |
1267 | if (n < minDig) { | |
1268 | goto err; | |
1269 | } | |
1270 | if (braces) { | |
1271 | if (c != 0x7D /*}*/) { | |
1272 | goto err; | |
1273 | } | |
1274 | ++(*offset); | |
1275 | } | |
374ca955 A |
1276 | if (result < 0 || result >= 0x110000) { |
1277 | goto err; | |
1278 | } | |
1279 | /* If an escape sequence specifies a lead surrogate, see if | |
1280 | * there is a trail surrogate after it, either as an escape or | |
1281 | * as a literal. If so, join them up into a supplementary. | |
1282 | */ | |
1283 | if (*offset < length && U16_IS_LEAD(result)) { | |
1284 | int32_t ahead = *offset + 1; | |
1285 | c = charAt(*offset, context); | |
1286 | if (c == 0x5C /*'\\'*/ && ahead < length) { | |
1287 | c = (UChar) u_unescapeAt(charAt, &ahead, length, context); | |
1288 | } | |
1289 | if (U16_IS_TRAIL(c)) { | |
1290 | *offset = ahead; | |
1291 | result = U16_GET_SUPPLEMENTARY(result, c); | |
1292 | } | |
1293 | } | |
b75a7d8f A |
1294 | return result; |
1295 | } | |
1296 | ||
1297 | /* Convert C-style escapes in table */ | |
1298 | for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) { | |
1299 | if (c == UNESCAPE_MAP[i]) { | |
1300 | return UNESCAPE_MAP[i+1]; | |
1301 | } else if (c < UNESCAPE_MAP[i]) { | |
1302 | break; | |
1303 | } | |
1304 | } | |
1305 | ||
1306 | /* Map \cX to control-X: X & 0x1F */ | |
1307 | if (c == 0x0063 /*'c'*/ && *offset < length) { | |
1308 | c = charAt((*offset)++, context); | |
1309 | if (UTF_IS_FIRST_SURROGATE(c) && *offset < length) { | |
1310 | UChar c2 = charAt(*offset, context); | |
1311 | if (UTF_IS_SECOND_SURROGATE(c2)) { | |
1312 | ++(*offset); | |
1313 | c = (UChar) UTF16_GET_PAIR_VALUE(c, c2); /* [sic] */ | |
1314 | } | |
1315 | } | |
1316 | return 0x1F & c; | |
1317 | } | |
1318 | ||
1319 | /* If no special forms are recognized, then consider | |
1320 | * the backslash to generically escape the next character. | |
1321 | * Deal with surrogate pairs. */ | |
1322 | if (UTF_IS_FIRST_SURROGATE(c) && *offset < length) { | |
1323 | UChar c2 = charAt(*offset, context); | |
1324 | if (UTF_IS_SECOND_SURROGATE(c2)) { | |
1325 | ++(*offset); | |
1326 | return UTF16_GET_PAIR_VALUE(c, c2); | |
1327 | } | |
1328 | } | |
1329 | return c; | |
1330 | ||
1331 | err: | |
1332 | /* Invalid escape sequence */ | |
1333 | *offset = start; /* Reset to initial value */ | |
1334 | return (UChar32)0xFFFFFFFF; | |
1335 | } | |
1336 | ||
1337 | /* u_unescapeAt() callback to return a UChar from a char* */ | |
1338 | static UChar U_CALLCONV | |
1339 | _charPtr_charAt(int32_t offset, void *context) { | |
1340 | UChar c16; | |
1341 | /* It would be more efficient to access the invariant tables | |
1342 | * directly but there is no API for that. */ | |
1343 | u_charsToUChars(((char*) context) + offset, &c16, 1); | |
1344 | return c16; | |
1345 | } | |
1346 | ||
1347 | /* Append an escape-free segment of the text; used by u_unescape() */ | |
1348 | static void _appendUChars(UChar *dest, int32_t destCapacity, | |
1349 | const char *src, int32_t srcLen) { | |
1350 | if (destCapacity < 0) { | |
1351 | destCapacity = 0; | |
1352 | } | |
1353 | if (srcLen > destCapacity) { | |
1354 | srcLen = destCapacity; | |
1355 | } | |
1356 | u_charsToUChars(src, dest, srcLen); | |
1357 | } | |
1358 | ||
1359 | /* Do an invariant conversion of char* -> UChar*, with escape parsing */ | |
1360 | U_CAPI int32_t U_EXPORT2 | |
1361 | u_unescape(const char *src, UChar *dest, int32_t destCapacity) { | |
1362 | const char *segment = src; | |
1363 | int32_t i = 0; | |
1364 | char c; | |
1365 | ||
1366 | while ((c=*src) != 0) { | |
1367 | /* '\\' intentionally written as compiler-specific | |
1368 | * character constant to correspond to compiler-specific | |
1369 | * char* constants. */ | |
1370 | if (c == '\\') { | |
1371 | int32_t lenParsed = 0; | |
1372 | UChar32 c32; | |
1373 | if (src != segment) { | |
1374 | if (dest != NULL) { | |
1375 | _appendUChars(dest + i, destCapacity - i, | |
1376 | segment, src - segment); | |
1377 | } | |
1378 | i += src - segment; | |
1379 | } | |
1380 | ++src; /* advance past '\\' */ | |
1381 | c32 = u_unescapeAt(_charPtr_charAt, &lenParsed, uprv_strlen(src), (void*)src); | |
1382 | if (lenParsed == 0) { | |
1383 | goto err; | |
1384 | } | |
1385 | src += lenParsed; /* advance past escape seq. */ | |
1386 | if (dest != NULL && UTF_CHAR_LENGTH(c32) <= (destCapacity - i)) { | |
1387 | UTF_APPEND_CHAR_UNSAFE(dest, i, c32); | |
1388 | } else { | |
1389 | i += UTF_CHAR_LENGTH(c32); | |
1390 | } | |
1391 | segment = src; | |
1392 | } else { | |
1393 | ++src; | |
1394 | } | |
1395 | } | |
1396 | if (src != segment) { | |
1397 | if (dest != NULL) { | |
1398 | _appendUChars(dest + i, destCapacity - i, | |
1399 | segment, src - segment); | |
1400 | } | |
1401 | i += src - segment; | |
1402 | } | |
1403 | if (dest != NULL && i < destCapacity) { | |
1404 | dest[i] = 0; | |
1405 | } | |
1406 | return i; | |
1407 | ||
1408 | err: | |
1409 | if (dest != NULL && destCapacity > 0) { | |
1410 | *dest = 0; | |
1411 | } | |
1412 | return 0; | |
1413 | } | |
1414 | ||
1415 | /* C UGrowBuffer implementation --------------------------------------------- */ | |
1416 | ||
1417 | U_CAPI UBool /* U_CALLCONV U_EXPORT2 */ | |
1418 | u_growBufferFromStatic(void *context, | |
1419 | UChar **pBuffer, int32_t *pCapacity, int32_t reqCapacity, | |
1420 | int32_t length) { | |
1421 | UChar *newBuffer=(UChar *)uprv_malloc(reqCapacity*U_SIZEOF_UCHAR); | |
1422 | if(newBuffer!=NULL) { | |
1423 | if(length>0) { | |
1424 | uprv_memcpy(newBuffer, *pBuffer, length*U_SIZEOF_UCHAR); | |
1425 | } | |
1426 | *pCapacity=reqCapacity; | |
1427 | } else { | |
1428 | *pCapacity=0; | |
1429 | } | |
1430 | ||
1431 | /* release the old pBuffer if it was not statically allocated */ | |
1432 | if(*pBuffer!=(UChar *)context) { | |
1433 | uprv_free(*pBuffer); | |
1434 | } | |
1435 | ||
1436 | *pBuffer=newBuffer; | |
1437 | return (UBool)(newBuffer!=NULL); | |
1438 | } | |
1439 | ||
1440 | /* NUL-termination of strings ----------------------------------------------- */ | |
1441 | ||
1442 | /** | |
1443 | * NUL-terminate a string no matter what its type. | |
1444 | * Set warning and error codes accordingly. | |
1445 | */ | |
1446 | #define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) \ | |
1447 | if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) { \ | |
1448 | /* not a public function, so no complete argument checking */ \ | |
1449 | \ | |
1450 | if(length<0) { \ | |
1451 | /* assume that the caller handles this */ \ | |
1452 | } else if(length<destCapacity) { \ | |
1453 | /* NUL-terminate the string, the NUL fits */ \ | |
1454 | dest[length]=0; \ | |
1455 | /* unset the not-terminated warning but leave all others */ \ | |
1456 | if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) { \ | |
1457 | *pErrorCode=U_ZERO_ERROR; \ | |
1458 | } \ | |
1459 | } else if(length==destCapacity) { \ | |
1460 | /* unable to NUL-terminate, but the string itself fit - set a warning code */ \ | |
1461 | *pErrorCode=U_STRING_NOT_TERMINATED_WARNING; \ | |
1462 | } else /* length>destCapacity */ { \ | |
1463 | /* even the string itself did not fit - set an error code */ \ | |
1464 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; \ | |
1465 | } \ | |
1466 | } | |
1467 | ||
1468 | U_CAPI int32_t U_EXPORT2 | |
1469 | u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { | |
1470 | __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); | |
1471 | return length; | |
1472 | } | |
1473 | ||
1474 | U_CAPI int32_t U_EXPORT2 | |
1475 | u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { | |
1476 | __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); | |
1477 | return length; | |
1478 | } | |
1479 | ||
1480 | U_CAPI int32_t U_EXPORT2 | |
1481 | u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { | |
1482 | __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); | |
1483 | return length; | |
1484 | } | |
1485 | ||
1486 | U_CAPI int32_t U_EXPORT2 | |
1487 | u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { | |
1488 | __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); | |
1489 | return length; | |
1490 | } |