]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ****************************************************************************** | |
3 | * | |
4388f060 | 4 | * Copyright (C) 1998-2011, International Business Machines |
b75a7d8f A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ****************************************************************************** | |
8 | * | |
4388f060 | 9 | * File ustring.cpp |
b75a7d8f A |
10 | * |
11 | * Modification History: | |
12 | * | |
13 | * Date Name Description | |
14 | * 12/07/98 bertrand Creation. | |
15 | ****************************************************************************** | |
16 | */ | |
17 | ||
18 | #include "unicode/utypes.h" | |
b75a7d8f | 19 | #include "unicode/putil.h" |
374ca955 | 20 | #include "unicode/ustring.h" |
4388f060 | 21 | #include "unicode/utf16.h" |
b75a7d8f A |
22 | #include "cstring.h" |
23 | #include "cwchar.h" | |
24 | #include "cmemory.h" | |
b75a7d8f A |
25 | #include "ustr_imp.h" |
26 | ||
b75a7d8f A |
27 | /* ANSI string.h - style functions ------------------------------------------ */ |
28 | ||
b75a7d8f A |
29 | /* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */ |
30 | #define U_BMP_MAX 0xffff | |
31 | ||
32 | /* Forward binary string search functions ----------------------------------- */ | |
33 | ||
34 | /* | |
35 | * Test if a substring match inside a string is at code point boundaries. | |
36 | * All pointers refer to the same buffer. | |
37 | * The limit pointer may be NULL, all others must be real pointers. | |
38 | */ | |
4388f060 | 39 | static inline UBool |
b75a7d8f A |
40 | isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) { |
41 | if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) { | |
42 | /* the leading edge of the match is in the middle of a surrogate pair */ | |
43 | return FALSE; | |
44 | } | |
45 | if(U16_IS_LEAD(*(matchLimit-1)) && match!=limit && U16_IS_TRAIL(*matchLimit)) { | |
46 | /* the trailing edge of the match is in the middle of a surrogate pair */ | |
47 | return FALSE; | |
48 | } | |
49 | return TRUE; | |
50 | } | |
51 | ||
52 | U_CAPI UChar * U_EXPORT2 | |
53 | u_strFindFirst(const UChar *s, int32_t length, | |
54 | const UChar *sub, int32_t subLength) { | |
55 | const UChar *start, *p, *q, *subLimit; | |
56 | UChar c, cs, cq; | |
57 | ||
58 | if(sub==NULL || subLength<-1) { | |
59 | return (UChar *)s; | |
60 | } | |
61 | if(s==NULL || length<-1) { | |
62 | return NULL; | |
63 | } | |
64 | ||
65 | start=s; | |
66 | ||
67 | if(length<0 && subLength<0) { | |
68 | /* both strings are NUL-terminated */ | |
69 | if((cs=*sub++)==0) { | |
70 | return (UChar *)s; | |
71 | } | |
72 | if(*sub==0 && !U16_IS_SURROGATE(cs)) { | |
73 | /* the substring consists of a single, non-surrogate BMP code point */ | |
74 | return u_strchr(s, cs); | |
75 | } | |
76 | ||
77 | while((c=*s++)!=0) { | |
78 | if(c==cs) { | |
79 | /* found first substring UChar, compare rest */ | |
80 | p=s; | |
81 | q=sub; | |
82 | for(;;) { | |
83 | if((cq=*q)==0) { | |
84 | if(isMatchAtCPBoundary(start, s-1, p, NULL)) { | |
85 | return (UChar *)(s-1); /* well-formed match */ | |
86 | } else { | |
87 | break; /* no match because surrogate pair is split */ | |
88 | } | |
89 | } | |
90 | if((c=*p)==0) { | |
91 | return NULL; /* no match, and none possible after s */ | |
92 | } | |
93 | if(c!=cq) { | |
94 | break; /* no match */ | |
95 | } | |
96 | ++p; | |
97 | ++q; | |
98 | } | |
99 | } | |
100 | } | |
101 | ||
102 | /* not found */ | |
103 | return NULL; | |
104 | } | |
105 | ||
106 | if(subLength<0) { | |
107 | subLength=u_strlen(sub); | |
108 | } | |
109 | if(subLength==0) { | |
110 | return (UChar *)s; | |
111 | } | |
112 | ||
113 | /* get sub[0] to search for it fast */ | |
114 | cs=*sub++; | |
115 | --subLength; | |
116 | subLimit=sub+subLength; | |
117 | ||
118 | if(subLength==0 && !U16_IS_SURROGATE(cs)) { | |
119 | /* the substring consists of a single, non-surrogate BMP code point */ | |
120 | return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length); | |
121 | } | |
122 | ||
123 | if(length<0) { | |
124 | /* s is NUL-terminated */ | |
125 | while((c=*s++)!=0) { | |
126 | if(c==cs) { | |
127 | /* found first substring UChar, compare rest */ | |
128 | p=s; | |
129 | q=sub; | |
130 | for(;;) { | |
131 | if(q==subLimit) { | |
132 | if(isMatchAtCPBoundary(start, s-1, p, NULL)) { | |
133 | return (UChar *)(s-1); /* well-formed match */ | |
134 | } else { | |
135 | break; /* no match because surrogate pair is split */ | |
136 | } | |
137 | } | |
138 | if((c=*p)==0) { | |
139 | return NULL; /* no match, and none possible after s */ | |
140 | } | |
141 | if(c!=*q) { | |
142 | break; /* no match */ | |
143 | } | |
144 | ++p; | |
145 | ++q; | |
146 | } | |
147 | } | |
148 | } | |
149 | } else { | |
150 | const UChar *limit, *preLimit; | |
151 | ||
152 | /* subLength was decremented above */ | |
153 | if(length<=subLength) { | |
154 | return NULL; /* s is shorter than sub */ | |
155 | } | |
156 | ||
157 | limit=s+length; | |
158 | ||
159 | /* the substring must start before preLimit */ | |
160 | preLimit=limit-subLength; | |
161 | ||
162 | while(s!=preLimit) { | |
163 | c=*s++; | |
164 | if(c==cs) { | |
165 | /* found first substring UChar, compare rest */ | |
166 | p=s; | |
167 | q=sub; | |
168 | for(;;) { | |
169 | if(q==subLimit) { | |
170 | if(isMatchAtCPBoundary(start, s-1, p, limit)) { | |
171 | return (UChar *)(s-1); /* well-formed match */ | |
172 | } else { | |
173 | break; /* no match because surrogate pair is split */ | |
174 | } | |
175 | } | |
176 | if(*p!=*q) { | |
177 | break; /* no match */ | |
178 | } | |
179 | ++p; | |
180 | ++q; | |
181 | } | |
182 | } | |
183 | } | |
184 | } | |
185 | ||
186 | /* not found */ | |
187 | return NULL; | |
188 | } | |
189 | ||
190 | U_CAPI UChar * U_EXPORT2 | |
191 | u_strstr(const UChar *s, const UChar *substring) { | |
192 | return u_strFindFirst(s, -1, substring, -1); | |
193 | } | |
194 | ||
195 | U_CAPI UChar * U_EXPORT2 | |
196 | u_strchr(const UChar *s, UChar c) { | |
197 | if(U16_IS_SURROGATE(c)) { | |
198 | /* make sure to not find half of a surrogate pair */ | |
199 | return u_strFindFirst(s, -1, &c, 1); | |
200 | } else { | |
201 | UChar cs; | |
202 | ||
203 | /* trivial search for a BMP code point */ | |
204 | for(;;) { | |
205 | if((cs=*s)==c) { | |
206 | return (UChar *)s; | |
207 | } | |
208 | if(cs==0) { | |
209 | return NULL; | |
210 | } | |
211 | ++s; | |
212 | } | |
213 | } | |
214 | } | |
215 | ||
216 | U_CAPI UChar * U_EXPORT2 | |
217 | u_strchr32(const UChar *s, UChar32 c) { | |
218 | if((uint32_t)c<=U_BMP_MAX) { | |
219 | /* find BMP code point */ | |
220 | return u_strchr(s, (UChar)c); | |
221 | } else if((uint32_t)c<=UCHAR_MAX_VALUE) { | |
222 | /* find supplementary code point as surrogate pair */ | |
223 | UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); | |
224 | ||
225 | while((cs=*s++)!=0) { | |
226 | if(cs==lead && *s==trail) { | |
227 | return (UChar *)(s-1); | |
228 | } | |
229 | } | |
230 | return NULL; | |
231 | } else { | |
232 | /* not a Unicode code point, not findable */ | |
233 | return NULL; | |
234 | } | |
235 | } | |
236 | ||
237 | U_CAPI UChar * U_EXPORT2 | |
238 | u_memchr(const UChar *s, UChar c, int32_t count) { | |
239 | if(count<=0) { | |
240 | return NULL; /* no string */ | |
241 | } else if(U16_IS_SURROGATE(c)) { | |
242 | /* make sure to not find half of a surrogate pair */ | |
243 | return u_strFindFirst(s, count, &c, 1); | |
244 | } else { | |
245 | /* trivial search for a BMP code point */ | |
246 | const UChar *limit=s+count; | |
247 | do { | |
248 | if(*s==c) { | |
249 | return (UChar *)s; | |
250 | } | |
251 | } while(++s!=limit); | |
252 | return NULL; | |
253 | } | |
254 | } | |
255 | ||
256 | U_CAPI UChar * U_EXPORT2 | |
257 | u_memchr32(const UChar *s, UChar32 c, int32_t count) { | |
258 | if((uint32_t)c<=U_BMP_MAX) { | |
259 | /* find BMP code point */ | |
260 | return u_memchr(s, (UChar)c, count); | |
261 | } else if(count<2) { | |
262 | /* too short for a surrogate pair */ | |
263 | return NULL; | |
264 | } else if((uint32_t)c<=UCHAR_MAX_VALUE) { | |
265 | /* find supplementary code point as surrogate pair */ | |
266 | const UChar *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */ | |
267 | UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); | |
268 | ||
269 | do { | |
270 | if(*s==lead && *(s+1)==trail) { | |
271 | return (UChar *)s; | |
272 | } | |
273 | } while(++s!=limit); | |
274 | return NULL; | |
275 | } else { | |
276 | /* not a Unicode code point, not findable */ | |
277 | return NULL; | |
278 | } | |
279 | } | |
280 | ||
281 | /* Backward binary string search functions ---------------------------------- */ | |
282 | ||
283 | U_CAPI UChar * U_EXPORT2 | |
284 | u_strFindLast(const UChar *s, int32_t length, | |
285 | const UChar *sub, int32_t subLength) { | |
286 | const UChar *start, *limit, *p, *q, *subLimit; | |
287 | UChar c, cs; | |
288 | ||
289 | if(sub==NULL || subLength<-1) { | |
290 | return (UChar *)s; | |
291 | } | |
292 | if(s==NULL || length<-1) { | |
293 | return NULL; | |
294 | } | |
295 | ||
296 | /* | |
297 | * This implementation is more lazy than the one for u_strFindFirst(): | |
298 | * There is no special search code for NUL-terminated strings. | |
299 | * It does not seem to be worth it for searching substrings to | |
300 | * search forward and find all matches like in u_strrchr() and similar. | |
301 | * Therefore, we simply get both string lengths and search backward. | |
302 | * | |
303 | * markus 2002oct23 | |
304 | */ | |
305 | ||
306 | if(subLength<0) { | |
307 | subLength=u_strlen(sub); | |
308 | } | |
309 | if(subLength==0) { | |
310 | return (UChar *)s; | |
311 | } | |
312 | ||
313 | /* get sub[subLength-1] to search for it fast */ | |
314 | subLimit=sub+subLength; | |
315 | cs=*(--subLimit); | |
316 | --subLength; | |
317 | ||
318 | if(subLength==0 && !U16_IS_SURROGATE(cs)) { | |
319 | /* the substring consists of a single, non-surrogate BMP code point */ | |
320 | return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length); | |
321 | } | |
322 | ||
323 | if(length<0) { | |
324 | length=u_strlen(s); | |
325 | } | |
326 | ||
327 | /* subLength was decremented above */ | |
328 | if(length<=subLength) { | |
329 | return NULL; /* s is shorter than sub */ | |
330 | } | |
331 | ||
332 | start=s; | |
333 | limit=s+length; | |
334 | ||
335 | /* the substring must start no later than s+subLength */ | |
336 | s+=subLength; | |
337 | ||
338 | while(s!=limit) { | |
339 | c=*(--limit); | |
340 | if(c==cs) { | |
341 | /* found last substring UChar, compare rest */ | |
342 | p=limit; | |
343 | q=subLimit; | |
344 | for(;;) { | |
345 | if(q==sub) { | |
346 | if(isMatchAtCPBoundary(start, p, limit+1, start+length)) { | |
347 | return (UChar *)p; /* well-formed match */ | |
348 | } else { | |
349 | break; /* no match because surrogate pair is split */ | |
350 | } | |
351 | } | |
352 | if(*(--p)!=*(--q)) { | |
353 | break; /* no match */ | |
354 | } | |
355 | } | |
356 | } | |
357 | } | |
358 | ||
359 | /* not found */ | |
360 | return NULL; | |
361 | } | |
362 | ||
363 | U_CAPI UChar * U_EXPORT2 | |
364 | u_strrstr(const UChar *s, const UChar *substring) { | |
365 | return u_strFindLast(s, -1, substring, -1); | |
366 | } | |
367 | ||
368 | U_CAPI UChar * U_EXPORT2 | |
369 | u_strrchr(const UChar *s, UChar c) { | |
370 | if(U16_IS_SURROGATE(c)) { | |
371 | /* make sure to not find half of a surrogate pair */ | |
372 | return u_strFindLast(s, -1, &c, 1); | |
373 | } else { | |
374 | const UChar *result=NULL; | |
375 | UChar cs; | |
376 | ||
377 | /* trivial search for a BMP code point */ | |
378 | for(;;) { | |
379 | if((cs=*s)==c) { | |
380 | result=s; | |
381 | } | |
382 | if(cs==0) { | |
383 | return (UChar *)result; | |
384 | } | |
385 | ++s; | |
386 | } | |
387 | } | |
388 | } | |
389 | ||
390 | U_CAPI UChar * U_EXPORT2 | |
391 | u_strrchr32(const UChar *s, UChar32 c) { | |
392 | if((uint32_t)c<=U_BMP_MAX) { | |
393 | /* find BMP code point */ | |
394 | return u_strrchr(s, (UChar)c); | |
395 | } else if((uint32_t)c<=UCHAR_MAX_VALUE) { | |
396 | /* find supplementary code point as surrogate pair */ | |
397 | const UChar *result=NULL; | |
398 | UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); | |
399 | ||
400 | while((cs=*s++)!=0) { | |
401 | if(cs==lead && *s==trail) { | |
402 | result=s-1; | |
403 | } | |
404 | } | |
405 | return (UChar *)result; | |
406 | } else { | |
407 | /* not a Unicode code point, not findable */ | |
408 | return NULL; | |
409 | } | |
410 | } | |
411 | ||
412 | U_CAPI UChar * U_EXPORT2 | |
413 | u_memrchr(const UChar *s, UChar c, int32_t count) { | |
414 | if(count<=0) { | |
415 | return NULL; /* no string */ | |
416 | } else if(U16_IS_SURROGATE(c)) { | |
417 | /* make sure to not find half of a surrogate pair */ | |
418 | return u_strFindLast(s, count, &c, 1); | |
419 | } else { | |
420 | /* trivial search for a BMP code point */ | |
421 | const UChar *limit=s+count; | |
422 | do { | |
423 | if(*(--limit)==c) { | |
424 | return (UChar *)limit; | |
425 | } | |
426 | } while(s!=limit); | |
427 | return NULL; | |
428 | } | |
429 | } | |
430 | ||
431 | U_CAPI UChar * U_EXPORT2 | |
432 | u_memrchr32(const UChar *s, UChar32 c, int32_t count) { | |
433 | if((uint32_t)c<=U_BMP_MAX) { | |
434 | /* find BMP code point */ | |
435 | return u_memrchr(s, (UChar)c, count); | |
436 | } else if(count<2) { | |
437 | /* too short for a surrogate pair */ | |
438 | return NULL; | |
439 | } else if((uint32_t)c<=UCHAR_MAX_VALUE) { | |
440 | /* find supplementary code point as surrogate pair */ | |
441 | const UChar *limit=s+count-1; | |
442 | UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); | |
443 | ||
444 | do { | |
445 | if(*limit==trail && *(limit-1)==lead) { | |
446 | return (UChar *)(limit-1); | |
447 | } | |
448 | } while(s!=--limit); | |
449 | return NULL; | |
450 | } else { | |
451 | /* not a Unicode code point, not findable */ | |
452 | return NULL; | |
453 | } | |
454 | } | |
455 | ||
456 | /* Tokenization functions --------------------------------------------------- */ | |
457 | ||
458 | /* | |
459 | * Match each code point in a string against each code point in the matchSet. | |
460 | * Return the index of the first string code point that | |
461 | * is (polarity==TRUE) or is not (FALSE) contained in the matchSet. | |
462 | * Return -(string length)-1 if there is no such code point. | |
463 | */ | |
464 | static int32_t | |
465 | _matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) { | |
466 | int32_t matchLen, matchBMPLen, strItr, matchItr; | |
467 | UChar32 stringCh, matchCh; | |
468 | UChar c, c2; | |
469 | ||
470 | /* first part of matchSet contains only BMP code points */ | |
471 | matchBMPLen = 0; | |
472 | while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) { | |
473 | ++matchBMPLen; | |
474 | } | |
475 | ||
476 | /* second part of matchSet contains BMP and supplementary code points */ | |
477 | matchLen = matchBMPLen; | |
478 | while(matchSet[matchLen] != 0) { | |
479 | ++matchLen; | |
480 | } | |
481 | ||
482 | for(strItr = 0; (c = string[strItr]) != 0;) { | |
483 | ++strItr; | |
484 | if(U16_IS_SINGLE(c)) { | |
485 | if(polarity) { | |
486 | for(matchItr = 0; matchItr < matchLen; ++matchItr) { | |
487 | if(c == matchSet[matchItr]) { | |
488 | return strItr - 1; /* one matches */ | |
489 | } | |
490 | } | |
491 | } else { | |
492 | for(matchItr = 0; matchItr < matchLen; ++matchItr) { | |
493 | if(c == matchSet[matchItr]) { | |
494 | goto endloop; | |
495 | } | |
496 | } | |
497 | return strItr - 1; /* none matches */ | |
498 | } | |
499 | } else { | |
500 | /* | |
501 | * No need to check for string length before U16_IS_TRAIL | |
502 | * because c2 could at worst be the terminating NUL. | |
503 | */ | |
504 | if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) { | |
505 | ++strItr; | |
506 | stringCh = U16_GET_SUPPLEMENTARY(c, c2); | |
507 | } else { | |
508 | stringCh = c; /* unpaired trail surrogate */ | |
509 | } | |
510 | ||
511 | if(polarity) { | |
512 | for(matchItr = matchBMPLen; matchItr < matchLen;) { | |
513 | U16_NEXT(matchSet, matchItr, matchLen, matchCh); | |
514 | if(stringCh == matchCh) { | |
515 | return strItr - U16_LENGTH(stringCh); /* one matches */ | |
516 | } | |
517 | } | |
518 | } else { | |
519 | for(matchItr = matchBMPLen; matchItr < matchLen;) { | |
520 | U16_NEXT(matchSet, matchItr, matchLen, matchCh); | |
521 | if(stringCh == matchCh) { | |
522 | goto endloop; | |
523 | } | |
524 | } | |
525 | return strItr - U16_LENGTH(stringCh); /* none matches */ | |
526 | } | |
527 | } | |
528 | endloop: | |
529 | /* wish C had continue with labels like Java... */; | |
530 | } | |
531 | ||
532 | /* Didn't find it. */ | |
533 | return -strItr-1; | |
534 | } | |
535 | ||
536 | /* Search for a codepoint in a string that matches one of the matchSet codepoints. */ | |
537 | U_CAPI UChar * U_EXPORT2 | |
538 | u_strpbrk(const UChar *string, const UChar *matchSet) | |
539 | { | |
729e4ab9 A |
540 | int32_t idx = _matchFromSet(string, matchSet, TRUE); |
541 | if(idx >= 0) { | |
542 | return (UChar *)string + idx; | |
b75a7d8f A |
543 | } else { |
544 | return NULL; | |
545 | } | |
546 | } | |
547 | ||
548 | /* Search for a codepoint in a string that matches one of the matchSet codepoints. */ | |
549 | U_CAPI int32_t U_EXPORT2 | |
550 | u_strcspn(const UChar *string, const UChar *matchSet) | |
551 | { | |
729e4ab9 A |
552 | int32_t idx = _matchFromSet(string, matchSet, TRUE); |
553 | if(idx >= 0) { | |
554 | return idx; | |
b75a7d8f | 555 | } else { |
729e4ab9 | 556 | return -idx - 1; /* == u_strlen(string) */ |
b75a7d8f A |
557 | } |
558 | } | |
559 | ||
560 | /* Search for a codepoint in a string that does not match one of the matchSet codepoints. */ | |
561 | U_CAPI int32_t U_EXPORT2 | |
562 | u_strspn(const UChar *string, const UChar *matchSet) | |
563 | { | |
729e4ab9 A |
564 | int32_t idx = _matchFromSet(string, matchSet, FALSE); |
565 | if(idx >= 0) { | |
566 | return idx; | |
b75a7d8f | 567 | } else { |
729e4ab9 | 568 | return -idx - 1; /* == u_strlen(string) */ |
b75a7d8f A |
569 | } |
570 | } | |
571 | ||
572 | /* ----- Text manipulation functions --- */ | |
573 | ||
574 | U_CAPI UChar* U_EXPORT2 | |
575 | u_strtok_r(UChar *src, | |
576 | const UChar *delim, | |
577 | UChar **saveState) | |
578 | { | |
579 | UChar *tokSource; | |
580 | UChar *nextToken; | |
581 | uint32_t nonDelimIdx; | |
582 | ||
583 | /* If saveState is NULL, the user messed up. */ | |
584 | if (src != NULL) { | |
585 | tokSource = src; | |
586 | *saveState = src; /* Set to "src" in case there are no delimiters */ | |
587 | } | |
588 | else if (*saveState) { | |
589 | tokSource = *saveState; | |
590 | } | |
591 | else { | |
592 | /* src == NULL && *saveState == NULL */ | |
593 | /* This shouldn't happen. We already finished tokenizing. */ | |
594 | return NULL; | |
595 | } | |
596 | ||
597 | /* Skip initial delimiters */ | |
598 | nonDelimIdx = u_strspn(tokSource, delim); | |
599 | tokSource = &tokSource[nonDelimIdx]; | |
600 | ||
601 | if (*tokSource) { | |
602 | nextToken = u_strpbrk(tokSource, delim); | |
603 | if (nextToken != NULL) { | |
604 | /* Create a token */ | |
605 | *(nextToken++) = 0; | |
606 | *saveState = nextToken; | |
607 | return tokSource; | |
608 | } | |
609 | else if (*saveState) { | |
610 | /* Return the last token */ | |
611 | *saveState = NULL; | |
612 | return tokSource; | |
613 | } | |
614 | } | |
615 | else { | |
616 | /* No tokens were found. Only delimiters were left. */ | |
617 | *saveState = NULL; | |
618 | } | |
619 | return NULL; | |
620 | } | |
621 | ||
622 | /* Miscellaneous functions -------------------------------------------------- */ | |
623 | ||
624 | U_CAPI UChar* U_EXPORT2 | |
625 | u_strcat(UChar *dst, | |
626 | const UChar *src) | |
627 | { | |
628 | UChar *anchor = dst; /* save a pointer to start of dst */ | |
629 | ||
630 | while(*dst != 0) { /* To end of first string */ | |
631 | ++dst; | |
632 | } | |
633 | while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ | |
634 | } | |
635 | ||
636 | return anchor; | |
637 | } | |
638 | ||
639 | U_CAPI UChar* U_EXPORT2 | |
640 | u_strncat(UChar *dst, | |
641 | const UChar *src, | |
642 | int32_t n ) | |
643 | { | |
644 | if(n > 0) { | |
645 | UChar *anchor = dst; /* save a pointer to start of dst */ | |
646 | ||
647 | while(*dst != 0) { /* To end of first string */ | |
648 | ++dst; | |
649 | } | |
650 | while((*dst = *src) != 0) { /* copy string 2 over */ | |
651 | ++dst; | |
652 | if(--n == 0) { | |
653 | *dst = 0; | |
654 | break; | |
655 | } | |
656 | ++src; | |
657 | } | |
658 | ||
659 | return anchor; | |
660 | } else { | |
661 | return dst; | |
662 | } | |
663 | } | |
664 | ||
665 | /* ----- Text property functions --- */ | |
666 | ||
667 | U_CAPI int32_t U_EXPORT2 | |
668 | u_strcmp(const UChar *s1, | |
669 | const UChar *s2) | |
670 | { | |
671 | UChar c1, c2; | |
672 | ||
673 | for(;;) { | |
674 | c1=*s1++; | |
675 | c2=*s2++; | |
676 | if (c1 != c2 || c1 == 0) { | |
677 | break; | |
678 | } | |
679 | } | |
680 | return (int32_t)c1 - (int32_t)c2; | |
681 | } | |
682 | ||
46f4442e | 683 | U_CFUNC int32_t U_EXPORT2 |
b75a7d8f A |
684 | uprv_strCompare(const UChar *s1, int32_t length1, |
685 | const UChar *s2, int32_t length2, | |
686 | UBool strncmpStyle, UBool codePointOrder) { | |
687 | const UChar *start1, *start2, *limit1, *limit2; | |
688 | UChar c1, c2; | |
689 | ||
690 | /* setup for fix-up */ | |
691 | start1=s1; | |
692 | start2=s2; | |
693 | ||
694 | /* compare identical prefixes - they do not need to be fixed up */ | |
695 | if(length1<0 && length2<0) { | |
696 | /* strcmp style, both NUL-terminated */ | |
697 | if(s1==s2) { | |
698 | return 0; | |
699 | } | |
700 | ||
701 | for(;;) { | |
702 | c1=*s1; | |
703 | c2=*s2; | |
704 | if(c1!=c2) { | |
705 | break; | |
706 | } | |
707 | if(c1==0) { | |
708 | return 0; | |
709 | } | |
710 | ++s1; | |
711 | ++s2; | |
712 | } | |
713 | ||
714 | /* setup for fix-up */ | |
715 | limit1=limit2=NULL; | |
716 | } else if(strncmpStyle) { | |
717 | /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */ | |
718 | if(s1==s2) { | |
719 | return 0; | |
720 | } | |
721 | ||
722 | limit1=start1+length1; | |
723 | ||
724 | for(;;) { | |
725 | /* both lengths are same, check only one limit */ | |
726 | if(s1==limit1) { | |
727 | return 0; | |
728 | } | |
729 | ||
730 | c1=*s1; | |
731 | c2=*s2; | |
732 | if(c1!=c2) { | |
733 | break; | |
734 | } | |
735 | if(c1==0) { | |
736 | return 0; | |
737 | } | |
738 | ++s1; | |
739 | ++s2; | |
740 | } | |
741 | ||
742 | /* setup for fix-up */ | |
743 | limit2=start2+length1; /* use length1 here, too, to enforce assumption */ | |
744 | } else { | |
745 | /* memcmp/UnicodeString style, both length-specified */ | |
746 | int32_t lengthResult; | |
747 | ||
748 | if(length1<0) { | |
749 | length1=u_strlen(s1); | |
750 | } | |
751 | if(length2<0) { | |
752 | length2=u_strlen(s2); | |
753 | } | |
754 | ||
755 | /* limit1=start1+min(lenght1, length2) */ | |
756 | if(length1<length2) { | |
757 | lengthResult=-1; | |
758 | limit1=start1+length1; | |
759 | } else if(length1==length2) { | |
760 | lengthResult=0; | |
761 | limit1=start1+length1; | |
762 | } else /* length1>length2 */ { | |
763 | lengthResult=1; | |
764 | limit1=start1+length2; | |
765 | } | |
766 | ||
767 | if(s1==s2) { | |
768 | return lengthResult; | |
769 | } | |
770 | ||
771 | for(;;) { | |
772 | /* check pseudo-limit */ | |
773 | if(s1==limit1) { | |
774 | return lengthResult; | |
775 | } | |
776 | ||
777 | c1=*s1; | |
778 | c2=*s2; | |
779 | if(c1!=c2) { | |
780 | break; | |
781 | } | |
782 | ++s1; | |
783 | ++s2; | |
784 | } | |
785 | ||
786 | /* setup for fix-up */ | |
787 | limit1=start1+length1; | |
788 | limit2=start2+length2; | |
789 | } | |
790 | ||
791 | /* if both values are in or above the surrogate range, fix them up */ | |
792 | if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { | |
793 | /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ | |
794 | if( | |
4388f060 A |
795 | (c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) || |
796 | (U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1))) | |
b75a7d8f A |
797 | ) { |
798 | /* part of a surrogate pair, leave >=d800 */ | |
799 | } else { | |
800 | /* BMP code point - may be surrogate code point - make <d800 */ | |
801 | c1-=0x2800; | |
802 | } | |
803 | ||
804 | if( | |
4388f060 A |
805 | (c2<=0xdbff && (s2+1)!=limit2 && U16_IS_TRAIL(*(s2+1))) || |
806 | (U16_IS_TRAIL(c2) && start2!=s2 && U16_IS_LEAD(*(s2-1))) | |
b75a7d8f A |
807 | ) { |
808 | /* part of a surrogate pair, leave >=d800 */ | |
809 | } else { | |
810 | /* BMP code point - may be surrogate code point - make <d800 */ | |
811 | c2-=0x2800; | |
812 | } | |
813 | } | |
814 | ||
815 | /* now c1 and c2 are in the requested (code unit or code point) order */ | |
816 | return (int32_t)c1-(int32_t)c2; | |
817 | } | |
818 | ||
819 | /* | |
820 | * Compare two strings as presented by UCharIterators. | |
821 | * Use code unit or code point order. | |
822 | * When the function returns, it is undefined where the iterators | |
823 | * have stopped. | |
824 | */ | |
825 | U_CAPI int32_t U_EXPORT2 | |
826 | u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) { | |
827 | UChar32 c1, c2; | |
828 | ||
829 | /* argument checking */ | |
830 | if(iter1==NULL || iter2==NULL) { | |
831 | return 0; /* bad arguments */ | |
832 | } | |
833 | if(iter1==iter2) { | |
834 | return 0; /* identical iterators */ | |
835 | } | |
836 | ||
837 | /* reset iterators to start? */ | |
838 | iter1->move(iter1, 0, UITER_START); | |
839 | iter2->move(iter2, 0, UITER_START); | |
840 | ||
841 | /* compare identical prefixes - they do not need to be fixed up */ | |
842 | for(;;) { | |
843 | c1=iter1->next(iter1); | |
844 | c2=iter2->next(iter2); | |
845 | if(c1!=c2) { | |
846 | break; | |
847 | } | |
848 | if(c1==-1) { | |
849 | return 0; | |
850 | } | |
851 | } | |
852 | ||
853 | /* if both values are in or above the surrogate range, fix them up */ | |
854 | if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { | |
855 | /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ | |
856 | if( | |
4388f060 A |
857 | (c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) || |
858 | (U16_IS_TRAIL(c1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1)))) | |
b75a7d8f A |
859 | ) { |
860 | /* part of a surrogate pair, leave >=d800 */ | |
861 | } else { | |
862 | /* BMP code point - may be surrogate code point - make <d800 */ | |
863 | c1-=0x2800; | |
864 | } | |
865 | ||
866 | if( | |
4388f060 A |
867 | (c2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) || |
868 | (U16_IS_TRAIL(c2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2)))) | |
b75a7d8f A |
869 | ) { |
870 | /* part of a surrogate pair, leave >=d800 */ | |
871 | } else { | |
872 | /* BMP code point - may be surrogate code point - make <d800 */ | |
873 | c2-=0x2800; | |
874 | } | |
875 | } | |
876 | ||
877 | /* now c1 and c2 are in the requested (code unit or code point) order */ | |
878 | return (int32_t)c1-(int32_t)c2; | |
879 | } | |
880 | ||
881 | #if 0 | |
882 | /* | |
883 | * u_strCompareIter() does not leave the iterators _on_ the different units. | |
884 | * This is possible but would cost a few extra indirect function calls to back | |
885 | * up if the last unit (c1 or c2 respectively) was >=0. | |
886 | * | |
887 | * Consistently leaving them _behind_ the different units is not an option | |
888 | * because the current "unit" is the end of the string if that is reached, | |
889 | * and in such a case the iterator does not move. | |
890 | * For example, when comparing "ab" with "abc", both iterators rest _on_ the end | |
891 | * of their strings. Calling previous() on each does not move them to where | |
892 | * the comparison fails. | |
893 | * | |
894 | * So the simplest semantics is to not define where the iterators end up. | |
895 | * | |
896 | * The following fragment is part of what would need to be done for backing up. | |
897 | */ | |
898 | void fragment { | |
899 | /* iff a surrogate is part of a surrogate pair, leave >=d800 */ | |
900 | if(c1<=0xdbff) { | |
4388f060 | 901 | if(!U16_IS_TRAIL(iter1->current(iter1))) { |
b75a7d8f A |
902 | /* lead surrogate code point - make <d800 */ |
903 | c1-=0x2800; | |
904 | } | |
905 | } else if(c1<=0xdfff) { | |
729e4ab9 | 906 | int32_t idx=iter1->getIndex(iter1, UITER_CURRENT); |
b75a7d8f | 907 | iter1->previous(iter1); /* ==c1 */ |
4388f060 | 908 | if(!U16_IS_LEAD(iter1->previous(iter1))) { |
b75a7d8f A |
909 | /* trail surrogate code point - make <d800 */ |
910 | c1-=0x2800; | |
911 | } | |
912 | /* go back to behind where the difference is */ | |
729e4ab9 | 913 | iter1->move(iter1, idx, UITER_ZERO); |
b75a7d8f A |
914 | } else /* 0xe000<=c1<=0xffff */ { |
915 | /* BMP code point - make <d800 */ | |
916 | c1-=0x2800; | |
917 | } | |
918 | } | |
919 | #endif | |
920 | ||
921 | U_CAPI int32_t U_EXPORT2 | |
922 | u_strCompare(const UChar *s1, int32_t length1, | |
923 | const UChar *s2, int32_t length2, | |
924 | UBool codePointOrder) { | |
925 | /* argument checking */ | |
926 | if(s1==NULL || length1<-1 || s2==NULL || length2<-1) { | |
927 | return 0; | |
928 | } | |
929 | return uprv_strCompare(s1, length1, s2, length2, FALSE, codePointOrder); | |
930 | } | |
931 | ||
932 | /* String compare in code point order - u_strcmp() compares in code unit order. */ | |
933 | U_CAPI int32_t U_EXPORT2 | |
934 | u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) { | |
935 | return uprv_strCompare(s1, -1, s2, -1, FALSE, TRUE); | |
936 | } | |
937 | ||
938 | U_CAPI int32_t U_EXPORT2 | |
939 | u_strncmp(const UChar *s1, | |
940 | const UChar *s2, | |
941 | int32_t n) | |
942 | { | |
943 | if(n > 0) { | |
944 | int32_t rc; | |
945 | for(;;) { | |
946 | rc = (int32_t)*s1 - (int32_t)*s2; | |
947 | if(rc != 0 || *s1 == 0 || --n == 0) { | |
948 | return rc; | |
949 | } | |
950 | ++s1; | |
951 | ++s2; | |
952 | } | |
953 | } else { | |
954 | return 0; | |
955 | } | |
956 | } | |
957 | ||
958 | U_CAPI int32_t U_EXPORT2 | |
959 | u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) { | |
960 | return uprv_strCompare(s1, n, s2, n, TRUE, TRUE); | |
961 | } | |
962 | ||
963 | U_CAPI UChar* U_EXPORT2 | |
964 | u_strcpy(UChar *dst, | |
965 | const UChar *src) | |
966 | { | |
967 | UChar *anchor = dst; /* save a pointer to start of dst */ | |
968 | ||
969 | while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ | |
970 | } | |
971 | ||
972 | return anchor; | |
973 | } | |
974 | ||
975 | U_CAPI UChar* U_EXPORT2 | |
976 | u_strncpy(UChar *dst, | |
977 | const UChar *src, | |
978 | int32_t n) | |
979 | { | |
980 | UChar *anchor = dst; /* save a pointer to start of dst */ | |
981 | ||
982 | /* copy string 2 over */ | |
983 | while(n > 0 && (*(dst++) = *(src++)) != 0) { | |
984 | --n; | |
985 | } | |
986 | ||
987 | return anchor; | |
988 | } | |
989 | ||
990 | U_CAPI int32_t U_EXPORT2 | |
991 | u_strlen(const UChar *s) | |
992 | { | |
993 | #if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR | |
73c04bcf | 994 | return (int32_t)uprv_wcslen(s); |
b75a7d8f A |
995 | #else |
996 | const UChar *t = s; | |
997 | while(*t != 0) { | |
998 | ++t; | |
999 | } | |
1000 | return t - s; | |
1001 | #endif | |
1002 | } | |
1003 | ||
1004 | U_CAPI int32_t U_EXPORT2 | |
1005 | u_countChar32(const UChar *s, int32_t length) { | |
1006 | int32_t count; | |
1007 | ||
1008 | if(s==NULL || length<-1) { | |
1009 | return 0; | |
1010 | } | |
1011 | ||
1012 | count=0; | |
1013 | if(length>=0) { | |
1014 | while(length>0) { | |
1015 | ++count; | |
4388f060 | 1016 | if(U16_IS_LEAD(*s) && length>=2 && U16_IS_TRAIL(*(s+1))) { |
b75a7d8f A |
1017 | s+=2; |
1018 | length-=2; | |
1019 | } else { | |
1020 | ++s; | |
1021 | --length; | |
1022 | } | |
1023 | } | |
1024 | } else /* length==-1 */ { | |
1025 | UChar c; | |
1026 | ||
1027 | for(;;) { | |
1028 | if((c=*s++)==0) { | |
1029 | break; | |
1030 | } | |
1031 | ++count; | |
1032 | ||
1033 | /* | |
1034 | * sufficient to look ahead one because of UTF-16; | |
1035 | * safe to look ahead one because at worst that would be the terminating NUL | |
1036 | */ | |
4388f060 | 1037 | if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) { |
b75a7d8f A |
1038 | ++s; |
1039 | } | |
1040 | } | |
1041 | } | |
1042 | return count; | |
1043 | } | |
1044 | ||
1045 | U_CAPI UBool U_EXPORT2 | |
1046 | u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) { | |
1047 | ||
1048 | if(number<0) { | |
1049 | return TRUE; | |
1050 | } | |
1051 | if(s==NULL || length<-1) { | |
1052 | return FALSE; | |
1053 | } | |
1054 | ||
1055 | if(length==-1) { | |
1056 | /* s is NUL-terminated */ | |
1057 | UChar c; | |
1058 | ||
1059 | /* count code points until they exceed */ | |
1060 | for(;;) { | |
1061 | if((c=*s++)==0) { | |
1062 | return FALSE; | |
1063 | } | |
1064 | if(number==0) { | |
1065 | return TRUE; | |
1066 | } | |
1067 | if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) { | |
1068 | ++s; | |
1069 | } | |
1070 | --number; | |
1071 | } | |
1072 | } else { | |
1073 | /* length>=0 known */ | |
1074 | const UChar *limit; | |
1075 | int32_t maxSupplementary; | |
1076 | ||
1077 | /* s contains at least (length+1)/2 code points: <=2 UChars per cp */ | |
1078 | if(((length+1)/2)>number) { | |
1079 | return TRUE; | |
1080 | } | |
1081 | ||
1082 | /* check if s does not even contain enough UChars */ | |
1083 | maxSupplementary=length-number; | |
1084 | if(maxSupplementary<=0) { | |
1085 | return FALSE; | |
1086 | } | |
1087 | /* there are maxSupplementary=length-number more UChars than asked-for code points */ | |
1088 | ||
1089 | /* | |
1090 | * count code points until they exceed and also check that there are | |
1091 | * no more than maxSupplementary supplementary code points (UChar pairs) | |
1092 | */ | |
1093 | limit=s+length; | |
1094 | for(;;) { | |
1095 | if(s==limit) { | |
1096 | return FALSE; | |
1097 | } | |
1098 | if(number==0) { | |
1099 | return TRUE; | |
1100 | } | |
1101 | if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) { | |
1102 | ++s; | |
1103 | if(--maxSupplementary<=0) { | |
1104 | /* too many pairs - too few code points */ | |
1105 | return FALSE; | |
1106 | } | |
1107 | } | |
1108 | --number; | |
1109 | } | |
1110 | } | |
1111 | } | |
1112 | ||
1113 | U_CAPI UChar * U_EXPORT2 | |
1114 | u_memcpy(UChar *dest, const UChar *src, int32_t count) { | |
1115 | return (UChar *)uprv_memcpy(dest, src, count*U_SIZEOF_UCHAR); | |
1116 | } | |
1117 | ||
1118 | U_CAPI UChar * U_EXPORT2 | |
1119 | u_memmove(UChar *dest, const UChar *src, int32_t count) { | |
1120 | return (UChar *)uprv_memmove(dest, src, count*U_SIZEOF_UCHAR); | |
1121 | } | |
1122 | ||
1123 | U_CAPI UChar * U_EXPORT2 | |
1124 | u_memset(UChar *dest, UChar c, int32_t count) { | |
1125 | if(count > 0) { | |
1126 | UChar *ptr = dest; | |
1127 | UChar *limit = dest + count; | |
1128 | ||
1129 | while (ptr < limit) { | |
1130 | *(ptr++) = c; | |
1131 | } | |
1132 | } | |
1133 | return dest; | |
1134 | } | |
1135 | ||
1136 | U_CAPI int32_t U_EXPORT2 | |
1137 | u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) { | |
1138 | if(count > 0) { | |
1139 | const UChar *limit = buf1 + count; | |
1140 | int32_t result; | |
1141 | ||
1142 | while (buf1 < limit) { | |
1143 | result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2; | |
1144 | if (result != 0) { | |
1145 | return result; | |
1146 | } | |
1147 | buf1++; | |
1148 | buf2++; | |
1149 | } | |
1150 | } | |
1151 | return 0; | |
1152 | } | |
1153 | ||
1154 | U_CAPI int32_t U_EXPORT2 | |
1155 | u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) { | |
1156 | return uprv_strCompare(s1, count, s2, count, FALSE, TRUE); | |
1157 | } | |
1158 | ||
b75a7d8f A |
1159 | /* u_unescape & support fns ------------------------------------------------- */ |
1160 | ||
1161 | /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ | |
1162 | static const UChar UNESCAPE_MAP[] = { | |
1163 | /*" 0x22, 0x22 */ | |
1164 | /*' 0x27, 0x27 */ | |
1165 | /*? 0x3F, 0x3F */ | |
1166 | /*\ 0x5C, 0x5C */ | |
1167 | /*a*/ 0x61, 0x07, | |
1168 | /*b*/ 0x62, 0x08, | |
1169 | /*e*/ 0x65, 0x1b, | |
1170 | /*f*/ 0x66, 0x0c, | |
1171 | /*n*/ 0x6E, 0x0a, | |
1172 | /*r*/ 0x72, 0x0d, | |
1173 | /*t*/ 0x74, 0x09, | |
1174 | /*v*/ 0x76, 0x0b | |
1175 | }; | |
1176 | enum { UNESCAPE_MAP_LENGTH = sizeof(UNESCAPE_MAP) / sizeof(UNESCAPE_MAP[0]) }; | |
1177 | ||
1178 | /* Convert one octal digit to a numeric value 0..7, or -1 on failure */ | |
1179 | static int8_t _digit8(UChar c) { | |
1180 | if (c >= 0x0030 && c <= 0x0037) { | |
1181 | return (int8_t)(c - 0x0030); | |
1182 | } | |
1183 | return -1; | |
1184 | } | |
1185 | ||
1186 | /* Convert one hex digit to a numeric value 0..F, or -1 on failure */ | |
1187 | static int8_t _digit16(UChar c) { | |
1188 | if (c >= 0x0030 && c <= 0x0039) { | |
1189 | return (int8_t)(c - 0x0030); | |
1190 | } | |
1191 | if (c >= 0x0041 && c <= 0x0046) { | |
1192 | return (int8_t)(c - (0x0041 - 10)); | |
1193 | } | |
1194 | if (c >= 0x0061 && c <= 0x0066) { | |
1195 | return (int8_t)(c - (0x0061 - 10)); | |
1196 | } | |
1197 | return -1; | |
1198 | } | |
1199 | ||
1200 | /* Parse a single escape sequence. Although this method deals in | |
1201 | * UChars, it does not use C++ or UnicodeString. This allows it to | |
1202 | * be used from C contexts. */ | |
1203 | U_CAPI UChar32 U_EXPORT2 | |
1204 | u_unescapeAt(UNESCAPE_CHAR_AT charAt, | |
1205 | int32_t *offset, | |
1206 | int32_t length, | |
1207 | void *context) { | |
1208 | ||
1209 | int32_t start = *offset; | |
1210 | UChar c; | |
1211 | UChar32 result = 0; | |
1212 | int8_t n = 0; | |
1213 | int8_t minDig = 0; | |
1214 | int8_t maxDig = 0; | |
1215 | int8_t bitsPerDigit = 4; | |
1216 | int8_t dig; | |
1217 | int32_t i; | |
1218 | UBool braces = FALSE; | |
1219 | ||
1220 | /* Check that offset is in range */ | |
1221 | if (*offset < 0 || *offset >= length) { | |
1222 | goto err; | |
1223 | } | |
1224 | ||
1225 | /* Fetch first UChar after '\\' */ | |
1226 | c = charAt((*offset)++, context); | |
1227 | ||
1228 | /* Convert hexadecimal and octal escapes */ | |
1229 | switch (c) { | |
1230 | case 0x0075 /*'u'*/: | |
1231 | minDig = maxDig = 4; | |
1232 | break; | |
1233 | case 0x0055 /*'U'*/: | |
1234 | minDig = maxDig = 8; | |
1235 | break; | |
1236 | case 0x0078 /*'x'*/: | |
1237 | minDig = 1; | |
1238 | if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) { | |
1239 | ++(*offset); | |
1240 | braces = TRUE; | |
1241 | maxDig = 8; | |
1242 | } else { | |
1243 | maxDig = 2; | |
1244 | } | |
1245 | break; | |
1246 | default: | |
1247 | dig = _digit8(c); | |
1248 | if (dig >= 0) { | |
1249 | minDig = 1; | |
1250 | maxDig = 3; | |
1251 | n = 1; /* Already have first octal digit */ | |
1252 | bitsPerDigit = 3; | |
1253 | result = dig; | |
1254 | } | |
1255 | break; | |
1256 | } | |
1257 | if (minDig != 0) { | |
1258 | while (*offset < length && n < maxDig) { | |
1259 | c = charAt(*offset, context); | |
1260 | dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c)); | |
1261 | if (dig < 0) { | |
1262 | break; | |
1263 | } | |
1264 | result = (result << bitsPerDigit) | dig; | |
1265 | ++(*offset); | |
1266 | ++n; | |
1267 | } | |
1268 | if (n < minDig) { | |
1269 | goto err; | |
1270 | } | |
1271 | if (braces) { | |
1272 | if (c != 0x7D /*}*/) { | |
1273 | goto err; | |
1274 | } | |
1275 | ++(*offset); | |
1276 | } | |
374ca955 A |
1277 | if (result < 0 || result >= 0x110000) { |
1278 | goto err; | |
1279 | } | |
1280 | /* If an escape sequence specifies a lead surrogate, see if | |
1281 | * there is a trail surrogate after it, either as an escape or | |
1282 | * as a literal. If so, join them up into a supplementary. | |
1283 | */ | |
1284 | if (*offset < length && U16_IS_LEAD(result)) { | |
1285 | int32_t ahead = *offset + 1; | |
1286 | c = charAt(*offset, context); | |
1287 | if (c == 0x5C /*'\\'*/ && ahead < length) { | |
1288 | c = (UChar) u_unescapeAt(charAt, &ahead, length, context); | |
1289 | } | |
1290 | if (U16_IS_TRAIL(c)) { | |
1291 | *offset = ahead; | |
1292 | result = U16_GET_SUPPLEMENTARY(result, c); | |
1293 | } | |
1294 | } | |
b75a7d8f A |
1295 | return result; |
1296 | } | |
1297 | ||
1298 | /* Convert C-style escapes in table */ | |
1299 | for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) { | |
1300 | if (c == UNESCAPE_MAP[i]) { | |
1301 | return UNESCAPE_MAP[i+1]; | |
1302 | } else if (c < UNESCAPE_MAP[i]) { | |
1303 | break; | |
1304 | } | |
1305 | } | |
1306 | ||
1307 | /* Map \cX to control-X: X & 0x1F */ | |
1308 | if (c == 0x0063 /*'c'*/ && *offset < length) { | |
1309 | c = charAt((*offset)++, context); | |
4388f060 | 1310 | if (U16_IS_LEAD(c) && *offset < length) { |
b75a7d8f | 1311 | UChar c2 = charAt(*offset, context); |
4388f060 | 1312 | if (U16_IS_TRAIL(c2)) { |
b75a7d8f | 1313 | ++(*offset); |
4388f060 | 1314 | c = (UChar) U16_GET_SUPPLEMENTARY(c, c2); /* [sic] */ |
b75a7d8f A |
1315 | } |
1316 | } | |
1317 | return 0x1F & c; | |
1318 | } | |
1319 | ||
1320 | /* If no special forms are recognized, then consider | |
1321 | * the backslash to generically escape the next character. | |
1322 | * Deal with surrogate pairs. */ | |
4388f060 | 1323 | if (U16_IS_LEAD(c) && *offset < length) { |
b75a7d8f | 1324 | UChar c2 = charAt(*offset, context); |
4388f060 | 1325 | if (U16_IS_TRAIL(c2)) { |
b75a7d8f | 1326 | ++(*offset); |
4388f060 | 1327 | return U16_GET_SUPPLEMENTARY(c, c2); |
b75a7d8f A |
1328 | } |
1329 | } | |
1330 | return c; | |
1331 | ||
1332 | err: | |
1333 | /* Invalid escape sequence */ | |
1334 | *offset = start; /* Reset to initial value */ | |
1335 | return (UChar32)0xFFFFFFFF; | |
1336 | } | |
1337 | ||
1338 | /* u_unescapeAt() callback to return a UChar from a char* */ | |
1339 | static UChar U_CALLCONV | |
1340 | _charPtr_charAt(int32_t offset, void *context) { | |
1341 | UChar c16; | |
1342 | /* It would be more efficient to access the invariant tables | |
1343 | * directly but there is no API for that. */ | |
1344 | u_charsToUChars(((char*) context) + offset, &c16, 1); | |
1345 | return c16; | |
1346 | } | |
1347 | ||
1348 | /* Append an escape-free segment of the text; used by u_unescape() */ | |
1349 | static void _appendUChars(UChar *dest, int32_t destCapacity, | |
1350 | const char *src, int32_t srcLen) { | |
1351 | if (destCapacity < 0) { | |
1352 | destCapacity = 0; | |
1353 | } | |
1354 | if (srcLen > destCapacity) { | |
1355 | srcLen = destCapacity; | |
1356 | } | |
1357 | u_charsToUChars(src, dest, srcLen); | |
1358 | } | |
1359 | ||
1360 | /* Do an invariant conversion of char* -> UChar*, with escape parsing */ | |
1361 | U_CAPI int32_t U_EXPORT2 | |
1362 | u_unescape(const char *src, UChar *dest, int32_t destCapacity) { | |
1363 | const char *segment = src; | |
1364 | int32_t i = 0; | |
1365 | char c; | |
1366 | ||
1367 | while ((c=*src) != 0) { | |
1368 | /* '\\' intentionally written as compiler-specific | |
1369 | * character constant to correspond to compiler-specific | |
1370 | * char* constants. */ | |
1371 | if (c == '\\') { | |
1372 | int32_t lenParsed = 0; | |
1373 | UChar32 c32; | |
1374 | if (src != segment) { | |
1375 | if (dest != NULL) { | |
1376 | _appendUChars(dest + i, destCapacity - i, | |
729e4ab9 | 1377 | segment, (int32_t)(src - segment)); |
b75a7d8f | 1378 | } |
729e4ab9 | 1379 | i += (int32_t)(src - segment); |
b75a7d8f A |
1380 | } |
1381 | ++src; /* advance past '\\' */ | |
729e4ab9 | 1382 | c32 = (UChar32)u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), (void*)src); |
b75a7d8f A |
1383 | if (lenParsed == 0) { |
1384 | goto err; | |
1385 | } | |
1386 | src += lenParsed; /* advance past escape seq. */ | |
4388f060 A |
1387 | if (dest != NULL && U16_LENGTH(c32) <= (destCapacity - i)) { |
1388 | U16_APPEND_UNSAFE(dest, i, c32); | |
b75a7d8f | 1389 | } else { |
4388f060 | 1390 | i += U16_LENGTH(c32); |
b75a7d8f A |
1391 | } |
1392 | segment = src; | |
1393 | } else { | |
1394 | ++src; | |
1395 | } | |
1396 | } | |
1397 | if (src != segment) { | |
1398 | if (dest != NULL) { | |
1399 | _appendUChars(dest + i, destCapacity - i, | |
729e4ab9 | 1400 | segment, (int32_t)(src - segment)); |
b75a7d8f | 1401 | } |
729e4ab9 | 1402 | i += (int32_t)(src - segment); |
b75a7d8f A |
1403 | } |
1404 | if (dest != NULL && i < destCapacity) { | |
1405 | dest[i] = 0; | |
1406 | } | |
1407 | return i; | |
1408 | ||
1409 | err: | |
1410 | if (dest != NULL && destCapacity > 0) { | |
1411 | *dest = 0; | |
1412 | } | |
1413 | return 0; | |
1414 | } | |
1415 | ||
b75a7d8f A |
1416 | /* NUL-termination of strings ----------------------------------------------- */ |
1417 | ||
/**
 * NUL-terminate a string no matter what its type.
 * Set warning and error codes accordingly.
 *
 * Does nothing if pErrorCode is NULL or already holds a failure, or if
 * length is negative. Otherwise:
 *   length <  destCapacity: writes the NUL and clears a pending
 *                           U_STRING_NOT_TERMINATED_WARNING
 *   length == destCapacity: sets U_STRING_NOT_TERMINATED_WARNING
 *   length >  destCapacity: sets U_BUFFER_OVERFLOW_ERROR
 *
 * Wrapped in do { } while(0) so that an invocation followed by ';' is a
 * single statement (safe inside an unbraced if/else); every argument is
 * parenthesized so that expressions may be passed.
 */
#define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode)      \
    do {                                                                \
        if((pErrorCode)!=NULL && U_SUCCESS(*(pErrorCode))) {            \
            /* not a public function, so no complete argument checking */ \
                                                                        \
            if((length)<0) {                                            \
                /* assume that the caller handles this */               \
            } else if((length)<(destCapacity)) {                        \
                /* NUL-terminate the string, the NUL fits */            \
                (dest)[(length)]=0;                                     \
                /* unset the not-terminated warning but leave all others */ \
                if(*(pErrorCode)==U_STRING_NOT_TERMINATED_WARNING) {    \
                    *(pErrorCode)=U_ZERO_ERROR;                         \
                }                                                       \
            } else if((length)==(destCapacity)) {                       \
                /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
                *(pErrorCode)=U_STRING_NOT_TERMINATED_WARNING;          \
            } else /* length>destCapacity */ {                          \
                /* even the string itself did not fit - set an error code */ \
                *(pErrorCode)=U_BUFFER_OVERFLOW_ERROR;                  \
            }                                                           \
        }                                                               \
    } while(0)
1443 | ||
1444 | U_CAPI int32_t U_EXPORT2 | |
1445 | u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { | |
1446 | __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); | |
1447 | return length; | |
1448 | } | |
1449 | ||
1450 | U_CAPI int32_t U_EXPORT2 | |
1451 | u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { | |
1452 | __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); | |
1453 | return length; | |
1454 | } | |
1455 | ||
1456 | U_CAPI int32_t U_EXPORT2 | |
1457 | u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { | |
1458 | __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); | |
1459 | return length; | |
1460 | } | |
1461 | ||
1462 | U_CAPI int32_t U_EXPORT2 | |
1463 | u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { | |
1464 | __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); | |
1465 | return length; | |
1466 | } | |
4388f060 A |
1467 | |
1468 | // Compute the hash code for a string -------------------------------------- *** | |
1469 | ||
1470 | // Moved here from uhash.c so that UnicodeString::hashCode() does not depend | |
1471 | // on UHashtable code. | |
1472 | ||
1473 | /* | |
1474 | Compute the hash by iterating sparsely over about 32 (up to 63) | |
1475 | characters spaced evenly through the string. For each character, | |
1476 | multiply the previous hash value by a prime number and add the new | |
1477 | character in, like a linear congruential random number generator, | |
1478 | producing a pseudorandom deterministic value well distributed over | |
1479 | the output range. [LIU] | |
1480 | */ | |
1481 | ||
/*
 * Body of the string hash functions below; expands to the complete
 * function body and ends in a return statement (the invocation supplies
 * the final ';').
 *
 * TYPE    element type used to walk the string
 * STR     pointer to the first element; NULL hashes to 0
 * STRLEN  number of elements
 * DEREF   expression yielding the value of the current element; it may
 *         refer to the cursor `p`
 *
 * The step `inc` is chosen so that only a bounded sample of elements,
 * spread evenly over the string, is visited. The accumulator is unsigned
 * so that the multiply-and-add wraps modulo 2^32 instead of overflowing a
 * signed integer; the result is converted to int32_t on return.
 */
#define STRING_HASH(TYPE, STR, STRLEN, DEREF)     \
    uint32_t hash = 0;                            \
    const TYPE *p = (const TYPE*)(STR);           \
    if (p != NULL) {                              \
        int32_t len = (int32_t)(STRLEN);          \
        int32_t inc = ((len - 32) / 32) + 1;      \
        const TYPE *limit = p + len;              \
        while (p<limit) {                         \
            hash = (hash * 37) + (uint32_t)(DEREF); \
            p += inc;                             \
        }                                         \
    }                                             \
    return (int32_t)hash
1495 | ||
1496 | /* Used by UnicodeString to compute its hashcode - Not public API. */ | |
1497 | U_CAPI int32_t U_EXPORT2 | |
1498 | ustr_hashUCharsN(const UChar *str, int32_t length) { | |
1499 | STRING_HASH(UChar, str, length, *p); | |
1500 | } | |
1501 | ||
1502 | U_CAPI int32_t U_EXPORT2 | |
1503 | ustr_hashCharsN(const char *str, int32_t length) { | |
1504 | STRING_HASH(uint8_t, str, length, *p); | |
1505 | } | |
1506 | ||
1507 | U_CAPI int32_t U_EXPORT2 | |
1508 | ustr_hashICharsN(const char *str, int32_t length) { | |
1509 | STRING_HASH(char, str, length, (uint8_t)uprv_tolower(*p)); | |
1510 | } |