]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ****************************************************************************** | |
3 | * | |
729e4ab9 | 4 | * Copyright (C) 2001-2010, International Business Machines |
b75a7d8f A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ****************************************************************************** | |
8 | * | |
9 | * File ustrtrns.c | |
10 | * | |
11 | * Modification History: | |
12 | * | |
13 | * Date Name Description | |
14 | * 9/10/2001 Ram Creation. | |
15 | ****************************************************************************** | |
16 | */ | |
17 | ||
18 | /******************************************************************************* | |
19 | * | |
20 | * u_strTo* and u_strFrom* APIs | |
374ca955 | 21 | * WCS functions moved to ustr_wcs.c for better modularization |
b75a7d8f A |
22 | * |
23 | ******************************************************************************* | |
24 | */ | |
25 | ||
26 | ||
27 | #include "unicode/putil.h" | |
b75a7d8f A |
28 | #include "unicode/ustring.h" |
29 | #include "cstring.h" | |
b75a7d8f A |
30 | #include "cmemory.h" |
31 | #include "ustr_imp.h" | |
32 | ||
b75a7d8f | 33 | U_CAPI UChar* U_EXPORT2 |
729e4ab9 A |
34 | u_strFromUTF32WithSub(UChar *dest, |
35 | int32_t destCapacity, | |
b75a7d8f A |
36 | int32_t *pDestLength, |
37 | const UChar32 *src, | |
38 | int32_t srcLength, | |
729e4ab9 A |
39 | UChar32 subchar, int32_t *pNumSubstitutions, |
40 | UErrorCode *pErrorCode) { | |
41 | const UChar32 *srcLimit; | |
42 | UChar32 ch; | |
43 | UChar *destLimit; | |
44 | UChar *pDest; | |
45 | int32_t reqLength; | |
46 | int32_t numSubstitutions; | |
b75a7d8f A |
47 | |
48 | /* args check */ | |
729e4ab9 | 49 | if(U_FAILURE(*pErrorCode)){ |
b75a7d8f A |
50 | return NULL; |
51 | } | |
729e4ab9 A |
52 | if( (src==NULL && srcLength!=0) || srcLength < -1 || |
53 | (destCapacity<0) || (dest == NULL && destCapacity > 0) || | |
54 | subchar > 0x10ffff || U_IS_SURROGATE(subchar) | |
55 | ) { | |
b75a7d8f A |
56 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
57 | return NULL; | |
58 | } | |
59 | ||
729e4ab9 A |
60 | if(pNumSubstitutions != NULL) { |
61 | *pNumSubstitutions = 0; | |
62 | } | |
63 | ||
64 | pDest = dest; | |
65 | destLimit = dest + destCapacity; | |
66 | reqLength = 0; | |
67 | numSubstitutions = 0; | |
68 | ||
69 | if(srcLength < 0) { | |
70 | /* simple loop for conversion of a NUL-terminated BMP string */ | |
71 | while((ch=*src) != 0 && | |
72 | ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) { | |
73 | ++src; | |
74 | if(pDest < destLimit) { | |
75 | *pDest++ = (UChar)ch; | |
76 | } else { | |
77 | ++reqLength; | |
b75a7d8f A |
78 | } |
79 | } | |
729e4ab9 A |
80 | srcLimit = src; |
81 | if(ch != 0) { | |
82 | /* "complicated" case, find the end of the remaining string */ | |
83 | while(*++srcLimit != 0) {} | |
b75a7d8f | 84 | } |
729e4ab9 A |
85 | } else { |
86 | srcLimit = src + srcLength; | |
87 | } | |
88 | ||
89 | /* convert with length */ | |
90 | while(src < srcLimit) { | |
91 | ch = *src++; | |
92 | do { | |
93 | /* usually "loops" once; twice only for writing subchar */ | |
94 | if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) { | |
95 | if(pDest < destLimit) { | |
96 | *pDest++ = (UChar)ch; | |
97 | } else { | |
98 | ++reqLength; | |
99 | } | |
100 | break; | |
101 | } else if(0x10000 <= ch && ch <= 0x10ffff) { | |
102 | if((pDest + 2) <= destLimit) { | |
103 | *pDest++ = U16_LEAD(ch); | |
104 | *pDest++ = U16_TRAIL(ch); | |
105 | } else { | |
106 | reqLength += 2; | |
b75a7d8f | 107 | } |
729e4ab9 A |
108 | break; |
109 | } else if((ch = subchar) < 0) { | |
110 | /* surrogate code point, or not a Unicode code point at all */ | |
b75a7d8f A |
111 | *pErrorCode = U_INVALID_CHAR_FOUND; |
112 | return NULL; | |
729e4ab9 A |
113 | } else { |
114 | ++numSubstitutions; | |
b75a7d8f | 115 | } |
729e4ab9 | 116 | } while(TRUE); |
b75a7d8f A |
117 | } |
118 | ||
73c04bcf | 119 | reqLength += (int32_t)(pDest - dest); |
729e4ab9 | 120 | if(pDestLength) { |
b75a7d8f A |
121 | *pDestLength = reqLength; |
122 | } | |
729e4ab9 A |
123 | if(pNumSubstitutions != NULL) { |
124 | *pNumSubstitutions = numSubstitutions; | |
125 | } | |
b75a7d8f A |
126 | |
127 | /* Terminate the buffer */ | |
729e4ab9 | 128 | u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); |
b75a7d8f A |
129 | |
130 | return dest; | |
131 | } | |
132 | ||
729e4ab9 A |
133 | U_CAPI UChar* U_EXPORT2 |
134 | u_strFromUTF32(UChar *dest, | |
135 | int32_t destCapacity, | |
136 | int32_t *pDestLength, | |
137 | const UChar32 *src, | |
138 | int32_t srcLength, | |
139 | UErrorCode *pErrorCode) { | |
140 | return u_strFromUTF32WithSub( | |
141 | dest, destCapacity, pDestLength, | |
142 | src, srcLength, | |
143 | U_SENTINEL, NULL, | |
144 | pErrorCode); | |
145 | } | |
b75a7d8f A |
146 | |
147 | U_CAPI UChar32* U_EXPORT2 | |
729e4ab9 A |
148 | u_strToUTF32WithSub(UChar32 *dest, |
149 | int32_t destCapacity, | |
150 | int32_t *pDestLength, | |
151 | const UChar *src, | |
152 | int32_t srcLength, | |
153 | UChar32 subchar, int32_t *pNumSubstitutions, | |
154 | UErrorCode *pErrorCode) { | |
155 | const UChar *srcLimit; | |
156 | UChar32 ch; | |
157 | UChar ch2; | |
158 | UChar32 *destLimit; | |
159 | UChar32 *pDest; | |
160 | int32_t reqLength; | |
161 | int32_t numSubstitutions; | |
b75a7d8f A |
162 | |
163 | /* args check */ | |
729e4ab9 | 164 | if(U_FAILURE(*pErrorCode)){ |
b75a7d8f A |
165 | return NULL; |
166 | } | |
729e4ab9 A |
167 | if( (src==NULL && srcLength!=0) || srcLength < -1 || |
168 | (destCapacity<0) || (dest == NULL && destCapacity > 0) || | |
169 | subchar > 0x10ffff || U_IS_SURROGATE(subchar) | |
170 | ) { | |
b75a7d8f A |
171 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
172 | return NULL; | |
173 | } | |
174 | ||
729e4ab9 A |
175 | if(pNumSubstitutions != NULL) { |
176 | *pNumSubstitutions = 0; | |
177 | } | |
178 | ||
179 | pDest = dest; | |
180 | destLimit = dest + destCapacity; | |
181 | reqLength = 0; | |
182 | numSubstitutions = 0; | |
183 | ||
184 | if(srcLength < 0) { | |
185 | /* simple loop for conversion of a NUL-terminated BMP string */ | |
186 | while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) { | |
187 | ++src; | |
188 | if(pDest < destLimit) { | |
189 | *pDest++ = ch; | |
190 | } else { | |
191 | ++reqLength; | |
b75a7d8f | 192 | } |
b75a7d8f | 193 | } |
729e4ab9 A |
194 | srcLimit = src; |
195 | if(ch != 0) { | |
196 | /* "complicated" case, find the end of the remaining string */ | |
197 | while(*++srcLimit != 0) {} | |
b75a7d8f A |
198 | } |
199 | } else { | |
729e4ab9 A |
200 | srcLimit = src + srcLength; |
201 | } | |
202 | ||
203 | /* convert with length */ | |
204 | while(src < srcLimit) { | |
205 | ch = *src++; | |
206 | if(!U16_IS_SURROGATE(ch)) { | |
207 | /* write or count ch below */ | |
208 | } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) { | |
209 | ++src; | |
210 | ch = U16_GET_SUPPLEMENTARY(ch, ch2); | |
211 | } else if((ch = subchar) < 0) { | |
212 | /* unpaired surrogate */ | |
213 | *pErrorCode = U_INVALID_CHAR_FOUND; | |
214 | return NULL; | |
215 | } else { | |
216 | ++numSubstitutions; | |
b75a7d8f | 217 | } |
729e4ab9 A |
218 | if(pDest < destLimit) { |
219 | *pDest++ = ch; | |
220 | } else { | |
b75a7d8f A |
221 | ++reqLength; |
222 | } | |
223 | } | |
224 | ||
729e4ab9 A |
225 | reqLength += (int32_t)(pDest - dest); |
226 | if(pDestLength) { | |
b75a7d8f A |
227 | *pDestLength = reqLength; |
228 | } | |
729e4ab9 A |
229 | if(pNumSubstitutions != NULL) { |
230 | *pNumSubstitutions = numSubstitutions; | |
231 | } | |
b75a7d8f A |
232 | |
233 | /* Terminate the buffer */ | |
729e4ab9 | 234 | u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode); |
b75a7d8f A |
235 | |
236 | return dest; | |
237 | } | |
238 | ||
729e4ab9 A |
239 | U_CAPI UChar32* U_EXPORT2 |
240 | u_strToUTF32(UChar32 *dest, | |
241 | int32_t destCapacity, | |
242 | int32_t *pDestLength, | |
243 | const UChar *src, | |
244 | int32_t srcLength, | |
245 | UErrorCode *pErrorCode) { | |
246 | return u_strToUTF32WithSub( | |
247 | dest, destCapacity, pDestLength, | |
248 | src, srcLength, | |
249 | U_SENTINEL, NULL, | |
250 | pErrorCode); | |
251 | } | |
252 | ||
73c04bcf A |
253 | /* for utf8_nextCharSafeBodyTerminated() */ |
254 | static const UChar32 | |
255 | utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; | |
256 | ||
257 | /* | |
258 | * Version of utf8_nextCharSafeBody() with the following differences: | |
259 | * - checks for NUL termination instead of length | |
260 | * - works with pointers instead of indexes | |
261 | * - always strict (strict==-1) | |
262 | * | |
263 | * *ps points to after the lead byte and will be moved to after the last trail byte. | |
264 | * c is the lead byte. | |
265 | * @return the code point, or U_SENTINEL | |
266 | */ | |
267 | static UChar32 | |
268 | utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) { | |
269 | const uint8_t *s=*ps; | |
270 | uint8_t trail, illegal=0; | |
271 | uint8_t count=UTF8_COUNT_TRAIL_BYTES(c); | |
272 | UTF8_MASK_LEAD_BYTE((c), count); | |
273 | /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ | |
274 | switch(count) { | |
275 | /* each branch falls through to the next one */ | |
276 | case 5: | |
277 | case 4: | |
278 | /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ | |
279 | illegal=1; | |
280 | break; | |
281 | case 3: | |
282 | trail=(uint8_t)(*s++ - 0x80); | |
283 | c=(c<<6)|trail; | |
284 | if(trail>0x3f || c>=0x110) { | |
285 | /* not a trail byte, or code point>0x10ffff (outside Unicode) */ | |
286 | illegal=1; | |
287 | break; | |
288 | } | |
289 | case 2: | |
290 | trail=(uint8_t)(*s++ - 0x80); | |
291 | if(trail>0x3f) { | |
292 | /* not a trail byte */ | |
293 | illegal=1; | |
294 | break; | |
295 | } | |
296 | c=(c<<6)|trail; | |
297 | case 1: | |
298 | trail=(uint8_t)(*s++ - 0x80); | |
299 | if(trail>0x3f) { | |
300 | /* not a trail byte */ | |
301 | illegal=1; | |
302 | } | |
303 | c=(c<<6)|trail; | |
304 | break; | |
305 | case 0: | |
306 | return U_SENTINEL; | |
307 | /* no default branch to optimize switch() - all values are covered */ | |
308 | } | |
309 | ||
310 | /* correct sequence - all trail bytes have (b7..b6)==(10)? */ | |
311 | /* illegal is also set if count>=4 */ | |
312 | if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) { | |
313 | /* error handling */ | |
314 | /* don't go beyond this sequence */ | |
315 | s=*ps; | |
316 | while(count>0 && UTF8_IS_TRAIL(*s)) { | |
317 | ++s; | |
318 | --count; | |
319 | } | |
320 | c=U_SENTINEL; | |
321 | } | |
322 | *ps=s; | |
323 | return c; | |
324 | } | |
325 | ||
326 | /* | |
327 | * Version of utf8_nextCharSafeBody() with the following differences: | |
328 | * - works with pointers instead of indexes | |
329 | * - always strict (strict==-1) | |
330 | * | |
331 | * *ps points to after the lead byte and will be moved to after the last trail byte. | |
332 | * c is the lead byte. | |
333 | * @return the code point, or U_SENTINEL | |
334 | */ | |
335 | static UChar32 | |
336 | utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) { | |
337 | const uint8_t *s=*ps; | |
338 | uint8_t trail, illegal=0; | |
339 | uint8_t count=UTF8_COUNT_TRAIL_BYTES(c); | |
340 | if((limit-s)>=count) { | |
341 | UTF8_MASK_LEAD_BYTE((c), count); | |
342 | /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ | |
343 | switch(count) { | |
344 | /* each branch falls through to the next one */ | |
345 | case 5: | |
346 | case 4: | |
347 | /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ | |
348 | illegal=1; | |
349 | break; | |
350 | case 3: | |
351 | trail=*s++; | |
352 | c=(c<<6)|(trail&0x3f); | |
353 | if(c<0x110) { | |
354 | illegal|=(trail&0xc0)^0x80; | |
355 | } else { | |
356 | /* code point>0x10ffff, outside Unicode */ | |
357 | illegal=1; | |
358 | break; | |
359 | } | |
360 | case 2: | |
361 | trail=*s++; | |
362 | c=(c<<6)|(trail&0x3f); | |
363 | illegal|=(trail&0xc0)^0x80; | |
364 | case 1: | |
365 | trail=*s++; | |
366 | c=(c<<6)|(trail&0x3f); | |
367 | illegal|=(trail&0xc0)^0x80; | |
368 | break; | |
369 | case 0: | |
370 | return U_SENTINEL; | |
371 | /* no default branch to optimize switch() - all values are covered */ | |
372 | } | |
373 | } else { | |
374 | illegal=1; /* too few bytes left */ | |
375 | } | |
376 | ||
377 | /* correct sequence - all trail bytes have (b7..b6)==(10)? */ | |
378 | /* illegal is also set if count>=4 */ | |
379 | if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) { | |
380 | /* error handling */ | |
381 | /* don't go beyond this sequence */ | |
382 | s=*ps; | |
383 | while(count>0 && s<limit && UTF8_IS_TRAIL(*s)) { | |
384 | ++s; | |
385 | --count; | |
386 | } | |
387 | c=U_SENTINEL; | |
388 | } | |
389 | *ps=s; | |
390 | return c; | |
391 | } | |
392 | ||
b75a7d8f | 393 | U_CAPI UChar* U_EXPORT2 |
73c04bcf | 394 | u_strFromUTF8WithSub(UChar *dest, |
b75a7d8f A |
395 | int32_t destCapacity, |
396 | int32_t *pDestLength, | |
73c04bcf | 397 | const char* src, |
b75a7d8f | 398 | int32_t srcLength, |
73c04bcf | 399 | UChar32 subchar, int32_t *pNumSubstitutions, |
b75a7d8f | 400 | UErrorCode *pErrorCode){ |
b75a7d8f A |
401 | UChar *pDest = dest; |
402 | UChar *pDestLimit = dest+destCapacity; | |
73c04bcf | 403 | UChar32 ch; |
b75a7d8f | 404 | int32_t reqLength = 0; |
73c04bcf A |
405 | const uint8_t* pSrc = (const uint8_t*) src; |
406 | uint8_t t1, t2; /* trail bytes */ | |
407 | int32_t numSubstitutions; | |
b75a7d8f A |
408 | |
409 | /* args check */ | |
410 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ | |
411 | return NULL; | |
412 | } | |
413 | ||
729e4ab9 A |
414 | if( (src==NULL && srcLength!=0) || srcLength < -1 || |
415 | (destCapacity<0) || (dest == NULL && destCapacity > 0) || | |
73c04bcf A |
416 | subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
417 | ) { | |
b75a7d8f A |
418 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
419 | return NULL; | |
420 | } | |
421 | ||
729e4ab9 A |
422 | if(pNumSubstitutions!=NULL) { |
423 | *pNumSubstitutions=0; | |
424 | } | |
73c04bcf A |
425 | numSubstitutions=0; |
426 | ||
427 | /* | |
428 | * Inline processing of UTF-8 byte sequences: | |
429 | * | |
430 | * Byte sequences for the most common characters are handled inline in | |
431 | * the conversion loops. In order to reduce the path lengths for those | |
432 | * characters, the tests are arranged in a kind of binary search. | |
433 | * ASCII (<=0x7f) is checked first, followed by the dividing point | |
434 | * between 2- and 3-byte sequences (0xe0). | |
435 | * The 3-byte branch is tested first to speed up CJK text. | |
436 | * The compiler should combine the subtractions for the two tests for 0xe0. | |
437 | * Each branch then tests for the other end of its range. | |
438 | */ | |
439 | ||
440 | if(srcLength < 0){ | |
441 | /* | |
442 | * Transform a NUL-terminated string. | |
443 | * The code explicitly checks for NULs only in the lead byte position. | |
444 | * A NUL byte in the trail byte position fails the trail byte range check anyway. | |
445 | */ | |
446 | while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { | |
447 | if(ch <= 0x7f){ | |
448 | *pDest++=(UChar)ch; | |
449 | ++pSrc; | |
450 | } else { | |
451 | if(ch > 0xe0) { | |
452 | if( /* handle U+1000..U+CFFF inline */ | |
453 | ch <= 0xec && | |
454 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && | |
455 | (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f | |
456 | ) { | |
457 | /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ | |
458 | *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); | |
459 | pSrc += 3; | |
460 | continue; | |
461 | } | |
462 | } else if(ch < 0xe0) { | |
463 | if( /* handle U+0080..U+07FF inline */ | |
464 | ch >= 0xc2 && | |
465 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f | |
466 | ) { | |
467 | *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); | |
468 | pSrc += 2; | |
469 | continue; | |
470 | } | |
471 | } | |
472 | ||
473 | /* function call for "complicated" and error cases */ | |
474 | ++pSrc; /* continue after the lead byte */ | |
475 | ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); | |
476 | if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { | |
477 | *pErrorCode = U_INVALID_CHAR_FOUND; | |
478 | return NULL; | |
479 | } else if(ch<=0xFFFF) { | |
480 | *(pDest++)=(UChar)ch; | |
481 | } else { | |
482 | *(pDest++)=UTF16_LEAD(ch); | |
483 | if(pDest<pDestLimit) { | |
484 | *(pDest++)=UTF16_TRAIL(ch); | |
485 | } else { | |
486 | reqLength++; | |
487 | break; | |
488 | } | |
489 | } | |
490 | } | |
491 | } | |
492 | ||
493 | /* Pre-flight the rest of the string. */ | |
494 | while((ch = *pSrc) != 0) { | |
495 | if(ch <= 0x7f){ | |
496 | ++reqLength; | |
497 | ++pSrc; | |
498 | } else { | |
499 | if(ch > 0xe0) { | |
500 | if( /* handle U+1000..U+CFFF inline */ | |
501 | ch <= 0xec && | |
502 | (uint8_t)(pSrc[1] - 0x80) <= 0x3f && | |
503 | (uint8_t)(pSrc[2] - 0x80) <= 0x3f | |
504 | ) { | |
505 | ++reqLength; | |
506 | pSrc += 3; | |
507 | continue; | |
508 | } | |
509 | } else if(ch < 0xe0) { | |
510 | if( /* handle U+0080..U+07FF inline */ | |
511 | ch >= 0xc2 && | |
512 | (uint8_t)(pSrc[1] - 0x80) <= 0x3f | |
513 | ) { | |
514 | ++reqLength; | |
515 | pSrc += 2; | |
516 | continue; | |
517 | } | |
518 | } | |
519 | ||
520 | /* function call for "complicated" and error cases */ | |
521 | ++pSrc; /* continue after the lead byte */ | |
522 | ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); | |
523 | if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { | |
524 | *pErrorCode = U_INVALID_CHAR_FOUND; | |
525 | return NULL; | |
526 | } | |
527 | reqLength += U16_LENGTH(ch); | |
528 | } | |
529 | } | |
530 | } else /* srcLength >= 0 */ { | |
531 | const uint8_t *pSrcLimit = pSrc + srcLength; | |
532 | int32_t count; | |
533 | ||
534 | /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ | |
535 | for(;;) { | |
536 | /* | |
537 | * Each iteration of the inner loop progresses by at most 3 UTF-8 | |
538 | * bytes and one UChar, for most characters. | |
539 | * For supplementary code points (4 & 2), which are rare, | |
540 | * there is an additional adjustment. | |
541 | */ | |
542 | count = (int32_t)(pDestLimit - pDest); | |
543 | srcLength = (int32_t)((pSrcLimit - pSrc) / 3); | |
544 | if(count > srcLength) { | |
545 | count = srcLength; /* min(remaining dest, remaining src/3) */ | |
546 | } | |
547 | if(count < 3) { | |
548 | /* | |
549 | * Too much overhead if we get near the end of the string, | |
550 | * continue with the next loop. | |
551 | */ | |
552 | break; | |
553 | } | |
554 | ||
555 | do { | |
556 | ch = *pSrc; | |
557 | if(ch <= 0x7f){ | |
558 | *pDest++=(UChar)ch; | |
559 | ++pSrc; | |
560 | } else { | |
561 | if(ch > 0xe0) { | |
562 | if( /* handle U+1000..U+CFFF inline */ | |
563 | ch <= 0xec && | |
564 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && | |
565 | (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f | |
566 | ) { | |
567 | /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ | |
568 | *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); | |
569 | pSrc += 3; | |
570 | continue; | |
571 | } | |
572 | } else if(ch < 0xe0) { | |
573 | if( /* handle U+0080..U+07FF inline */ | |
574 | ch >= 0xc2 && | |
575 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f | |
576 | ) { | |
577 | *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); | |
578 | pSrc += 2; | |
579 | continue; | |
580 | } | |
581 | } | |
582 | ||
583 | if(ch >= 0xf0 || subchar > 0xffff) { | |
584 | /* | |
585 | * We may read up to six bytes and write up to two UChars, | |
586 | * which we didn't account for with computing count, | |
587 | * so we adjust it here. | |
588 | */ | |
589 | if(--count == 0) { | |
590 | break; | |
591 | } | |
592 | } | |
593 | ||
594 | /* function call for "complicated" and error cases */ | |
595 | ++pSrc; /* continue after the lead byte */ | |
596 | ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); | |
597 | if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ | |
598 | *pErrorCode = U_INVALID_CHAR_FOUND; | |
599 | return NULL; | |
600 | }else if(ch<=0xFFFF){ | |
601 | *(pDest++)=(UChar)ch; | |
602 | }else{ | |
603 | *(pDest++)=UTF16_LEAD(ch); | |
729e4ab9 | 604 | *(pDest++)=UTF16_TRAIL(ch); |
73c04bcf A |
605 | } |
606 | } | |
607 | } while(--count > 0); | |
608 | } | |
609 | ||
610 | while((pSrc<pSrcLimit) && (pDest<pDestLimit)) { | |
611 | ch = *pSrc; | |
612 | if(ch <= 0x7f){ | |
613 | *pDest++=(UChar)ch; | |
614 | ++pSrc; | |
615 | } else { | |
616 | if(ch > 0xe0) { | |
617 | if( /* handle U+1000..U+CFFF inline */ | |
618 | ch <= 0xec && | |
619 | ((pSrcLimit - pSrc) >= 3) && | |
620 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && | |
621 | (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f | |
622 | ) { | |
623 | /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ | |
624 | *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); | |
625 | pSrc += 3; | |
626 | continue; | |
627 | } | |
628 | } else if(ch < 0xe0) { | |
629 | if( /* handle U+0080..U+07FF inline */ | |
630 | ch >= 0xc2 && | |
631 | ((pSrcLimit - pSrc) >= 2) && | |
632 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f | |
633 | ) { | |
634 | *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); | |
635 | pSrc += 2; | |
636 | continue; | |
637 | } | |
638 | } | |
639 | ||
640 | /* function call for "complicated" and error cases */ | |
641 | ++pSrc; /* continue after the lead byte */ | |
642 | ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); | |
643 | if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ | |
644 | *pErrorCode = U_INVALID_CHAR_FOUND; | |
645 | return NULL; | |
646 | }else if(ch<=0xFFFF){ | |
647 | *(pDest++)=(UChar)ch; | |
b75a7d8f | 648 | }else{ |
73c04bcf A |
649 | *(pDest++)=UTF16_LEAD(ch); |
650 | if(pDest<pDestLimit){ | |
651 | *(pDest++)=UTF16_TRAIL(ch); | |
652 | }else{ | |
653 | reqLength++; | |
654 | break; | |
655 | } | |
656 | } | |
657 | } | |
658 | } | |
729e4ab9 | 659 | /* do not fill the dest buffer just count the UChars needed */ |
73c04bcf A |
660 | while(pSrc < pSrcLimit){ |
661 | ch = *pSrc; | |
662 | if(ch <= 0x7f){ | |
663 | reqLength++; | |
664 | ++pSrc; | |
665 | } else { | |
666 | if(ch > 0xe0) { | |
667 | if( /* handle U+1000..U+CFFF inline */ | |
668 | ch <= 0xec && | |
669 | ((pSrcLimit - pSrc) >= 3) && | |
670 | (uint8_t)(pSrc[1] - 0x80) <= 0x3f && | |
671 | (uint8_t)(pSrc[2] - 0x80) <= 0x3f | |
672 | ) { | |
673 | reqLength++; | |
674 | pSrc += 3; | |
675 | continue; | |
676 | } | |
677 | } else if(ch < 0xe0) { | |
678 | if( /* handle U+0080..U+07FF inline */ | |
679 | ch >= 0xc2 && | |
680 | ((pSrcLimit - pSrc) >= 2) && | |
681 | (uint8_t)(pSrc[1] - 0x80) <= 0x3f | |
682 | ) { | |
683 | reqLength++; | |
684 | pSrc += 2; | |
685 | continue; | |
686 | } | |
b75a7d8f | 687 | } |
73c04bcf A |
688 | |
689 | /* function call for "complicated" and error cases */ | |
690 | ++pSrc; /* continue after the lead byte */ | |
691 | ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); | |
692 | if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ | |
693 | *pErrorCode = U_INVALID_CHAR_FOUND; | |
694 | return NULL; | |
695 | } | |
696 | reqLength+=UTF_CHAR_LENGTH(ch); | |
b75a7d8f A |
697 | } |
698 | } | |
699 | } | |
73c04bcf A |
700 | |
701 | reqLength+=(int32_t)(pDest - dest); | |
702 | ||
703 | if(pNumSubstitutions!=NULL) { | |
704 | *pNumSubstitutions=numSubstitutions; | |
705 | } | |
706 | ||
707 | if(pDestLength){ | |
708 | *pDestLength = reqLength; | |
709 | } | |
710 | ||
711 | /* Terminate the buffer */ | |
712 | u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); | |
713 | ||
714 | return dest; | |
715 | } | |
716 | ||
717 | U_CAPI UChar* U_EXPORT2 | |
718 | u_strFromUTF8(UChar *dest, | |
719 | int32_t destCapacity, | |
720 | int32_t *pDestLength, | |
721 | const char* src, | |
722 | int32_t srcLength, | |
723 | UErrorCode *pErrorCode){ | |
724 | return u_strFromUTF8WithSub( | |
725 | dest, destCapacity, pDestLength, | |
726 | src, srcLength, | |
727 | U_SENTINEL, NULL, | |
728 | pErrorCode); | |
729 | } | |
730 | ||
731 | U_CAPI UChar * U_EXPORT2 | |
732 | u_strFromUTF8Lenient(UChar *dest, | |
733 | int32_t destCapacity, | |
734 | int32_t *pDestLength, | |
735 | const char *src, | |
736 | int32_t srcLength, | |
737 | UErrorCode *pErrorCode) { | |
73c04bcf A |
738 | UChar *pDest = dest; |
739 | UChar32 ch; | |
740 | int32_t reqLength = 0; | |
741 | uint8_t* pSrc = (uint8_t*) src; | |
742 | ||
743 | /* args check */ | |
744 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ | |
745 | return NULL; | |
746 | } | |
747 | ||
729e4ab9 A |
748 | if( (src==NULL && srcLength!=0) || srcLength < -1 || |
749 | (destCapacity<0) || (dest == NULL && destCapacity > 0) | |
750 | ) { | |
73c04bcf A |
751 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
752 | return NULL; | |
753 | } | |
754 | ||
755 | if(srcLength < 0) { | |
756 | /* Transform a NUL-terminated string. */ | |
757 | UChar *pDestLimit = dest+destCapacity; | |
758 | uint8_t t1, t2, t3; /* trail bytes */ | |
759 | ||
760 | while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { | |
761 | if(ch < 0xc0) { | |
762 | /* | |
763 | * ASCII, or a trail byte in lead position which is treated like | |
764 | * a single-byte sequence for better character boundary | |
765 | * resynchronization after illegal sequences. | |
766 | */ | |
767 | *pDest++=(UChar)ch; | |
768 | ++pSrc; | |
769 | continue; | |
770 | } else if(ch < 0xe0) { /* U+0080..U+07FF */ | |
771 | if((t1 = pSrc[1]) != 0) { | |
772 | /* 0x3080 = (0xc0 << 6) + 0x80 */ | |
773 | *pDest++ = (UChar)((ch << 6) + t1 - 0x3080); | |
774 | pSrc += 2; | |
775 | continue; | |
776 | } | |
777 | } else if(ch < 0xf0) { /* U+0800..U+FFFF */ | |
778 | if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) { | |
779 | /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ | |
780 | /* 0x2080 = (0x80 << 6) + 0x80 */ | |
781 | *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080); | |
782 | pSrc += 3; | |
783 | continue; | |
784 | } | |
785 | } else /* f0..f4 */ { /* U+10000..U+10FFFF */ | |
786 | if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) { | |
787 | pSrc += 4; | |
788 | /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ | |
789 | ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080; | |
790 | *(pDest++) = U16_LEAD(ch); | |
791 | if(pDest < pDestLimit) { | |
792 | *(pDest++) = U16_TRAIL(ch); | |
793 | } else { | |
794 | reqLength = 1; | |
795 | break; | |
796 | } | |
797 | continue; | |
798 | } | |
b75a7d8f | 799 | } |
73c04bcf A |
800 | |
801 | /* truncated character at the end */ | |
802 | *pDest++ = 0xfffd; | |
803 | while(*++pSrc != 0) {} | |
804 | break; | |
805 | } | |
806 | ||
807 | /* Pre-flight the rest of the string. */ | |
808 | while((ch = *pSrc) != 0) { | |
809 | if(ch < 0xc0) { | |
810 | /* | |
811 | * ASCII, or a trail byte in lead position which is treated like | |
812 | * a single-byte sequence for better character boundary | |
813 | * resynchronization after illegal sequences. | |
814 | */ | |
815 | ++reqLength; | |
816 | ++pSrc; | |
817 | continue; | |
818 | } else if(ch < 0xe0) { /* U+0080..U+07FF */ | |
819 | if(pSrc[1] != 0) { | |
820 | ++reqLength; | |
821 | pSrc += 2; | |
822 | continue; | |
823 | } | |
824 | } else if(ch < 0xf0) { /* U+0800..U+FFFF */ | |
825 | if(pSrc[1] != 0 && pSrc[2] != 0) { | |
826 | ++reqLength; | |
827 | pSrc += 3; | |
828 | continue; | |
829 | } | |
830 | } else /* f0..f4 */ { /* U+10000..U+10FFFF */ | |
831 | if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) { | |
832 | reqLength += 2; | |
833 | pSrc += 4; | |
834 | continue; | |
835 | } | |
836 | } | |
837 | ||
838 | /* truncated character at the end */ | |
839 | ++reqLength; | |
840 | break; | |
841 | } | |
842 | } else /* srcLength >= 0 */ { | |
843 | const uint8_t *pSrcLimit = pSrc + srcLength; | |
844 | ||
845 | /* | |
846 | * This function requires that if srcLength is given, then it must be | |
847 | * destCapatity >= srcLength so that we need not check for | |
848 | * destination buffer overflow in the loop. | |
849 | */ | |
850 | if(destCapacity < srcLength) { | |
851 | if(pDestLength != NULL) { | |
852 | *pDestLength = srcLength; /* this likely overestimates the true destLength! */ | |
853 | } | |
854 | *pErrorCode = U_BUFFER_OVERFLOW_ERROR; | |
855 | return NULL; | |
856 | } | |
857 | ||
858 | if((pSrcLimit - pSrc) >= 4) { | |
859 | pSrcLimit -= 3; /* temporarily reduce pSrcLimit */ | |
860 | ||
861 | /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */ | |
862 | do { | |
863 | ch = *pSrc++; | |
864 | if(ch < 0xc0) { | |
865 | /* | |
866 | * ASCII, or a trail byte in lead position which is treated like | |
867 | * a single-byte sequence for better character boundary | |
868 | * resynchronization after illegal sequences. | |
869 | */ | |
870 | *pDest++=(UChar)ch; | |
871 | } else if(ch < 0xe0) { /* U+0080..U+07FF */ | |
872 | /* 0x3080 = (0xc0 << 6) + 0x80 */ | |
873 | *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); | |
874 | } else if(ch < 0xf0) { /* U+0800..U+FFFF */ | |
875 | /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ | |
876 | /* 0x2080 = (0x80 << 6) + 0x80 */ | |
877 | ch = (ch << 12) + (*pSrc++ << 6); | |
878 | *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); | |
879 | } else /* f0..f4 */ { /* U+10000..U+10FFFF */ | |
880 | /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ | |
881 | ch = (ch << 18) + (*pSrc++ << 12); | |
882 | ch += *pSrc++ << 6; | |
883 | ch += *pSrc++ - 0x3c82080; | |
884 | *(pDest++) = U16_LEAD(ch); | |
885 | *(pDest++) = U16_TRAIL(ch); | |
886 | } | |
887 | } while(pSrc < pSrcLimit); | |
888 | ||
889 | pSrcLimit += 3; /* restore original pSrcLimit */ | |
890 | } | |
891 | ||
892 | while(pSrc < pSrcLimit) { | |
893 | ch = *pSrc++; | |
894 | if(ch < 0xc0) { | |
895 | /* | |
896 | * ASCII, or a trail byte in lead position which is treated like | |
897 | * a single-byte sequence for better character boundary | |
898 | * resynchronization after illegal sequences. | |
899 | */ | |
900 | *pDest++=(UChar)ch; | |
901 | continue; | |
902 | } else if(ch < 0xe0) { /* U+0080..U+07FF */ | |
903 | if(pSrc < pSrcLimit) { | |
904 | /* 0x3080 = (0xc0 << 6) + 0x80 */ | |
46f4442e | 905 | *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); |
73c04bcf A |
906 | continue; |
907 | } | |
908 | } else if(ch < 0xf0) { /* U+0800..U+FFFF */ | |
909 | if((pSrcLimit - pSrc) >= 2) { | |
910 | /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ | |
911 | /* 0x2080 = (0x80 << 6) + 0x80 */ | |
912 | ch = (ch << 12) + (*pSrc++ << 6); | |
913 | *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); | |
914 | pSrc += 3; | |
915 | continue; | |
916 | } | |
917 | } else /* f0..f4 */ { /* U+10000..U+10FFFF */ | |
918 | if((pSrcLimit - pSrc) >= 3) { | |
919 | /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ | |
920 | ch = (ch << 18) + (*pSrc++ << 12); | |
921 | ch += *pSrc++ << 6; | |
922 | ch += *pSrc++ - 0x3c82080; | |
923 | *(pDest++) = U16_LEAD(ch); | |
924 | *(pDest++) = U16_TRAIL(ch); | |
925 | pSrc += 4; | |
926 | continue; | |
927 | } | |
928 | } | |
929 | ||
930 | /* truncated character at the end */ | |
931 | *pDest++ = 0xfffd; | |
932 | break; | |
b75a7d8f A |
933 | } |
934 | } | |
935 | ||
73c04bcf | 936 | reqLength+=(int32_t)(pDest - dest); |
b75a7d8f A |
937 | |
938 | if(pDestLength){ | |
939 | *pDestLength = reqLength; | |
940 | } | |
941 | ||
942 | /* Terminate the buffer */ | |
943 | u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); | |
944 | ||
945 | return dest; | |
946 | } | |
947 | ||
948 | static U_INLINE uint8_t * | |
949 | _appendUTF8(uint8_t *pDest, UChar32 c) { | |
73c04bcf A |
950 | /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */ |
951 | if((c)<=0x7f) { | |
952 | *pDest++=(uint8_t)c; | |
953 | } else if(c<=0x7ff) { | |
b75a7d8f A |
954 | *pDest++=(uint8_t)((c>>6)|0xc0); |
955 | *pDest++=(uint8_t)((c&0x3f)|0x80); | |
73c04bcf | 956 | } else if(c<=0xffff) { |
b75a7d8f A |
957 | *pDest++=(uint8_t)((c>>12)|0xe0); |
958 | *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80); | |
959 | *pDest++=(uint8_t)(((c)&0x3f)|0x80); | |
960 | } else /* if((uint32_t)(c)<=0x10ffff) */ { | |
961 | *pDest++=(uint8_t)(((c)>>18)|0xf0); | |
962 | *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80); | |
963 | *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80); | |
964 | *pDest++=(uint8_t)(((c)&0x3f)|0x80); | |
965 | } | |
966 | return pDest; | |
967 | } | |
968 | ||
969 | ||
970 | U_CAPI char* U_EXPORT2 | |
73c04bcf | 971 | u_strToUTF8WithSub(char *dest, |
b75a7d8f A |
972 | int32_t destCapacity, |
973 | int32_t *pDestLength, | |
73c04bcf | 974 | const UChar *pSrc, |
b75a7d8f | 975 | int32_t srcLength, |
73c04bcf | 976 | UChar32 subchar, int32_t *pNumSubstitutions, |
b75a7d8f | 977 | UErrorCode *pErrorCode){ |
b75a7d8f | 978 | int32_t reqLength=0; |
b75a7d8f A |
979 | uint32_t ch=0,ch2=0; |
980 | uint8_t *pDest = (uint8_t *)dest; | |
981 | uint8_t *pDestLimit = pDest + destCapacity; | |
73c04bcf | 982 | int32_t numSubstitutions; |
b75a7d8f A |
983 | |
984 | /* args check */ | |
985 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ | |
986 | return NULL; | |
987 | } | |
988 | ||
729e4ab9 A |
989 | if( (pSrc==NULL && srcLength!=0) || srcLength < -1 || |
990 | (destCapacity<0) || (dest == NULL && destCapacity > 0) || | |
73c04bcf A |
991 | subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
992 | ) { | |
b75a7d8f A |
993 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
994 | return NULL; | |
995 | } | |
996 | ||
729e4ab9 A |
997 | if(pNumSubstitutions!=NULL) { |
998 | *pNumSubstitutions=0; | |
999 | } | |
73c04bcf A |
1000 | numSubstitutions=0; |
1001 | ||
b75a7d8f | 1002 | if(srcLength==-1) { |
73c04bcf | 1003 | while((ch=*pSrc)!=0) { |
b75a7d8f A |
1004 | ++pSrc; |
1005 | if(ch <= 0x7f) { | |
73c04bcf | 1006 | if(pDest<pDestLimit) { |
729e4ab9 | 1007 | *pDest++ = (uint8_t)ch; |
73c04bcf A |
1008 | } else { |
1009 | reqLength = 1; | |
1010 | break; | |
1011 | } | |
1012 | } else if(ch <= 0x7ff) { | |
1013 | if((pDestLimit - pDest) >= 2) { | |
1014 | *pDest++=(uint8_t)((ch>>6)|0xc0); | |
1015 | *pDest++=(uint8_t)((ch&0x3f)|0x80); | |
1016 | } else { | |
1017 | reqLength = 2; | |
1018 | break; | |
1019 | } | |
1020 | } else if(ch <= 0xd7ff || ch >= 0xe000) { | |
1021 | if((pDestLimit - pDest) >= 3) { | |
1022 | *pDest++=(uint8_t)((ch>>12)|0xe0); | |
1023 | *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); | |
1024 | *pDest++=(uint8_t)((ch&0x3f)|0x80); | |
1025 | } else { | |
1026 | reqLength = 3; | |
1027 | break; | |
1028 | } | |
1029 | } else /* ch is a surrogate */ { | |
1030 | int32_t length; | |
b75a7d8f | 1031 | |
73c04bcf | 1032 | /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/ |
b75a7d8f A |
1033 | if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) { |
1034 | ++pSrc; | |
1035 | ch=UTF16_GET_PAIR_VALUE(ch, ch2); | |
73c04bcf A |
1036 | } else if(subchar>=0) { |
1037 | ch=subchar; | |
1038 | ++numSubstitutions; | |
b75a7d8f A |
1039 | } else { |
1040 | /* Unicode 3.2 forbids surrogate code points in UTF-8 */ | |
1041 | *pErrorCode = U_INVALID_CHAR_FOUND; | |
1042 | return NULL; | |
1043 | } | |
73c04bcf A |
1044 | |
1045 | length = U8_LENGTH(ch); | |
1046 | if((pDestLimit - pDest) >= length) { | |
1047 | /* convert and append*/ | |
1048 | pDest=_appendUTF8(pDest, ch); | |
1049 | } else { | |
1050 | reqLength = length; | |
1051 | break; | |
1052 | } | |
b75a7d8f | 1053 | } |
b75a7d8f A |
1054 | } |
1055 | while((ch=*pSrc++)!=0) { | |
1056 | if(ch<=0x7f) { | |
1057 | ++reqLength; | |
1058 | } else if(ch<=0x7ff) { | |
1059 | reqLength+=2; | |
1060 | } else if(!UTF_IS_SURROGATE(ch)) { | |
1061 | reqLength+=3; | |
1062 | } else if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) { | |
1063 | ++pSrc; | |
1064 | reqLength+=4; | |
73c04bcf A |
1065 | } else if(subchar>=0) { |
1066 | reqLength+=U8_LENGTH(subchar); | |
1067 | ++numSubstitutions; | |
b75a7d8f A |
1068 | } else { |
1069 | /* Unicode 3.2 forbids surrogate code points in UTF-8 */ | |
1070 | *pErrorCode = U_INVALID_CHAR_FOUND; | |
1071 | return NULL; | |
1072 | } | |
1073 | } | |
1074 | } else { | |
73c04bcf A |
1075 | const UChar *pSrcLimit = pSrc+srcLength; |
1076 | int32_t count; | |
1077 | ||
1078 | /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ | |
1079 | for(;;) { | |
1080 | /* | |
1081 | * Each iteration of the inner loop progresses by at most 3 UTF-8 | |
1082 | * bytes and one UChar, for most characters. | |
1083 | * For supplementary code points (4 & 2), which are rare, | |
1084 | * there is an additional adjustment. | |
1085 | */ | |
1086 | count = (int32_t)((pDestLimit - pDest) / 3); | |
1087 | srcLength = (int32_t)(pSrcLimit - pSrc); | |
1088 | if(count > srcLength) { | |
1089 | count = srcLength; /* min(remaining dest/3, remaining src) */ | |
1090 | } | |
1091 | if(count < 3) { | |
1092 | /* | |
1093 | * Too much overhead if we get near the end of the string, | |
1094 | * continue with the next loop. | |
1095 | */ | |
1096 | break; | |
1097 | } | |
1098 | do { | |
1099 | ch=*pSrc++; | |
1100 | if(ch <= 0x7f) { | |
729e4ab9 | 1101 | *pDest++ = (uint8_t)ch; |
73c04bcf A |
1102 | } else if(ch <= 0x7ff) { |
1103 | *pDest++=(uint8_t)((ch>>6)|0xc0); | |
1104 | *pDest++=(uint8_t)((ch&0x3f)|0x80); | |
1105 | } else if(ch <= 0xd7ff || ch >= 0xe000) { | |
1106 | *pDest++=(uint8_t)((ch>>12)|0xe0); | |
1107 | *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); | |
1108 | *pDest++=(uint8_t)((ch&0x3f)|0x80); | |
1109 | } else /* ch is a surrogate */ { | |
1110 | /* | |
1111 | * We will read two UChars and probably output four bytes, | |
1112 | * which we didn't account for with computing count, | |
1113 | * so we adjust it here. | |
1114 | */ | |
1115 | if(--count == 0) { | |
1116 | --pSrc; /* undo ch=*pSrc++ for the lead surrogate */ | |
1117 | break; /* recompute count */ | |
1118 | } | |
1119 | ||
1120 | if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) { | |
1121 | ++pSrc; | |
1122 | ch=UTF16_GET_PAIR_VALUE(ch, ch2); | |
1123 | ||
1124 | /* writing 4 bytes per 2 UChars is ok */ | |
1125 | *pDest++=(uint8_t)((ch>>18)|0xf0); | |
1126 | *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80); | |
1127 | *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); | |
1128 | *pDest++=(uint8_t)((ch&0x3f)|0x80); | |
1129 | } else { | |
1130 | /* Unicode 3.2 forbids surrogate code points in UTF-8 */ | |
1131 | if(subchar>=0) { | |
1132 | ch=subchar; | |
1133 | ++numSubstitutions; | |
1134 | } else { | |
1135 | *pErrorCode = U_INVALID_CHAR_FOUND; | |
1136 | return NULL; | |
1137 | } | |
1138 | ||
1139 | /* convert and append*/ | |
1140 | pDest=_appendUTF8(pDest, ch); | |
1141 | } | |
1142 | } | |
1143 | } while(--count > 0); | |
1144 | } | |
1145 | ||
1146 | while(pSrc<pSrcLimit) { | |
b75a7d8f A |
1147 | ch=*pSrc++; |
1148 | if(ch <= 0x7f) { | |
73c04bcf | 1149 | if(pDest<pDestLimit) { |
729e4ab9 | 1150 | *pDest++ = (uint8_t)ch; |
73c04bcf A |
1151 | } else { |
1152 | reqLength = 1; | |
1153 | break; | |
1154 | } | |
1155 | } else if(ch <= 0x7ff) { | |
1156 | if((pDestLimit - pDest) >= 2) { | |
1157 | *pDest++=(uint8_t)((ch>>6)|0xc0); | |
1158 | *pDest++=(uint8_t)((ch&0x3f)|0x80); | |
1159 | } else { | |
1160 | reqLength = 2; | |
1161 | break; | |
1162 | } | |
1163 | } else if(ch <= 0xd7ff || ch >= 0xe000) { | |
1164 | if((pDestLimit - pDest) >= 3) { | |
1165 | *pDest++=(uint8_t)((ch>>12)|0xe0); | |
1166 | *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); | |
1167 | *pDest++=(uint8_t)((ch&0x3f)|0x80); | |
1168 | } else { | |
1169 | reqLength = 3; | |
1170 | break; | |
1171 | } | |
1172 | } else /* ch is a surrogate */ { | |
1173 | int32_t length; | |
b75a7d8f | 1174 | |
b75a7d8f A |
1175 | if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) { |
1176 | ++pSrc; | |
1177 | ch=UTF16_GET_PAIR_VALUE(ch, ch2); | |
73c04bcf A |
1178 | } else if(subchar>=0) { |
1179 | ch=subchar; | |
1180 | ++numSubstitutions; | |
b75a7d8f A |
1181 | } else { |
1182 | /* Unicode 3.2 forbids surrogate code points in UTF-8 */ | |
1183 | *pErrorCode = U_INVALID_CHAR_FOUND; | |
1184 | return NULL; | |
1185 | } | |
73c04bcf A |
1186 | |
1187 | length = U8_LENGTH(ch); | |
1188 | if((pDestLimit - pDest) >= length) { | |
1189 | /* convert and append*/ | |
1190 | pDest=_appendUTF8(pDest, ch); | |
1191 | } else { | |
1192 | reqLength = length; | |
1193 | break; | |
1194 | } | |
b75a7d8f | 1195 | } |
b75a7d8f A |
1196 | } |
1197 | while(pSrc<pSrcLimit) { | |
1198 | ch=*pSrc++; | |
1199 | if(ch<=0x7f) { | |
1200 | ++reqLength; | |
1201 | } else if(ch<=0x7ff) { | |
1202 | reqLength+=2; | |
1203 | } else if(!UTF_IS_SURROGATE(ch)) { | |
1204 | reqLength+=3; | |
1205 | } else if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) { | |
1206 | ++pSrc; | |
1207 | reqLength+=4; | |
73c04bcf A |
1208 | } else if(subchar>=0) { |
1209 | reqLength+=U8_LENGTH(subchar); | |
1210 | ++numSubstitutions; | |
b75a7d8f A |
1211 | } else { |
1212 | /* Unicode 3.2 forbids surrogate code points in UTF-8 */ | |
1213 | *pErrorCode = U_INVALID_CHAR_FOUND; | |
1214 | return NULL; | |
1215 | } | |
1216 | } | |
1217 | } | |
1218 | ||
73c04bcf A |
1219 | reqLength+=(int32_t)(pDest - (uint8_t *)dest); |
1220 | ||
1221 | if(pNumSubstitutions!=NULL) { | |
1222 | *pNumSubstitutions=numSubstitutions; | |
1223 | } | |
1224 | ||
b75a7d8f A |
1225 | if(pDestLength){ |
1226 | *pDestLength = reqLength; | |
1227 | } | |
1228 | ||
1229 | /* Terminate the buffer */ | |
729e4ab9 A |
1230 | u_terminateChars(dest, destCapacity, reqLength, pErrorCode); |
1231 | return dest; | |
b75a7d8f | 1232 | } |
73c04bcf A |
1233 | |
1234 | U_CAPI char* U_EXPORT2 | |
1235 | u_strToUTF8(char *dest, | |
1236 | int32_t destCapacity, | |
1237 | int32_t *pDestLength, | |
1238 | const UChar *pSrc, | |
1239 | int32_t srcLength, | |
1240 | UErrorCode *pErrorCode){ | |
1241 | return u_strToUTF8WithSub( | |
1242 | dest, destCapacity, pDestLength, | |
1243 | pSrc, srcLength, | |
1244 | U_SENTINEL, NULL, | |
1245 | pErrorCode); | |
1246 | } | |
729e4ab9 A |
1247 | |
1248 | U_CAPI UChar* U_EXPORT2 | |
1249 | u_strFromJavaModifiedUTF8WithSub( | |
1250 | UChar *dest, | |
1251 | int32_t destCapacity, | |
1252 | int32_t *pDestLength, | |
1253 | const char *src, | |
1254 | int32_t srcLength, | |
1255 | UChar32 subchar, int32_t *pNumSubstitutions, | |
1256 | UErrorCode *pErrorCode) { | |
1257 | UChar *pDest = dest; | |
1258 | UChar *pDestLimit = dest+destCapacity; | |
1259 | UChar32 ch; | |
1260 | int32_t reqLength = 0; | |
1261 | const uint8_t* pSrc = (const uint8_t*) src; | |
1262 | const uint8_t *pSrcLimit; | |
1263 | int32_t count; | |
1264 | uint8_t t1, t2; /* trail bytes */ | |
1265 | int32_t numSubstitutions; | |
1266 | ||
1267 | /* args check */ | |
1268 | if(U_FAILURE(*pErrorCode)){ | |
1269 | return NULL; | |
1270 | } | |
1271 | if( (src==NULL && srcLength!=0) || srcLength < -1 || | |
1272 | (dest==NULL && destCapacity!=0) || destCapacity<0 || | |
1273 | subchar > 0x10ffff || U_IS_SURROGATE(subchar) | |
1274 | ) { | |
1275 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
1276 | return NULL; | |
1277 | } | |
1278 | ||
1279 | if(pNumSubstitutions!=NULL) { | |
1280 | *pNumSubstitutions=0; | |
1281 | } | |
1282 | numSubstitutions=0; | |
1283 | ||
1284 | if(srcLength < 0) { | |
1285 | /* | |
1286 | * Transform a NUL-terminated ASCII string. | |
1287 | * Handle non-ASCII strings with slower code. | |
1288 | */ | |
1289 | while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) { | |
1290 | *pDest++=(UChar)ch; | |
1291 | ++pSrc; | |
1292 | } | |
1293 | if(ch == 0) { | |
1294 | reqLength=(int32_t)(pDest - dest); | |
1295 | if(pDestLength) { | |
1296 | *pDestLength = reqLength; | |
1297 | } | |
1298 | ||
1299 | /* Terminate the buffer */ | |
1300 | u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); | |
1301 | return dest; | |
1302 | } | |
1303 | srcLength = uprv_strlen((const char *)pSrc); | |
1304 | } | |
1305 | ||
1306 | /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ | |
1307 | pSrcLimit = pSrc + srcLength; | |
1308 | for(;;) { | |
1309 | count = (int32_t)(pDestLimit - pDest); | |
1310 | srcLength = (int32_t)(pSrcLimit - pSrc); | |
1311 | if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) { | |
1312 | /* fast ASCII loop */ | |
1313 | const uint8_t *prevSrc = pSrc; | |
1314 | int32_t delta; | |
1315 | while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) { | |
1316 | *pDest++=(UChar)ch; | |
1317 | ++pSrc; | |
1318 | } | |
1319 | delta = (int32_t)(pSrc - prevSrc); | |
1320 | count -= delta; | |
1321 | srcLength -= delta; | |
1322 | } | |
1323 | /* | |
1324 | * Each iteration of the inner loop progresses by at most 3 UTF-8 | |
1325 | * bytes and one UChar. | |
1326 | */ | |
1327 | srcLength /= 3; | |
1328 | if(count > srcLength) { | |
1329 | count = srcLength; /* min(remaining dest, remaining src/3) */ | |
1330 | } | |
1331 | if(count < 3) { | |
1332 | /* | |
1333 | * Too much overhead if we get near the end of the string, | |
1334 | * continue with the next loop. | |
1335 | */ | |
1336 | break; | |
1337 | } | |
1338 | do { | |
1339 | ch = *pSrc; | |
1340 | if(ch <= 0x7f){ | |
1341 | *pDest++=(UChar)ch; | |
1342 | ++pSrc; | |
1343 | } else { | |
1344 | if(ch >= 0xe0) { | |
1345 | if( /* handle U+0000..U+FFFF inline */ | |
1346 | ch <= 0xef && | |
1347 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && | |
1348 | (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f | |
1349 | ) { | |
1350 | /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ | |
1351 | *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); | |
1352 | pSrc += 3; | |
1353 | continue; | |
1354 | } | |
1355 | } else { | |
1356 | if( /* handle U+0000..U+07FF inline */ | |
1357 | ch >= 0xc0 && | |
1358 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f | |
1359 | ) { | |
1360 | *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); | |
1361 | pSrc += 2; | |
1362 | continue; | |
1363 | } | |
1364 | } | |
1365 | ||
1366 | if(subchar < 0) { | |
1367 | *pErrorCode = U_INVALID_CHAR_FOUND; | |
1368 | return NULL; | |
1369 | } else if(subchar > 0xffff && --count == 0) { | |
1370 | /* | |
1371 | * We need to write two UChars, adjusted count for that, | |
1372 | * and ran out of space. | |
1373 | */ | |
1374 | break; | |
1375 | } else { | |
1376 | /* function call for error cases */ | |
1377 | ++pSrc; /* continue after the lead byte */ | |
1378 | utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); | |
1379 | ++numSubstitutions; | |
1380 | if(subchar<=0xFFFF) { | |
1381 | *(pDest++)=(UChar)subchar; | |
1382 | } else { | |
1383 | *(pDest++)=U16_LEAD(subchar); | |
1384 | *(pDest++)=U16_TRAIL(subchar); | |
1385 | } | |
1386 | } | |
1387 | } | |
1388 | } while(--count > 0); | |
1389 | } | |
1390 | ||
1391 | while((pSrc<pSrcLimit) && (pDest<pDestLimit)) { | |
1392 | ch = *pSrc; | |
1393 | if(ch <= 0x7f){ | |
1394 | *pDest++=(UChar)ch; | |
1395 | ++pSrc; | |
1396 | } else { | |
1397 | if(ch >= 0xe0) { | |
1398 | if( /* handle U+0000..U+FFFF inline */ | |
1399 | ch <= 0xef && | |
1400 | ((pSrcLimit - pSrc) >= 3) && | |
1401 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && | |
1402 | (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f | |
1403 | ) { | |
1404 | /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ | |
1405 | *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); | |
1406 | pSrc += 3; | |
1407 | continue; | |
1408 | } | |
1409 | } else { | |
1410 | if( /* handle U+0000..U+07FF inline */ | |
1411 | ch >= 0xc0 && | |
1412 | ((pSrcLimit - pSrc) >= 2) && | |
1413 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f | |
1414 | ) { | |
1415 | *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); | |
1416 | pSrc += 2; | |
1417 | continue; | |
1418 | } | |
1419 | } | |
1420 | ||
1421 | if(subchar < 0) { | |
1422 | *pErrorCode = U_INVALID_CHAR_FOUND; | |
1423 | return NULL; | |
1424 | } else { | |
1425 | /* function call for error cases */ | |
1426 | ++pSrc; /* continue after the lead byte */ | |
1427 | utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); | |
1428 | ++numSubstitutions; | |
1429 | if(subchar<=0xFFFF) { | |
1430 | *(pDest++)=(UChar)subchar; | |
1431 | } else { | |
1432 | *(pDest++)=U16_LEAD(subchar); | |
1433 | if(pDest<pDestLimit) { | |
1434 | *(pDest++)=U16_TRAIL(subchar); | |
1435 | } else { | |
1436 | reqLength++; | |
1437 | break; | |
1438 | } | |
1439 | } | |
1440 | } | |
1441 | } | |
1442 | } | |
1443 | ||
1444 | /* do not fill the dest buffer just count the UChars needed */ | |
1445 | while(pSrc < pSrcLimit){ | |
1446 | ch = *pSrc; | |
1447 | if(ch <= 0x7f) { | |
1448 | reqLength++; | |
1449 | ++pSrc; | |
1450 | } else { | |
1451 | if(ch >= 0xe0) { | |
1452 | if( /* handle U+0000..U+FFFF inline */ | |
1453 | ch <= 0xef && | |
1454 | ((pSrcLimit - pSrc) >= 3) && | |
1455 | (uint8_t)(pSrc[1] - 0x80) <= 0x3f && | |
1456 | (uint8_t)(pSrc[2] - 0x80) <= 0x3f | |
1457 | ) { | |
1458 | reqLength++; | |
1459 | pSrc += 3; | |
1460 | continue; | |
1461 | } | |
1462 | } else { | |
1463 | if( /* handle U+0000..U+07FF inline */ | |
1464 | ch >= 0xc0 && | |
1465 | ((pSrcLimit - pSrc) >= 2) && | |
1466 | (uint8_t)(pSrc[1] - 0x80) <= 0x3f | |
1467 | ) { | |
1468 | reqLength++; | |
1469 | pSrc += 2; | |
1470 | continue; | |
1471 | } | |
1472 | } | |
1473 | ||
1474 | if(subchar < 0) { | |
1475 | *pErrorCode = U_INVALID_CHAR_FOUND; | |
1476 | return NULL; | |
1477 | } else { | |
1478 | /* function call for error cases */ | |
1479 | ++pSrc; /* continue after the lead byte */ | |
1480 | utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); | |
1481 | ++numSubstitutions; | |
1482 | reqLength+=U16_LENGTH(ch); | |
1483 | } | |
1484 | } | |
1485 | } | |
1486 | ||
1487 | if(pNumSubstitutions!=NULL) { | |
1488 | *pNumSubstitutions=numSubstitutions; | |
1489 | } | |
1490 | ||
1491 | reqLength+=(int32_t)(pDest - dest); | |
1492 | if(pDestLength) { | |
1493 | *pDestLength = reqLength; | |
1494 | } | |
1495 | ||
1496 | /* Terminate the buffer */ | |
1497 | u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); | |
1498 | return dest; | |
1499 | } | |
1500 | ||
1501 | U_CAPI char* U_EXPORT2 | |
1502 | u_strToJavaModifiedUTF8( | |
1503 | char *dest, | |
1504 | int32_t destCapacity, | |
1505 | int32_t *pDestLength, | |
1506 | const UChar *src, | |
1507 | int32_t srcLength, | |
1508 | UErrorCode *pErrorCode) { | |
1509 | int32_t reqLength=0; | |
1510 | uint32_t ch=0; | |
1511 | uint8_t *pDest = (uint8_t *)dest; | |
1512 | uint8_t *pDestLimit = pDest + destCapacity; | |
1513 | const UChar *pSrcLimit; | |
1514 | int32_t count; | |
1515 | ||
1516 | /* args check */ | |
1517 | if(U_FAILURE(*pErrorCode)){ | |
1518 | return NULL; | |
1519 | } | |
1520 | if( (src==NULL && srcLength!=0) || srcLength < -1 || | |
1521 | (dest==NULL && destCapacity!=0) || destCapacity<0 | |
1522 | ) { | |
1523 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
1524 | return NULL; | |
1525 | } | |
1526 | ||
1527 | if(srcLength==-1) { | |
1528 | /* Convert NUL-terminated ASCII, then find the string length. */ | |
1529 | while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) { | |
1530 | *pDest++ = (uint8_t)ch; | |
1531 | ++src; | |
1532 | } | |
1533 | if(ch == 0) { | |
1534 | reqLength=(int32_t)(pDest - (uint8_t *)dest); | |
1535 | if(pDestLength) { | |
1536 | *pDestLength = reqLength; | |
1537 | } | |
1538 | ||
1539 | /* Terminate the buffer */ | |
1540 | u_terminateChars(dest, destCapacity, reqLength, pErrorCode); | |
1541 | return dest; | |
1542 | } | |
1543 | srcLength = u_strlen(src); | |
1544 | } | |
1545 | ||
1546 | /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ | |
1547 | pSrcLimit = src+srcLength; | |
1548 | for(;;) { | |
1549 | count = (int32_t)(pDestLimit - pDest); | |
1550 | srcLength = (int32_t)(pSrcLimit - src); | |
1551 | if(count >= srcLength && srcLength > 0 && *src <= 0x7f) { | |
1552 | /* fast ASCII loop */ | |
1553 | const UChar *prevSrc = src; | |
1554 | int32_t delta; | |
1555 | while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) { | |
1556 | *pDest++=(uint8_t)ch; | |
1557 | ++src; | |
1558 | } | |
1559 | delta = (int32_t)(src - prevSrc); | |
1560 | count -= delta; | |
1561 | srcLength -= delta; | |
1562 | } | |
1563 | /* | |
1564 | * Each iteration of the inner loop progresses by at most 3 UTF-8 | |
1565 | * bytes and one UChar. | |
1566 | */ | |
1567 | count /= 3; | |
1568 | if(count > srcLength) { | |
1569 | count = srcLength; /* min(remaining dest/3, remaining src) */ | |
1570 | } | |
1571 | if(count < 3) { | |
1572 | /* | |
1573 | * Too much overhead if we get near the end of the string, | |
1574 | * continue with the next loop. | |
1575 | */ | |
1576 | break; | |
1577 | } | |
1578 | do { | |
1579 | ch=*src++; | |
1580 | if(ch <= 0x7f && ch != 0) { | |
1581 | *pDest++ = (uint8_t)ch; | |
1582 | } else if(ch <= 0x7ff) { | |
1583 | *pDest++=(uint8_t)((ch>>6)|0xc0); | |
1584 | *pDest++=(uint8_t)((ch&0x3f)|0x80); | |
1585 | } else { | |
1586 | *pDest++=(uint8_t)((ch>>12)|0xe0); | |
1587 | *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); | |
1588 | *pDest++=(uint8_t)((ch&0x3f)|0x80); | |
1589 | } | |
1590 | } while(--count > 0); | |
1591 | } | |
1592 | ||
1593 | while(src<pSrcLimit) { | |
1594 | ch=*src++; | |
1595 | if(ch <= 0x7f && ch != 0) { | |
1596 | if(pDest<pDestLimit) { | |
1597 | *pDest++ = (uint8_t)ch; | |
1598 | } else { | |
1599 | reqLength = 1; | |
1600 | break; | |
1601 | } | |
1602 | } else if(ch <= 0x7ff) { | |
1603 | if((pDestLimit - pDest) >= 2) { | |
1604 | *pDest++=(uint8_t)((ch>>6)|0xc0); | |
1605 | *pDest++=(uint8_t)((ch&0x3f)|0x80); | |
1606 | } else { | |
1607 | reqLength = 2; | |
1608 | break; | |
1609 | } | |
1610 | } else { | |
1611 | if((pDestLimit - pDest) >= 3) { | |
1612 | *pDest++=(uint8_t)((ch>>12)|0xe0); | |
1613 | *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); | |
1614 | *pDest++=(uint8_t)((ch&0x3f)|0x80); | |
1615 | } else { | |
1616 | reqLength = 3; | |
1617 | break; | |
1618 | } | |
1619 | } | |
1620 | } | |
1621 | while(src<pSrcLimit) { | |
1622 | ch=*src++; | |
1623 | if(ch <= 0x7f && ch != 0) { | |
1624 | ++reqLength; | |
1625 | } else if(ch<=0x7ff) { | |
1626 | reqLength+=2; | |
1627 | } else { | |
1628 | reqLength+=3; | |
1629 | } | |
1630 | } | |
1631 | ||
1632 | reqLength+=(int32_t)(pDest - (uint8_t *)dest); | |
1633 | if(pDestLength){ | |
1634 | *pDestLength = reqLength; | |
1635 | } | |
1636 | ||
1637 | /* Terminate the buffer */ | |
1638 | u_terminateChars(dest, destCapacity, reqLength, pErrorCode); | |
1639 | return dest; | |
1640 | } |