]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ustrtrns.cpp
ICU-59173.0.1.tar.gz
[apple/icu.git] / icuSources / common / ustrtrns.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 * Copyright (C) 2001-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 ******************************************************************************
10 *
11 * File ustrtrns.cpp
12 *
13 * Modification History:
14 *
15 * Date Name Description
16 * 9/10/2001 Ram Creation.
17 ******************************************************************************
18 */
19
20 /*******************************************************************************
21 *
22 * u_strTo* and u_strFrom* APIs
23 * WCS functions moved to ustr_wcs.c for better modularization
24 *
25 *******************************************************************************
26 */
27
28
29 #include "unicode/putil.h"
30 #include "unicode/ustring.h"
31 #include "unicode/utf.h"
32 #include "unicode/utf8.h"
33 #include "unicode/utf16.h"
34 #include "cstring.h"
35 #include "cmemory.h"
36 #include "ustr_imp.h"
37 #include "uassert.h"
38
39 U_CAPI UChar* U_EXPORT2
40 u_strFromUTF32WithSub(UChar *dest,
41 int32_t destCapacity,
42 int32_t *pDestLength,
43 const UChar32 *src,
44 int32_t srcLength,
45 UChar32 subchar, int32_t *pNumSubstitutions,
46 UErrorCode *pErrorCode) {
47 const UChar32 *srcLimit;
48 UChar32 ch;
49 UChar *destLimit;
50 UChar *pDest;
51 int32_t reqLength;
52 int32_t numSubstitutions;
53
54 /* args check */
55 if(U_FAILURE(*pErrorCode)){
56 return NULL;
57 }
58 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
59 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
60 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
61 ) {
62 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
63 return NULL;
64 }
65
66 if(pNumSubstitutions != NULL) {
67 *pNumSubstitutions = 0;
68 }
69
70 pDest = dest;
71 destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
72 reqLength = 0;
73 numSubstitutions = 0;
74
75 if(srcLength < 0) {
76 /* simple loop for conversion of a NUL-terminated BMP string */
77 while((ch=*src) != 0 &&
78 ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
79 ++src;
80 if(pDest < destLimit) {
81 *pDest++ = (UChar)ch;
82 } else {
83 ++reqLength;
84 }
85 }
86 srcLimit = src;
87 if(ch != 0) {
88 /* "complicated" case, find the end of the remaining string */
89 while(*++srcLimit != 0) {}
90 }
91 } else {
92 srcLimit = (src!=NULL)?(src + srcLength):NULL;
93 }
94
95 /* convert with length */
96 while(src < srcLimit) {
97 ch = *src++;
98 do {
99 /* usually "loops" once; twice only for writing subchar */
100 if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
101 if(pDest < destLimit) {
102 *pDest++ = (UChar)ch;
103 } else {
104 ++reqLength;
105 }
106 break;
107 } else if(0x10000 <= ch && ch <= 0x10ffff) {
108 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
109 *pDest++ = U16_LEAD(ch);
110 *pDest++ = U16_TRAIL(ch);
111 } else {
112 reqLength += 2;
113 }
114 break;
115 } else if((ch = subchar) < 0) {
116 /* surrogate code point, or not a Unicode code point at all */
117 *pErrorCode = U_INVALID_CHAR_FOUND;
118 return NULL;
119 } else {
120 ++numSubstitutions;
121 }
122 } while(TRUE);
123 }
124
125 reqLength += (int32_t)(pDest - dest);
126 if(pDestLength) {
127 *pDestLength = reqLength;
128 }
129 if(pNumSubstitutions != NULL) {
130 *pNumSubstitutions = numSubstitutions;
131 }
132
133 /* Terminate the buffer */
134 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
135
136 return dest;
137 }
138
139 U_CAPI UChar* U_EXPORT2
140 u_strFromUTF32(UChar *dest,
141 int32_t destCapacity,
142 int32_t *pDestLength,
143 const UChar32 *src,
144 int32_t srcLength,
145 UErrorCode *pErrorCode) {
146 return u_strFromUTF32WithSub(
147 dest, destCapacity, pDestLength,
148 src, srcLength,
149 U_SENTINEL, NULL,
150 pErrorCode);
151 }
152
153 U_CAPI UChar32* U_EXPORT2
154 u_strToUTF32WithSub(UChar32 *dest,
155 int32_t destCapacity,
156 int32_t *pDestLength,
157 const UChar *src,
158 int32_t srcLength,
159 UChar32 subchar, int32_t *pNumSubstitutions,
160 UErrorCode *pErrorCode) {
161 const UChar *srcLimit;
162 UChar32 ch;
163 UChar ch2;
164 UChar32 *destLimit;
165 UChar32 *pDest;
166 int32_t reqLength;
167 int32_t numSubstitutions;
168
169 /* args check */
170 if(U_FAILURE(*pErrorCode)){
171 return NULL;
172 }
173 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
174 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
175 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
176 ) {
177 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
178 return NULL;
179 }
180
181 if(pNumSubstitutions != NULL) {
182 *pNumSubstitutions = 0;
183 }
184
185 pDest = dest;
186 destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
187 reqLength = 0;
188 numSubstitutions = 0;
189
190 if(srcLength < 0) {
191 /* simple loop for conversion of a NUL-terminated BMP string */
192 while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
193 ++src;
194 if(pDest < destLimit) {
195 *pDest++ = ch;
196 } else {
197 ++reqLength;
198 }
199 }
200 srcLimit = src;
201 if(ch != 0) {
202 /* "complicated" case, find the end of the remaining string */
203 while(*++srcLimit != 0) {}
204 }
205 } else {
206 srcLimit = (src!=NULL)?(src + srcLength):NULL;
207 }
208
209 /* convert with length */
210 while(src < srcLimit) {
211 ch = *src++;
212 if(!U16_IS_SURROGATE(ch)) {
213 /* write or count ch below */
214 } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
215 ++src;
216 ch = U16_GET_SUPPLEMENTARY(ch, ch2);
217 } else if((ch = subchar) < 0) {
218 /* unpaired surrogate */
219 *pErrorCode = U_INVALID_CHAR_FOUND;
220 return NULL;
221 } else {
222 ++numSubstitutions;
223 }
224 if(pDest < destLimit) {
225 *pDest++ = ch;
226 } else {
227 ++reqLength;
228 }
229 }
230
231 reqLength += (int32_t)(pDest - dest);
232 if(pDestLength) {
233 *pDestLength = reqLength;
234 }
235 if(pNumSubstitutions != NULL) {
236 *pNumSubstitutions = numSubstitutions;
237 }
238
239 /* Terminate the buffer */
240 u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
241
242 return dest;
243 }
244
245 U_CAPI UChar32* U_EXPORT2
246 u_strToUTF32(UChar32 *dest,
247 int32_t destCapacity,
248 int32_t *pDestLength,
249 const UChar *src,
250 int32_t srcLength,
251 UErrorCode *pErrorCode) {
252 return u_strToUTF32WithSub(
253 dest, destCapacity, pDestLength,
254 src, srcLength,
255 U_SENTINEL, NULL,
256 pErrorCode);
257 }
258
259 /* for utf8_nextCharSafeBodyTerminated() */
260 static const UChar32
261 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
262
263 /*
264 * Version of utf8_nextCharSafeBody() with the following differences:
265 * - checks for NUL termination instead of length
266 * - works with pointers instead of indexes
267 * - always strict (strict==-1)
268 *
269 * *ps points to after the lead byte and will be moved to after the last trail byte.
270 * c is the lead byte.
271 * @return the code point, or U_SENTINEL
272 */
273 static UChar32
274 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
275 const uint8_t *s=*ps;
276 uint8_t trail, illegal=0;
277 uint8_t count=U8_COUNT_TRAIL_BYTES(c);
278 U_ASSERT(count<6);
279 U8_MASK_LEAD_BYTE((c), count);
280 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
281 switch(count) {
282 /* each branch falls through to the next one */
283 case 5:
284 case 4:
285 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
286 illegal=1;
287 break;
288 case 3:
289 trail=(uint8_t)(*s++ - 0x80);
290 c=(c<<6)|trail;
291 if(trail>0x3f || c>=0x110) {
292 /* not a trail byte, or code point>0x10ffff (outside Unicode) */
293 illegal=1;
294 break;
295 }
296 U_FALLTHROUGH;
297 case 2:
298 trail=(uint8_t)(*s++ - 0x80);
299 if(trail>0x3f) {
300 /* not a trail byte */
301 illegal=1;
302 break;
303 }
304 c=(c<<6)|trail;
305 U_FALLTHROUGH;
306 case 1:
307 trail=(uint8_t)(*s++ - 0x80);
308 if(trail>0x3f) {
309 /* not a trail byte */
310 illegal=1;
311 }
312 c=(c<<6)|trail;
313 break;
314 case 0:
315 return U_SENTINEL;
316 /* no default branch to optimize switch() - all values are covered */
317 }
318
319 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
320 /* illegal is also set if count>=4 */
321 if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
322 /* error handling */
323 /* don't go beyond this sequence */
324 s=*ps;
325 while(count>0 && U8_IS_TRAIL(*s)) {
326 ++s;
327 --count;
328 }
329 c=U_SENTINEL;
330 }
331 *ps=s;
332 return c;
333 }
334
335 /*
336 * Version of utf8_nextCharSafeBody() with the following differences:
337 * - works with pointers instead of indexes
338 * - always strict (strict==-1)
339 *
340 * *ps points to after the lead byte and will be moved to after the last trail byte.
341 * c is the lead byte.
342 * @return the code point, or U_SENTINEL
343 */
344 static UChar32
345 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
346 const uint8_t *s=*ps;
347 uint8_t trail, illegal=0;
348 uint8_t count=U8_COUNT_TRAIL_BYTES(c);
349 if((limit-s)>=count) {
350 U8_MASK_LEAD_BYTE((c), count);
351 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
352 switch(count) {
353 /* each branch falls through to the next one */
354 case 5:
355 case 4:
356 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
357 illegal=1;
358 break;
359 case 3:
360 trail=*s++;
361 c=(c<<6)|(trail&0x3f);
362 if(c<0x110) {
363 illegal|=(trail&0xc0)^0x80;
364 } else {
365 /* code point>0x10ffff, outside Unicode */
366 illegal=1;
367 break;
368 }
369 U_FALLTHROUGH;
370 case 2:
371 trail=*s++;
372 c=(c<<6)|(trail&0x3f);
373 illegal|=(trail&0xc0)^0x80;
374 U_FALLTHROUGH;
375 case 1:
376 trail=*s++;
377 c=(c<<6)|(trail&0x3f);
378 illegal|=(trail&0xc0)^0x80;
379 break;
380 case 0:
381 return U_SENTINEL;
382 /* no default branch to optimize switch() - all values are covered */
383 }
384 } else {
385 illegal=1; /* too few bytes left */
386 }
387
388 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
389 /* illegal is also set if count>=4 */
390 U_ASSERT(illegal || count<UPRV_LENGTHOF(utf8_minLegal));
391 if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
392 /* error handling */
393 /* don't go beyond this sequence */
394 s=*ps;
395 while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
396 ++s;
397 --count;
398 }
399 c=U_SENTINEL;
400 }
401 *ps=s;
402 return c;
403 }
404
405 U_CAPI UChar* U_EXPORT2
406 u_strFromUTF8WithSub(UChar *dest,
407 int32_t destCapacity,
408 int32_t *pDestLength,
409 const char* src,
410 int32_t srcLength,
411 UChar32 subchar, int32_t *pNumSubstitutions,
412 UErrorCode *pErrorCode){
413 UChar *pDest = dest;
414 UChar *pDestLimit = dest+destCapacity;
415 UChar32 ch;
416 int32_t reqLength = 0;
417 const uint8_t* pSrc = (const uint8_t*) src;
418 uint8_t t1, t2; /* trail bytes */
419 int32_t numSubstitutions;
420
421 /* args check */
422 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
423 return NULL;
424 }
425
426 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
427 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
428 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
429 ) {
430 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
431 return NULL;
432 }
433
434 if(pNumSubstitutions!=NULL) {
435 *pNumSubstitutions=0;
436 }
437 numSubstitutions=0;
438
439 /*
440 * Inline processing of UTF-8 byte sequences:
441 *
442 * Byte sequences for the most common characters are handled inline in
443 * the conversion loops. In order to reduce the path lengths for those
444 * characters, the tests are arranged in a kind of binary search.
445 * ASCII (<=0x7f) is checked first, followed by the dividing point
446 * between 2- and 3-byte sequences (0xe0).
447 * The 3-byte branch is tested first to speed up CJK text.
448 * The compiler should combine the subtractions for the two tests for 0xe0.
449 * Each branch then tests for the other end of its range.
450 */
451
452 if(srcLength < 0){
453 /*
454 * Transform a NUL-terminated string.
455 * The code explicitly checks for NULs only in the lead byte position.
456 * A NUL byte in the trail byte position fails the trail byte range check anyway.
457 */
458 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
459 if(ch <= 0x7f){
460 *pDest++=(UChar)ch;
461 ++pSrc;
462 } else {
463 if(ch > 0xe0) {
464 if( /* handle U+1000..U+CFFF inline */
465 ch <= 0xec &&
466 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
467 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
468 ) {
469 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
470 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
471 pSrc += 3;
472 continue;
473 }
474 } else if(ch < 0xe0) {
475 if( /* handle U+0080..U+07FF inline */
476 ch >= 0xc2 &&
477 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
478 ) {
479 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
480 pSrc += 2;
481 continue;
482 }
483 }
484
485 /* function call for "complicated" and error cases */
486 ++pSrc; /* continue after the lead byte */
487 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
488 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
489 *pErrorCode = U_INVALID_CHAR_FOUND;
490 return NULL;
491 } else if(ch<=0xFFFF) {
492 *(pDest++)=(UChar)ch;
493 } else {
494 *(pDest++)=U16_LEAD(ch);
495 if(pDest<pDestLimit) {
496 *(pDest++)=U16_TRAIL(ch);
497 } else {
498 reqLength++;
499 break;
500 }
501 }
502 }
503 }
504
505 /* Pre-flight the rest of the string. */
506 while((ch = *pSrc) != 0) {
507 if(ch <= 0x7f){
508 ++reqLength;
509 ++pSrc;
510 } else {
511 if(ch > 0xe0) {
512 if( /* handle U+1000..U+CFFF inline */
513 ch <= 0xec &&
514 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
515 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
516 ) {
517 ++reqLength;
518 pSrc += 3;
519 continue;
520 }
521 } else if(ch < 0xe0) {
522 if( /* handle U+0080..U+07FF inline */
523 ch >= 0xc2 &&
524 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
525 ) {
526 ++reqLength;
527 pSrc += 2;
528 continue;
529 }
530 }
531
532 /* function call for "complicated" and error cases */
533 ++pSrc; /* continue after the lead byte */
534 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
535 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
536 *pErrorCode = U_INVALID_CHAR_FOUND;
537 return NULL;
538 }
539 reqLength += U16_LENGTH(ch);
540 }
541 }
542 } else /* srcLength >= 0 */ {
543 const uint8_t *pSrcLimit = pSrc + srcLength;
544 int32_t count;
545
546 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
547 for(;;) {
548 /*
549 * Each iteration of the inner loop progresses by at most 3 UTF-8
550 * bytes and one UChar, for most characters.
551 * For supplementary code points (4 & 2), which are rare,
552 * there is an additional adjustment.
553 */
554 count = (int32_t)(pDestLimit - pDest);
555 srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
556 if(count > srcLength) {
557 count = srcLength; /* min(remaining dest, remaining src/3) */
558 }
559 if(count < 3) {
560 /*
561 * Too much overhead if we get near the end of the string,
562 * continue with the next loop.
563 */
564 break;
565 }
566
567 do {
568 ch = *pSrc;
569 if(ch <= 0x7f){
570 *pDest++=(UChar)ch;
571 ++pSrc;
572 } else {
573 if(ch > 0xe0) {
574 if( /* handle U+1000..U+CFFF inline */
575 ch <= 0xec &&
576 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
577 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
578 ) {
579 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
580 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
581 pSrc += 3;
582 continue;
583 }
584 } else if(ch < 0xe0) {
585 if( /* handle U+0080..U+07FF inline */
586 ch >= 0xc2 &&
587 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
588 ) {
589 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
590 pSrc += 2;
591 continue;
592 }
593 }
594
595 if(ch >= 0xf0 || subchar > 0xffff) {
596 /*
597 * We may read up to six bytes and write up to two UChars,
598 * which we didn't account for with computing count,
599 * so we adjust it here.
600 */
601 if(--count == 0) {
602 break;
603 }
604 }
605
606 /* function call for "complicated" and error cases */
607 ++pSrc; /* continue after the lead byte */
608 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
609 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
610 *pErrorCode = U_INVALID_CHAR_FOUND;
611 return NULL;
612 }else if(ch<=0xFFFF){
613 *(pDest++)=(UChar)ch;
614 }else{
615 *(pDest++)=U16_LEAD(ch);
616 *(pDest++)=U16_TRAIL(ch);
617 }
618 }
619 } while(--count > 0);
620 }
621
622 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
623 ch = *pSrc;
624 if(ch <= 0x7f){
625 *pDest++=(UChar)ch;
626 ++pSrc;
627 } else {
628 if(ch > 0xe0) {
629 if( /* handle U+1000..U+CFFF inline */
630 ch <= 0xec &&
631 ((pSrcLimit - pSrc) >= 3) &&
632 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
633 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
634 ) {
635 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
636 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
637 pSrc += 3;
638 continue;
639 }
640 } else if(ch < 0xe0) {
641 if( /* handle U+0080..U+07FF inline */
642 ch >= 0xc2 &&
643 ((pSrcLimit - pSrc) >= 2) &&
644 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
645 ) {
646 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
647 pSrc += 2;
648 continue;
649 }
650 }
651
652 /* function call for "complicated" and error cases */
653 ++pSrc; /* continue after the lead byte */
654 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
655 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
656 *pErrorCode = U_INVALID_CHAR_FOUND;
657 return NULL;
658 }else if(ch<=0xFFFF){
659 *(pDest++)=(UChar)ch;
660 }else{
661 *(pDest++)=U16_LEAD(ch);
662 if(pDest<pDestLimit){
663 *(pDest++)=U16_TRAIL(ch);
664 }else{
665 reqLength++;
666 break;
667 }
668 }
669 }
670 }
671 /* do not fill the dest buffer just count the UChars needed */
672 while(pSrc < pSrcLimit){
673 ch = *pSrc;
674 if(ch <= 0x7f){
675 reqLength++;
676 ++pSrc;
677 } else {
678 if(ch > 0xe0) {
679 if( /* handle U+1000..U+CFFF inline */
680 ch <= 0xec &&
681 ((pSrcLimit - pSrc) >= 3) &&
682 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
683 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
684 ) {
685 reqLength++;
686 pSrc += 3;
687 continue;
688 }
689 } else if(ch < 0xe0) {
690 if( /* handle U+0080..U+07FF inline */
691 ch >= 0xc2 &&
692 ((pSrcLimit - pSrc) >= 2) &&
693 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
694 ) {
695 reqLength++;
696 pSrc += 2;
697 continue;
698 }
699 }
700
701 /* function call for "complicated" and error cases */
702 ++pSrc; /* continue after the lead byte */
703 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
704 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
705 *pErrorCode = U_INVALID_CHAR_FOUND;
706 return NULL;
707 }
708 reqLength+=U16_LENGTH(ch);
709 }
710 }
711 }
712
713 reqLength+=(int32_t)(pDest - dest);
714
715 if(pNumSubstitutions!=NULL) {
716 *pNumSubstitutions=numSubstitutions;
717 }
718
719 if(pDestLength){
720 *pDestLength = reqLength;
721 }
722
723 /* Terminate the buffer */
724 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
725
726 return dest;
727 }
728
729 U_CAPI UChar* U_EXPORT2
730 u_strFromUTF8(UChar *dest,
731 int32_t destCapacity,
732 int32_t *pDestLength,
733 const char* src,
734 int32_t srcLength,
735 UErrorCode *pErrorCode){
736 return u_strFromUTF8WithSub(
737 dest, destCapacity, pDestLength,
738 src, srcLength,
739 U_SENTINEL, NULL,
740 pErrorCode);
741 }
742
743 U_CAPI UChar * U_EXPORT2
744 u_strFromUTF8Lenient(UChar *dest,
745 int32_t destCapacity,
746 int32_t *pDestLength,
747 const char *src,
748 int32_t srcLength,
749 UErrorCode *pErrorCode) {
750 UChar *pDest = dest;
751 UChar32 ch;
752 int32_t reqLength = 0;
753 uint8_t* pSrc = (uint8_t*) src;
754
755 /* args check */
756 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
757 return NULL;
758 }
759
760 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
761 (destCapacity<0) || (dest == NULL && destCapacity > 0)
762 ) {
763 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
764 return NULL;
765 }
766
767 if(srcLength < 0) {
768 /* Transform a NUL-terminated string. */
769 UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
770 uint8_t t1, t2, t3; /* trail bytes */
771
772 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
773 if(ch < 0xc0) {
774 /*
775 * ASCII, or a trail byte in lead position which is treated like
776 * a single-byte sequence for better character boundary
777 * resynchronization after illegal sequences.
778 */
779 *pDest++=(UChar)ch;
780 ++pSrc;
781 continue;
782 } else if(ch < 0xe0) { /* U+0080..U+07FF */
783 if((t1 = pSrc[1]) != 0) {
784 /* 0x3080 = (0xc0 << 6) + 0x80 */
785 *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
786 pSrc += 2;
787 continue;
788 }
789 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
790 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
791 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
792 /* 0x2080 = (0x80 << 6) + 0x80 */
793 *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
794 pSrc += 3;
795 continue;
796 }
797 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
798 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
799 pSrc += 4;
800 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
801 ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
802 *(pDest++) = U16_LEAD(ch);
803 if(pDest < pDestLimit) {
804 *(pDest++) = U16_TRAIL(ch);
805 } else {
806 reqLength = 1;
807 break;
808 }
809 continue;
810 }
811 }
812
813 /* truncated character at the end */
814 *pDest++ = 0xfffd;
815 while(*++pSrc != 0) {}
816 break;
817 }
818
819 /* Pre-flight the rest of the string. */
820 while((ch = *pSrc) != 0) {
821 if(ch < 0xc0) {
822 /*
823 * ASCII, or a trail byte in lead position which is treated like
824 * a single-byte sequence for better character boundary
825 * resynchronization after illegal sequences.
826 */
827 ++reqLength;
828 ++pSrc;
829 continue;
830 } else if(ch < 0xe0) { /* U+0080..U+07FF */
831 if(pSrc[1] != 0) {
832 ++reqLength;
833 pSrc += 2;
834 continue;
835 }
836 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
837 if(pSrc[1] != 0 && pSrc[2] != 0) {
838 ++reqLength;
839 pSrc += 3;
840 continue;
841 }
842 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
843 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
844 reqLength += 2;
845 pSrc += 4;
846 continue;
847 }
848 }
849
850 /* truncated character at the end */
851 ++reqLength;
852 break;
853 }
854 } else /* srcLength >= 0 */ {
855 const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
856
857 /*
858 * This function requires that if srcLength is given, then it must be
859 * destCapatity >= srcLength so that we need not check for
860 * destination buffer overflow in the loop.
861 */
862 if(destCapacity < srcLength) {
863 if(pDestLength != NULL) {
864 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
865 }
866 *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
867 return NULL;
868 }
869
870 if((pSrcLimit - pSrc) >= 4) {
871 pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
872
873 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
874 do {
875 ch = *pSrc++;
876 if(ch < 0xc0) {
877 /*
878 * ASCII, or a trail byte in lead position which is treated like
879 * a single-byte sequence for better character boundary
880 * resynchronization after illegal sequences.
881 */
882 *pDest++=(UChar)ch;
883 } else if(ch < 0xe0) { /* U+0080..U+07FF */
884 /* 0x3080 = (0xc0 << 6) + 0x80 */
885 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
886 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
887 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
888 /* 0x2080 = (0x80 << 6) + 0x80 */
889 ch = (ch << 12) + (*pSrc++ << 6);
890 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
891 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
892 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
893 ch = (ch << 18) + (*pSrc++ << 12);
894 ch += *pSrc++ << 6;
895 ch += *pSrc++ - 0x3c82080;
896 *(pDest++) = U16_LEAD(ch);
897 *(pDest++) = U16_TRAIL(ch);
898 }
899 } while(pSrc < pSrcLimit);
900
901 pSrcLimit += 3; /* restore original pSrcLimit */
902 }
903
904 while(pSrc < pSrcLimit) {
905 ch = *pSrc++;
906 if(ch < 0xc0) {
907 /*
908 * ASCII, or a trail byte in lead position which is treated like
909 * a single-byte sequence for better character boundary
910 * resynchronization after illegal sequences.
911 */
912 *pDest++=(UChar)ch;
913 continue;
914 } else if(ch < 0xe0) { /* U+0080..U+07FF */
915 if(pSrc < pSrcLimit) {
916 /* 0x3080 = (0xc0 << 6) + 0x80 */
917 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
918 continue;
919 }
920 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
921 if((pSrcLimit - pSrc) >= 2) {
922 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
923 /* 0x2080 = (0x80 << 6) + 0x80 */
924 ch = (ch << 12) + (*pSrc++ << 6);
925 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
926 pSrc += 3;
927 continue;
928 }
929 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
930 if((pSrcLimit - pSrc) >= 3) {
931 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
932 ch = (ch << 18) + (*pSrc++ << 12);
933 ch += *pSrc++ << 6;
934 ch += *pSrc++ - 0x3c82080;
935 *(pDest++) = U16_LEAD(ch);
936 *(pDest++) = U16_TRAIL(ch);
937 pSrc += 4;
938 continue;
939 }
940 }
941
942 /* truncated character at the end */
943 *pDest++ = 0xfffd;
944 break;
945 }
946 }
947
948 reqLength+=(int32_t)(pDest - dest);
949
950 if(pDestLength){
951 *pDestLength = reqLength;
952 }
953
954 /* Terminate the buffer */
955 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
956
957 return dest;
958 }
959
960 static inline uint8_t *
961 _appendUTF8(uint8_t *pDest, UChar32 c) {
962 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
963 if((c)<=0x7f) {
964 *pDest++=(uint8_t)c;
965 } else if(c<=0x7ff) {
966 *pDest++=(uint8_t)((c>>6)|0xc0);
967 *pDest++=(uint8_t)((c&0x3f)|0x80);
968 } else if(c<=0xffff) {
969 *pDest++=(uint8_t)((c>>12)|0xe0);
970 *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
971 *pDest++=(uint8_t)(((c)&0x3f)|0x80);
972 } else /* if((uint32_t)(c)<=0x10ffff) */ {
973 *pDest++=(uint8_t)(((c)>>18)|0xf0);
974 *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
975 *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
976 *pDest++=(uint8_t)(((c)&0x3f)|0x80);
977 }
978 return pDest;
979 }
980
981
982 U_CAPI char* U_EXPORT2
983 u_strToUTF8WithSub(char *dest,
984 int32_t destCapacity,
985 int32_t *pDestLength,
986 const UChar *pSrc,
987 int32_t srcLength,
988 UChar32 subchar, int32_t *pNumSubstitutions,
989 UErrorCode *pErrorCode){
990 int32_t reqLength=0;
991 uint32_t ch=0,ch2=0;
992 uint8_t *pDest = (uint8_t *)dest;
993 uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
994 int32_t numSubstitutions;
995
996 /* args check */
997 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
998 return NULL;
999 }
1000
1001 if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
1002 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
1003 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1004 ) {
1005 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1006 return NULL;
1007 }
1008
1009 if(pNumSubstitutions!=NULL) {
1010 *pNumSubstitutions=0;
1011 }
1012 numSubstitutions=0;
1013
1014 if(srcLength==-1) {
1015 while((ch=*pSrc)!=0) {
1016 ++pSrc;
1017 if(ch <= 0x7f) {
1018 if(pDest<pDestLimit) {
1019 *pDest++ = (uint8_t)ch;
1020 } else {
1021 reqLength = 1;
1022 break;
1023 }
1024 } else if(ch <= 0x7ff) {
1025 if((pDestLimit - pDest) >= 2) {
1026 *pDest++=(uint8_t)((ch>>6)|0xc0);
1027 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1028 } else {
1029 reqLength = 2;
1030 break;
1031 }
1032 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1033 if((pDestLimit - pDest) >= 3) {
1034 *pDest++=(uint8_t)((ch>>12)|0xe0);
1035 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1036 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1037 } else {
1038 reqLength = 3;
1039 break;
1040 }
1041 } else /* ch is a surrogate */ {
1042 int32_t length;
1043
1044 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
1045 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1046 ++pSrc;
1047 ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1048 } else if(subchar>=0) {
1049 ch=subchar;
1050 ++numSubstitutions;
1051 } else {
1052 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1053 *pErrorCode = U_INVALID_CHAR_FOUND;
1054 return NULL;
1055 }
1056
1057 length = U8_LENGTH(ch);
1058 if((pDestLimit - pDest) >= length) {
1059 /* convert and append*/
1060 pDest=_appendUTF8(pDest, ch);
1061 } else {
1062 reqLength = length;
1063 break;
1064 }
1065 }
1066 }
1067 while((ch=*pSrc++)!=0) {
1068 if(ch<=0x7f) {
1069 ++reqLength;
1070 } else if(ch<=0x7ff) {
1071 reqLength+=2;
1072 } else if(!U16_IS_SURROGATE(ch)) {
1073 reqLength+=3;
1074 } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1075 ++pSrc;
1076 reqLength+=4;
1077 } else if(subchar>=0) {
1078 reqLength+=U8_LENGTH(subchar);
1079 ++numSubstitutions;
1080 } else {
1081 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1082 *pErrorCode = U_INVALID_CHAR_FOUND;
1083 return NULL;
1084 }
1085 }
1086 } else {
1087 const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
1088 int32_t count;
1089
1090 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1091 for(;;) {
1092 /*
1093 * Each iteration of the inner loop progresses by at most 3 UTF-8
1094 * bytes and one UChar, for most characters.
1095 * For supplementary code points (4 & 2), which are rare,
1096 * there is an additional adjustment.
1097 */
1098 count = (int32_t)((pDestLimit - pDest) / 3);
1099 srcLength = (int32_t)(pSrcLimit - pSrc);
1100 if(count > srcLength) {
1101 count = srcLength; /* min(remaining dest/3, remaining src) */
1102 }
1103 if(count < 3) {
1104 /*
1105 * Too much overhead if we get near the end of the string,
1106 * continue with the next loop.
1107 */
1108 break;
1109 }
1110 do {
1111 ch=*pSrc++;
1112 if(ch <= 0x7f) {
1113 *pDest++ = (uint8_t)ch;
1114 } else if(ch <= 0x7ff) {
1115 *pDest++=(uint8_t)((ch>>6)|0xc0);
1116 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1117 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1118 *pDest++=(uint8_t)((ch>>12)|0xe0);
1119 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1120 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1121 } else /* ch is a surrogate */ {
1122 /*
1123 * We will read two UChars and probably output four bytes,
1124 * which we didn't account for with computing count,
1125 * so we adjust it here.
1126 */
1127 if(--count == 0) {
1128 --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
1129 break; /* recompute count */
1130 }
1131
1132 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1133 ++pSrc;
1134 ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1135
1136 /* writing 4 bytes per 2 UChars is ok */
1137 *pDest++=(uint8_t)((ch>>18)|0xf0);
1138 *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
1139 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1140 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1141 } else {
1142 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1143 if(subchar>=0) {
1144 ch=subchar;
1145 ++numSubstitutions;
1146 } else {
1147 *pErrorCode = U_INVALID_CHAR_FOUND;
1148 return NULL;
1149 }
1150
1151 /* convert and append*/
1152 pDest=_appendUTF8(pDest, ch);
1153 }
1154 }
1155 } while(--count > 0);
1156 }
1157
1158 while(pSrc<pSrcLimit) {
1159 ch=*pSrc++;
1160 if(ch <= 0x7f) {
1161 if(pDest<pDestLimit) {
1162 *pDest++ = (uint8_t)ch;
1163 } else {
1164 reqLength = 1;
1165 break;
1166 }
1167 } else if(ch <= 0x7ff) {
1168 if((pDestLimit - pDest) >= 2) {
1169 *pDest++=(uint8_t)((ch>>6)|0xc0);
1170 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1171 } else {
1172 reqLength = 2;
1173 break;
1174 }
1175 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1176 if((pDestLimit - pDest) >= 3) {
1177 *pDest++=(uint8_t)((ch>>12)|0xe0);
1178 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1179 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1180 } else {
1181 reqLength = 3;
1182 break;
1183 }
1184 } else /* ch is a surrogate */ {
1185 int32_t length;
1186
1187 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1188 ++pSrc;
1189 ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1190 } else if(subchar>=0) {
1191 ch=subchar;
1192 ++numSubstitutions;
1193 } else {
1194 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1195 *pErrorCode = U_INVALID_CHAR_FOUND;
1196 return NULL;
1197 }
1198
1199 length = U8_LENGTH(ch);
1200 if((pDestLimit - pDest) >= length) {
1201 /* convert and append*/
1202 pDest=_appendUTF8(pDest, ch);
1203 } else {
1204 reqLength = length;
1205 break;
1206 }
1207 }
1208 }
1209 while(pSrc<pSrcLimit) {
1210 ch=*pSrc++;
1211 if(ch<=0x7f) {
1212 ++reqLength;
1213 } else if(ch<=0x7ff) {
1214 reqLength+=2;
1215 } else if(!U16_IS_SURROGATE(ch)) {
1216 reqLength+=3;
1217 } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1218 ++pSrc;
1219 reqLength+=4;
1220 } else if(subchar>=0) {
1221 reqLength+=U8_LENGTH(subchar);
1222 ++numSubstitutions;
1223 } else {
1224 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1225 *pErrorCode = U_INVALID_CHAR_FOUND;
1226 return NULL;
1227 }
1228 }
1229 }
1230
1231 reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1232
1233 if(pNumSubstitutions!=NULL) {
1234 *pNumSubstitutions=numSubstitutions;
1235 }
1236
1237 if(pDestLength){
1238 *pDestLength = reqLength;
1239 }
1240
1241 /* Terminate the buffer */
1242 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1243 return dest;
1244 }
1245
1246 U_CAPI char* U_EXPORT2
1247 u_strToUTF8(char *dest,
1248 int32_t destCapacity,
1249 int32_t *pDestLength,
1250 const UChar *pSrc,
1251 int32_t srcLength,
1252 UErrorCode *pErrorCode){
1253 return u_strToUTF8WithSub(
1254 dest, destCapacity, pDestLength,
1255 pSrc, srcLength,
1256 U_SENTINEL, NULL,
1257 pErrorCode);
1258 }
1259
1260 U_CAPI UChar* U_EXPORT2
1261 u_strFromJavaModifiedUTF8WithSub(
1262 UChar *dest,
1263 int32_t destCapacity,
1264 int32_t *pDestLength,
1265 const char *src,
1266 int32_t srcLength,
1267 UChar32 subchar, int32_t *pNumSubstitutions,
1268 UErrorCode *pErrorCode) {
1269 UChar *pDest = dest;
1270 UChar *pDestLimit = dest+destCapacity;
1271 UChar32 ch;
1272 int32_t reqLength = 0;
1273 const uint8_t* pSrc = (const uint8_t*) src;
1274 const uint8_t *pSrcLimit;
1275 int32_t count;
1276 uint8_t t1, t2; /* trail bytes */
1277 int32_t numSubstitutions;
1278
1279 /* args check */
1280 if(U_FAILURE(*pErrorCode)){
1281 return NULL;
1282 }
1283 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1284 (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1285 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1286 ) {
1287 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1288 return NULL;
1289 }
1290
1291 if(pNumSubstitutions!=NULL) {
1292 *pNumSubstitutions=0;
1293 }
1294 numSubstitutions=0;
1295
1296 if(srcLength < 0) {
1297 /*
1298 * Transform a NUL-terminated ASCII string.
1299 * Handle non-ASCII strings with slower code.
1300 */
1301 while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
1302 *pDest++=(UChar)ch;
1303 ++pSrc;
1304 }
1305 if(ch == 0) {
1306 reqLength=(int32_t)(pDest - dest);
1307 if(pDestLength) {
1308 *pDestLength = reqLength;
1309 }
1310
1311 /* Terminate the buffer */
1312 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1313 return dest;
1314 }
1315 srcLength = uprv_strlen((const char *)pSrc);
1316 }
1317
1318 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1319 pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
1320 for(;;) {
1321 count = (int32_t)(pDestLimit - pDest);
1322 srcLength = (int32_t)(pSrcLimit - pSrc);
1323 if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
1324 /* fast ASCII loop */
1325 const uint8_t *prevSrc = pSrc;
1326 int32_t delta;
1327 while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
1328 *pDest++=(UChar)ch;
1329 ++pSrc;
1330 }
1331 delta = (int32_t)(pSrc - prevSrc);
1332 count -= delta;
1333 srcLength -= delta;
1334 }
1335 /*
1336 * Each iteration of the inner loop progresses by at most 3 UTF-8
1337 * bytes and one UChar.
1338 */
1339 srcLength /= 3;
1340 if(count > srcLength) {
1341 count = srcLength; /* min(remaining dest, remaining src/3) */
1342 }
1343 if(count < 3) {
1344 /*
1345 * Too much overhead if we get near the end of the string,
1346 * continue with the next loop.
1347 */
1348 break;
1349 }
1350 do {
1351 ch = *pSrc;
1352 if(ch <= 0x7f){
1353 *pDest++=(UChar)ch;
1354 ++pSrc;
1355 } else {
1356 if(ch >= 0xe0) {
1357 if( /* handle U+0000..U+FFFF inline */
1358 ch <= 0xef &&
1359 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1360 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1361 ) {
1362 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1363 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1364 pSrc += 3;
1365 continue;
1366 }
1367 } else {
1368 if( /* handle U+0000..U+07FF inline */
1369 ch >= 0xc0 &&
1370 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1371 ) {
1372 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1373 pSrc += 2;
1374 continue;
1375 }
1376 }
1377
1378 if(subchar < 0) {
1379 *pErrorCode = U_INVALID_CHAR_FOUND;
1380 return NULL;
1381 } else if(subchar > 0xffff && --count == 0) {
1382 /*
1383 * We need to write two UChars, adjusted count for that,
1384 * and ran out of space.
1385 */
1386 break;
1387 } else {
1388 /* function call for error cases */
1389 ++pSrc; /* continue after the lead byte */
1390 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1391 ++numSubstitutions;
1392 if(subchar<=0xFFFF) {
1393 *(pDest++)=(UChar)subchar;
1394 } else {
1395 *(pDest++)=U16_LEAD(subchar);
1396 *(pDest++)=U16_TRAIL(subchar);
1397 }
1398 }
1399 }
1400 } while(--count > 0);
1401 }
1402
1403 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
1404 ch = *pSrc;
1405 if(ch <= 0x7f){
1406 *pDest++=(UChar)ch;
1407 ++pSrc;
1408 } else {
1409 if(ch >= 0xe0) {
1410 if( /* handle U+0000..U+FFFF inline */
1411 ch <= 0xef &&
1412 ((pSrcLimit - pSrc) >= 3) &&
1413 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1414 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1415 ) {
1416 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1417 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1418 pSrc += 3;
1419 continue;
1420 }
1421 } else {
1422 if( /* handle U+0000..U+07FF inline */
1423 ch >= 0xc0 &&
1424 ((pSrcLimit - pSrc) >= 2) &&
1425 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1426 ) {
1427 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1428 pSrc += 2;
1429 continue;
1430 }
1431 }
1432
1433 if(subchar < 0) {
1434 *pErrorCode = U_INVALID_CHAR_FOUND;
1435 return NULL;
1436 } else {
1437 /* function call for error cases */
1438 ++pSrc; /* continue after the lead byte */
1439 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1440 ++numSubstitutions;
1441 if(subchar<=0xFFFF) {
1442 *(pDest++)=(UChar)subchar;
1443 } else {
1444 *(pDest++)=U16_LEAD(subchar);
1445 if(pDest<pDestLimit) {
1446 *(pDest++)=U16_TRAIL(subchar);
1447 } else {
1448 reqLength++;
1449 break;
1450 }
1451 }
1452 }
1453 }
1454 }
1455
1456 /* do not fill the dest buffer just count the UChars needed */
1457 while(pSrc < pSrcLimit){
1458 ch = *pSrc;
1459 if(ch <= 0x7f) {
1460 reqLength++;
1461 ++pSrc;
1462 } else {
1463 if(ch >= 0xe0) {
1464 if( /* handle U+0000..U+FFFF inline */
1465 ch <= 0xef &&
1466 ((pSrcLimit - pSrc) >= 3) &&
1467 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
1468 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
1469 ) {
1470 reqLength++;
1471 pSrc += 3;
1472 continue;
1473 }
1474 } else {
1475 if( /* handle U+0000..U+07FF inline */
1476 ch >= 0xc0 &&
1477 ((pSrcLimit - pSrc) >= 2) &&
1478 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
1479 ) {
1480 reqLength++;
1481 pSrc += 2;
1482 continue;
1483 }
1484 }
1485
1486 if(subchar < 0) {
1487 *pErrorCode = U_INVALID_CHAR_FOUND;
1488 return NULL;
1489 } else {
1490 /* function call for error cases */
1491 ++pSrc; /* continue after the lead byte */
1492 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1493 ++numSubstitutions;
1494 reqLength+=U16_LENGTH(ch);
1495 }
1496 }
1497 }
1498
1499 if(pNumSubstitutions!=NULL) {
1500 *pNumSubstitutions=numSubstitutions;
1501 }
1502
1503 reqLength+=(int32_t)(pDest - dest);
1504 if(pDestLength) {
1505 *pDestLength = reqLength;
1506 }
1507
1508 /* Terminate the buffer */
1509 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1510 return dest;
1511 }
1512
1513 U_CAPI char* U_EXPORT2
1514 u_strToJavaModifiedUTF8(
1515 char *dest,
1516 int32_t destCapacity,
1517 int32_t *pDestLength,
1518 const UChar *src,
1519 int32_t srcLength,
1520 UErrorCode *pErrorCode) {
1521 int32_t reqLength=0;
1522 uint32_t ch=0;
1523 uint8_t *pDest = (uint8_t *)dest;
1524 uint8_t *pDestLimit = pDest + destCapacity;
1525 const UChar *pSrcLimit;
1526 int32_t count;
1527
1528 /* args check */
1529 if(U_FAILURE(*pErrorCode)){
1530 return NULL;
1531 }
1532 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1533 (dest==NULL && destCapacity!=0) || destCapacity<0
1534 ) {
1535 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1536 return NULL;
1537 }
1538
1539 if(srcLength==-1) {
1540 /* Convert NUL-terminated ASCII, then find the string length. */
1541 while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1542 *pDest++ = (uint8_t)ch;
1543 ++src;
1544 }
1545 if(ch == 0) {
1546 reqLength=(int32_t)(pDest - (uint8_t *)dest);
1547 if(pDestLength) {
1548 *pDestLength = reqLength;
1549 }
1550
1551 /* Terminate the buffer */
1552 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1553 return dest;
1554 }
1555 srcLength = u_strlen(src);
1556 }
1557
1558 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1559 pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
1560 for(;;) {
1561 count = (int32_t)(pDestLimit - pDest);
1562 srcLength = (int32_t)(pSrcLimit - src);
1563 if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1564 /* fast ASCII loop */
1565 const UChar *prevSrc = src;
1566 int32_t delta;
1567 while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1568 *pDest++=(uint8_t)ch;
1569 ++src;
1570 }
1571 delta = (int32_t)(src - prevSrc);
1572 count -= delta;
1573 srcLength -= delta;
1574 }
1575 /*
1576 * Each iteration of the inner loop progresses by at most 3 UTF-8
1577 * bytes and one UChar.
1578 */
1579 count /= 3;
1580 if(count > srcLength) {
1581 count = srcLength; /* min(remaining dest/3, remaining src) */
1582 }
1583 if(count < 3) {
1584 /*
1585 * Too much overhead if we get near the end of the string,
1586 * continue with the next loop.
1587 */
1588 break;
1589 }
1590 do {
1591 ch=*src++;
1592 if(ch <= 0x7f && ch != 0) {
1593 *pDest++ = (uint8_t)ch;
1594 } else if(ch <= 0x7ff) {
1595 *pDest++=(uint8_t)((ch>>6)|0xc0);
1596 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1597 } else {
1598 *pDest++=(uint8_t)((ch>>12)|0xe0);
1599 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1600 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1601 }
1602 } while(--count > 0);
1603 }
1604
1605 while(src<pSrcLimit) {
1606 ch=*src++;
1607 if(ch <= 0x7f && ch != 0) {
1608 if(pDest<pDestLimit) {
1609 *pDest++ = (uint8_t)ch;
1610 } else {
1611 reqLength = 1;
1612 break;
1613 }
1614 } else if(ch <= 0x7ff) {
1615 if((pDestLimit - pDest) >= 2) {
1616 *pDest++=(uint8_t)((ch>>6)|0xc0);
1617 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1618 } else {
1619 reqLength = 2;
1620 break;
1621 }
1622 } else {
1623 if((pDestLimit - pDest) >= 3) {
1624 *pDest++=(uint8_t)((ch>>12)|0xe0);
1625 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1626 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1627 } else {
1628 reqLength = 3;
1629 break;
1630 }
1631 }
1632 }
1633 while(src<pSrcLimit) {
1634 ch=*src++;
1635 if(ch <= 0x7f && ch != 0) {
1636 ++reqLength;
1637 } else if(ch<=0x7ff) {
1638 reqLength+=2;
1639 } else {
1640 reqLength+=3;
1641 }
1642 }
1643
1644 reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1645 if(pDestLength){
1646 *pDestLength = reqLength;
1647 }
1648
1649 /* Terminate the buffer */
1650 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1651 return dest;
1652 }