]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ustrtrns.cpp
ICU-531.48.tar.gz
[apple/icu.git] / icuSources / common / ustrtrns.cpp
1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2001-2013, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 *
9 * File ustrtrns.cpp
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 9/10/2001 Ram Creation.
15 ******************************************************************************
16 */
17
18 /*******************************************************************************
19 *
20 * u_strTo* and u_strFrom* APIs
21 * WCS functions moved to ustr_wcs.c for better modularization
22 *
23 *******************************************************************************
24 */
25
26
27 #include "unicode/putil.h"
28 #include "unicode/ustring.h"
29 #include "unicode/utf.h"
30 #include "unicode/utf8.h"
31 #include "unicode/utf16.h"
32 #include "cstring.h"
33 #include "cmemory.h"
34 #include "ustr_imp.h"
35 #include "uassert.h"
36
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves like a single
 * primary expression, e.g. in "sizeof LENGTHOF(a)".
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
38
39 U_CAPI UChar* U_EXPORT2
40 u_strFromUTF32WithSub(UChar *dest,
41 int32_t destCapacity,
42 int32_t *pDestLength,
43 const UChar32 *src,
44 int32_t srcLength,
45 UChar32 subchar, int32_t *pNumSubstitutions,
46 UErrorCode *pErrorCode) {
47 const UChar32 *srcLimit;
48 UChar32 ch;
49 UChar *destLimit;
50 UChar *pDest;
51 int32_t reqLength;
52 int32_t numSubstitutions;
53
54 /* args check */
55 if(U_FAILURE(*pErrorCode)){
56 return NULL;
57 }
58 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
59 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
60 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
61 ) {
62 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
63 return NULL;
64 }
65
66 if(pNumSubstitutions != NULL) {
67 *pNumSubstitutions = 0;
68 }
69
70 pDest = dest;
71 destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
72 reqLength = 0;
73 numSubstitutions = 0;
74
75 if(srcLength < 0) {
76 /* simple loop for conversion of a NUL-terminated BMP string */
77 while((ch=*src) != 0 &&
78 ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
79 ++src;
80 if(pDest < destLimit) {
81 *pDest++ = (UChar)ch;
82 } else {
83 ++reqLength;
84 }
85 }
86 srcLimit = src;
87 if(ch != 0) {
88 /* "complicated" case, find the end of the remaining string */
89 while(*++srcLimit != 0) {}
90 }
91 } else {
92 srcLimit = (src!=NULL)?(src + srcLength):NULL;
93 }
94
95 /* convert with length */
96 while(src < srcLimit) {
97 ch = *src++;
98 do {
99 /* usually "loops" once; twice only for writing subchar */
100 if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
101 if(pDest < destLimit) {
102 *pDest++ = (UChar)ch;
103 } else {
104 ++reqLength;
105 }
106 break;
107 } else if(0x10000 <= ch && ch <= 0x10ffff) {
108 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
109 *pDest++ = U16_LEAD(ch);
110 *pDest++ = U16_TRAIL(ch);
111 } else {
112 reqLength += 2;
113 }
114 break;
115 } else if((ch = subchar) < 0) {
116 /* surrogate code point, or not a Unicode code point at all */
117 *pErrorCode = U_INVALID_CHAR_FOUND;
118 return NULL;
119 } else {
120 ++numSubstitutions;
121 }
122 } while(TRUE);
123 }
124
125 reqLength += (int32_t)(pDest - dest);
126 if(pDestLength) {
127 *pDestLength = reqLength;
128 }
129 if(pNumSubstitutions != NULL) {
130 *pNumSubstitutions = numSubstitutions;
131 }
132
133 /* Terminate the buffer */
134 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
135
136 return dest;
137 }
138
139 U_CAPI UChar* U_EXPORT2
140 u_strFromUTF32(UChar *dest,
141 int32_t destCapacity,
142 int32_t *pDestLength,
143 const UChar32 *src,
144 int32_t srcLength,
145 UErrorCode *pErrorCode) {
146 return u_strFromUTF32WithSub(
147 dest, destCapacity, pDestLength,
148 src, srcLength,
149 U_SENTINEL, NULL,
150 pErrorCode);
151 }
152
153 U_CAPI UChar32* U_EXPORT2
154 u_strToUTF32WithSub(UChar32 *dest,
155 int32_t destCapacity,
156 int32_t *pDestLength,
157 const UChar *src,
158 int32_t srcLength,
159 UChar32 subchar, int32_t *pNumSubstitutions,
160 UErrorCode *pErrorCode) {
161 const UChar *srcLimit;
162 UChar32 ch;
163 UChar ch2;
164 UChar32 *destLimit;
165 UChar32 *pDest;
166 int32_t reqLength;
167 int32_t numSubstitutions;
168
169 /* args check */
170 if(U_FAILURE(*pErrorCode)){
171 return NULL;
172 }
173 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
174 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
175 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
176 ) {
177 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
178 return NULL;
179 }
180
181 if(pNumSubstitutions != NULL) {
182 *pNumSubstitutions = 0;
183 }
184
185 pDest = dest;
186 destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
187 reqLength = 0;
188 numSubstitutions = 0;
189
190 if(srcLength < 0) {
191 /* simple loop for conversion of a NUL-terminated BMP string */
192 while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
193 ++src;
194 if(pDest < destLimit) {
195 *pDest++ = ch;
196 } else {
197 ++reqLength;
198 }
199 }
200 srcLimit = src;
201 if(ch != 0) {
202 /* "complicated" case, find the end of the remaining string */
203 while(*++srcLimit != 0) {}
204 }
205 } else {
206 srcLimit = (src!=NULL)?(src + srcLength):NULL;
207 }
208
209 /* convert with length */
210 while(src < srcLimit) {
211 ch = *src++;
212 if(!U16_IS_SURROGATE(ch)) {
213 /* write or count ch below */
214 } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
215 ++src;
216 ch = U16_GET_SUPPLEMENTARY(ch, ch2);
217 } else if((ch = subchar) < 0) {
218 /* unpaired surrogate */
219 *pErrorCode = U_INVALID_CHAR_FOUND;
220 return NULL;
221 } else {
222 ++numSubstitutions;
223 }
224 if(pDest < destLimit) {
225 *pDest++ = ch;
226 } else {
227 ++reqLength;
228 }
229 }
230
231 reqLength += (int32_t)(pDest - dest);
232 if(pDestLength) {
233 *pDestLength = reqLength;
234 }
235 if(pNumSubstitutions != NULL) {
236 *pNumSubstitutions = numSubstitutions;
237 }
238
239 /* Terminate the buffer */
240 u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
241
242 return dest;
243 }
244
245 U_CAPI UChar32* U_EXPORT2
246 u_strToUTF32(UChar32 *dest,
247 int32_t destCapacity,
248 int32_t *pDestLength,
249 const UChar *src,
250 int32_t srcLength,
251 UErrorCode *pErrorCode) {
252 return u_strToUTF32WithSub(
253 dest, destCapacity, pDestLength,
254 src, srcLength,
255 U_SENTINEL, NULL,
256 pErrorCode);
257 }
258
259 /* for utf8_nextCharSafeBodyTerminated() */
260 static const UChar32
261 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
262
263 /*
264 * Version of utf8_nextCharSafeBody() with the following differences:
265 * - checks for NUL termination instead of length
266 * - works with pointers instead of indexes
267 * - always strict (strict==-1)
268 *
269 * *ps points to after the lead byte and will be moved to after the last trail byte.
270 * c is the lead byte.
271 * @return the code point, or U_SENTINEL
272 */
273 static UChar32
274 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
275 const uint8_t *s=*ps;
276 uint8_t trail, illegal=0;
277 uint8_t count=U8_COUNT_TRAIL_BYTES(c);
278 U_ASSERT(count<6);
279 U8_MASK_LEAD_BYTE((c), count);
280 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
281 switch(count) {
282 /* each branch falls through to the next one */
283 case 5:
284 case 4:
285 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
286 illegal=1;
287 break;
288 case 3:
289 trail=(uint8_t)(*s++ - 0x80);
290 c=(c<<6)|trail;
291 if(trail>0x3f || c>=0x110) {
292 /* not a trail byte, or code point>0x10ffff (outside Unicode) */
293 illegal=1;
294 break;
295 }
296 case 2: /*fall through*/
297 trail=(uint8_t)(*s++ - 0x80);
298 if(trail>0x3f) {
299 /* not a trail byte */
300 illegal=1;
301 break;
302 }
303 c=(c<<6)|trail;
304 case 1: /*fall through*/
305 trail=(uint8_t)(*s++ - 0x80);
306 if(trail>0x3f) {
307 /* not a trail byte */
308 illegal=1;
309 }
310 c=(c<<6)|trail;
311 break;
312 case 0:
313 return U_SENTINEL;
314 /* no default branch to optimize switch() - all values are covered */
315 }
316
317 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
318 /* illegal is also set if count>=4 */
319 if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
320 /* error handling */
321 /* don't go beyond this sequence */
322 s=*ps;
323 while(count>0 && U8_IS_TRAIL(*s)) {
324 ++s;
325 --count;
326 }
327 c=U_SENTINEL;
328 }
329 *ps=s;
330 return c;
331 }
332
333 /*
334 * Version of utf8_nextCharSafeBody() with the following differences:
335 * - works with pointers instead of indexes
336 * - always strict (strict==-1)
337 *
338 * *ps points to after the lead byte and will be moved to after the last trail byte.
339 * c is the lead byte.
340 * @return the code point, or U_SENTINEL
341 */
342 static UChar32
343 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
344 const uint8_t *s=*ps;
345 uint8_t trail, illegal=0;
346 uint8_t count=U8_COUNT_TRAIL_BYTES(c);
347 if((limit-s)>=count) {
348 U8_MASK_LEAD_BYTE((c), count);
349 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
350 switch(count) {
351 /* each branch falls through to the next one */
352 case 5:
353 case 4:
354 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
355 illegal=1;
356 break;
357 case 3:
358 trail=*s++;
359 c=(c<<6)|(trail&0x3f);
360 if(c<0x110) {
361 illegal|=(trail&0xc0)^0x80;
362 } else {
363 /* code point>0x10ffff, outside Unicode */
364 illegal=1;
365 break;
366 }
367 case 2: /*fall through*/
368 trail=*s++;
369 c=(c<<6)|(trail&0x3f);
370 illegal|=(trail&0xc0)^0x80;
371 case 1: /*fall through*/
372 trail=*s++;
373 c=(c<<6)|(trail&0x3f);
374 illegal|=(trail&0xc0)^0x80;
375 break;
376 case 0:
377 return U_SENTINEL;
378 /* no default branch to optimize switch() - all values are covered */
379 }
380 } else {
381 illegal=1; /* too few bytes left */
382 }
383
384 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
385 /* illegal is also set if count>=4 */
386 U_ASSERT(illegal || count<LENGTHOF(utf8_minLegal));
387 if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
388 /* error handling */
389 /* don't go beyond this sequence */
390 s=*ps;
391 while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
392 ++s;
393 --count;
394 }
395 c=U_SENTINEL;
396 }
397 *ps=s;
398 return c;
399 }
400
401 U_CAPI UChar* U_EXPORT2
402 u_strFromUTF8WithSub(UChar *dest,
403 int32_t destCapacity,
404 int32_t *pDestLength,
405 const char* src,
406 int32_t srcLength,
407 UChar32 subchar, int32_t *pNumSubstitutions,
408 UErrorCode *pErrorCode){
409 UChar *pDest = dest;
410 UChar *pDestLimit = dest+destCapacity;
411 UChar32 ch;
412 int32_t reqLength = 0;
413 const uint8_t* pSrc = (const uint8_t*) src;
414 uint8_t t1, t2; /* trail bytes */
415 int32_t numSubstitutions;
416
417 /* args check */
418 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
419 return NULL;
420 }
421
422 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
423 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
424 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
425 ) {
426 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
427 return NULL;
428 }
429
430 if(pNumSubstitutions!=NULL) {
431 *pNumSubstitutions=0;
432 }
433 numSubstitutions=0;
434
435 /*
436 * Inline processing of UTF-8 byte sequences:
437 *
438 * Byte sequences for the most common characters are handled inline in
439 * the conversion loops. In order to reduce the path lengths for those
440 * characters, the tests are arranged in a kind of binary search.
441 * ASCII (<=0x7f) is checked first, followed by the dividing point
442 * between 2- and 3-byte sequences (0xe0).
443 * The 3-byte branch is tested first to speed up CJK text.
444 * The compiler should combine the subtractions for the two tests for 0xe0.
445 * Each branch then tests for the other end of its range.
446 */
447
448 if(srcLength < 0){
449 /*
450 * Transform a NUL-terminated string.
451 * The code explicitly checks for NULs only in the lead byte position.
452 * A NUL byte in the trail byte position fails the trail byte range check anyway.
453 */
454 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
455 if(ch <= 0x7f){
456 *pDest++=(UChar)ch;
457 ++pSrc;
458 } else {
459 if(ch > 0xe0) {
460 if( /* handle U+1000..U+CFFF inline */
461 ch <= 0xec &&
462 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
463 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
464 ) {
465 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
466 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
467 pSrc += 3;
468 continue;
469 }
470 } else if(ch < 0xe0) {
471 if( /* handle U+0080..U+07FF inline */
472 ch >= 0xc2 &&
473 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
474 ) {
475 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
476 pSrc += 2;
477 continue;
478 }
479 }
480
481 /* function call for "complicated" and error cases */
482 ++pSrc; /* continue after the lead byte */
483 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
484 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
485 *pErrorCode = U_INVALID_CHAR_FOUND;
486 return NULL;
487 } else if(ch<=0xFFFF) {
488 *(pDest++)=(UChar)ch;
489 } else {
490 *(pDest++)=U16_LEAD(ch);
491 if(pDest<pDestLimit) {
492 *(pDest++)=U16_TRAIL(ch);
493 } else {
494 reqLength++;
495 break;
496 }
497 }
498 }
499 }
500
501 /* Pre-flight the rest of the string. */
502 while((ch = *pSrc) != 0) {
503 if(ch <= 0x7f){
504 ++reqLength;
505 ++pSrc;
506 } else {
507 if(ch > 0xe0) {
508 if( /* handle U+1000..U+CFFF inline */
509 ch <= 0xec &&
510 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
511 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
512 ) {
513 ++reqLength;
514 pSrc += 3;
515 continue;
516 }
517 } else if(ch < 0xe0) {
518 if( /* handle U+0080..U+07FF inline */
519 ch >= 0xc2 &&
520 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
521 ) {
522 ++reqLength;
523 pSrc += 2;
524 continue;
525 }
526 }
527
528 /* function call for "complicated" and error cases */
529 ++pSrc; /* continue after the lead byte */
530 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
531 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
532 *pErrorCode = U_INVALID_CHAR_FOUND;
533 return NULL;
534 }
535 reqLength += U16_LENGTH(ch);
536 }
537 }
538 } else /* srcLength >= 0 */ {
539 const uint8_t *pSrcLimit = pSrc + srcLength;
540 int32_t count;
541
542 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
543 for(;;) {
544 /*
545 * Each iteration of the inner loop progresses by at most 3 UTF-8
546 * bytes and one UChar, for most characters.
547 * For supplementary code points (4 & 2), which are rare,
548 * there is an additional adjustment.
549 */
550 count = (int32_t)(pDestLimit - pDest);
551 srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
552 if(count > srcLength) {
553 count = srcLength; /* min(remaining dest, remaining src/3) */
554 }
555 if(count < 3) {
556 /*
557 * Too much overhead if we get near the end of the string,
558 * continue with the next loop.
559 */
560 break;
561 }
562
563 do {
564 ch = *pSrc;
565 if(ch <= 0x7f){
566 *pDest++=(UChar)ch;
567 ++pSrc;
568 } else {
569 if(ch > 0xe0) {
570 if( /* handle U+1000..U+CFFF inline */
571 ch <= 0xec &&
572 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
573 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
574 ) {
575 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
576 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
577 pSrc += 3;
578 continue;
579 }
580 } else if(ch < 0xe0) {
581 if( /* handle U+0080..U+07FF inline */
582 ch >= 0xc2 &&
583 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
584 ) {
585 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
586 pSrc += 2;
587 continue;
588 }
589 }
590
591 if(ch >= 0xf0 || subchar > 0xffff) {
592 /*
593 * We may read up to six bytes and write up to two UChars,
594 * which we didn't account for with computing count,
595 * so we adjust it here.
596 */
597 if(--count == 0) {
598 break;
599 }
600 }
601
602 /* function call for "complicated" and error cases */
603 ++pSrc; /* continue after the lead byte */
604 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
605 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
606 *pErrorCode = U_INVALID_CHAR_FOUND;
607 return NULL;
608 }else if(ch<=0xFFFF){
609 *(pDest++)=(UChar)ch;
610 }else{
611 *(pDest++)=U16_LEAD(ch);
612 *(pDest++)=U16_TRAIL(ch);
613 }
614 }
615 } while(--count > 0);
616 }
617
618 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
619 ch = *pSrc;
620 if(ch <= 0x7f){
621 *pDest++=(UChar)ch;
622 ++pSrc;
623 } else {
624 if(ch > 0xe0) {
625 if( /* handle U+1000..U+CFFF inline */
626 ch <= 0xec &&
627 ((pSrcLimit - pSrc) >= 3) &&
628 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
629 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
630 ) {
631 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
632 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
633 pSrc += 3;
634 continue;
635 }
636 } else if(ch < 0xe0) {
637 if( /* handle U+0080..U+07FF inline */
638 ch >= 0xc2 &&
639 ((pSrcLimit - pSrc) >= 2) &&
640 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
641 ) {
642 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
643 pSrc += 2;
644 continue;
645 }
646 }
647
648 /* function call for "complicated" and error cases */
649 ++pSrc; /* continue after the lead byte */
650 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
651 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
652 *pErrorCode = U_INVALID_CHAR_FOUND;
653 return NULL;
654 }else if(ch<=0xFFFF){
655 *(pDest++)=(UChar)ch;
656 }else{
657 *(pDest++)=U16_LEAD(ch);
658 if(pDest<pDestLimit){
659 *(pDest++)=U16_TRAIL(ch);
660 }else{
661 reqLength++;
662 break;
663 }
664 }
665 }
666 }
667 /* do not fill the dest buffer just count the UChars needed */
668 while(pSrc < pSrcLimit){
669 ch = *pSrc;
670 if(ch <= 0x7f){
671 reqLength++;
672 ++pSrc;
673 } else {
674 if(ch > 0xe0) {
675 if( /* handle U+1000..U+CFFF inline */
676 ch <= 0xec &&
677 ((pSrcLimit - pSrc) >= 3) &&
678 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
679 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
680 ) {
681 reqLength++;
682 pSrc += 3;
683 continue;
684 }
685 } else if(ch < 0xe0) {
686 if( /* handle U+0080..U+07FF inline */
687 ch >= 0xc2 &&
688 ((pSrcLimit - pSrc) >= 2) &&
689 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
690 ) {
691 reqLength++;
692 pSrc += 2;
693 continue;
694 }
695 }
696
697 /* function call for "complicated" and error cases */
698 ++pSrc; /* continue after the lead byte */
699 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
700 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
701 *pErrorCode = U_INVALID_CHAR_FOUND;
702 return NULL;
703 }
704 reqLength+=U16_LENGTH(ch);
705 }
706 }
707 }
708
709 reqLength+=(int32_t)(pDest - dest);
710
711 if(pNumSubstitutions!=NULL) {
712 *pNumSubstitutions=numSubstitutions;
713 }
714
715 if(pDestLength){
716 *pDestLength = reqLength;
717 }
718
719 /* Terminate the buffer */
720 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
721
722 return dest;
723 }
724
725 U_CAPI UChar* U_EXPORT2
726 u_strFromUTF8(UChar *dest,
727 int32_t destCapacity,
728 int32_t *pDestLength,
729 const char* src,
730 int32_t srcLength,
731 UErrorCode *pErrorCode){
732 return u_strFromUTF8WithSub(
733 dest, destCapacity, pDestLength,
734 src, srcLength,
735 U_SENTINEL, NULL,
736 pErrorCode);
737 }
738
739 U_CAPI UChar * U_EXPORT2
740 u_strFromUTF8Lenient(UChar *dest,
741 int32_t destCapacity,
742 int32_t *pDestLength,
743 const char *src,
744 int32_t srcLength,
745 UErrorCode *pErrorCode) {
746 UChar *pDest = dest;
747 UChar32 ch;
748 int32_t reqLength = 0;
749 uint8_t* pSrc = (uint8_t*) src;
750
751 /* args check */
752 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
753 return NULL;
754 }
755
756 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
757 (destCapacity<0) || (dest == NULL && destCapacity > 0)
758 ) {
759 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
760 return NULL;
761 }
762
763 if(srcLength < 0) {
764 /* Transform a NUL-terminated string. */
765 UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
766 uint8_t t1, t2, t3; /* trail bytes */
767
768 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
769 if(ch < 0xc0) {
770 /*
771 * ASCII, or a trail byte in lead position which is treated like
772 * a single-byte sequence for better character boundary
773 * resynchronization after illegal sequences.
774 */
775 *pDest++=(UChar)ch;
776 ++pSrc;
777 continue;
778 } else if(ch < 0xe0) { /* U+0080..U+07FF */
779 if((t1 = pSrc[1]) != 0) {
780 /* 0x3080 = (0xc0 << 6) + 0x80 */
781 *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
782 pSrc += 2;
783 continue;
784 }
785 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
786 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
787 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
788 /* 0x2080 = (0x80 << 6) + 0x80 */
789 *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
790 pSrc += 3;
791 continue;
792 }
793 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
794 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
795 pSrc += 4;
796 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
797 ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
798 *(pDest++) = U16_LEAD(ch);
799 if(pDest < pDestLimit) {
800 *(pDest++) = U16_TRAIL(ch);
801 } else {
802 reqLength = 1;
803 break;
804 }
805 continue;
806 }
807 }
808
809 /* truncated character at the end */
810 *pDest++ = 0xfffd;
811 while(*++pSrc != 0) {}
812 break;
813 }
814
815 /* Pre-flight the rest of the string. */
816 while((ch = *pSrc) != 0) {
817 if(ch < 0xc0) {
818 /*
819 * ASCII, or a trail byte in lead position which is treated like
820 * a single-byte sequence for better character boundary
821 * resynchronization after illegal sequences.
822 */
823 ++reqLength;
824 ++pSrc;
825 continue;
826 } else if(ch < 0xe0) { /* U+0080..U+07FF */
827 if(pSrc[1] != 0) {
828 ++reqLength;
829 pSrc += 2;
830 continue;
831 }
832 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
833 if(pSrc[1] != 0 && pSrc[2] != 0) {
834 ++reqLength;
835 pSrc += 3;
836 continue;
837 }
838 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
839 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
840 reqLength += 2;
841 pSrc += 4;
842 continue;
843 }
844 }
845
846 /* truncated character at the end */
847 ++reqLength;
848 break;
849 }
850 } else /* srcLength >= 0 */ {
851 const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
852
853 /*
854 * This function requires that if srcLength is given, then it must be
855 * destCapatity >= srcLength so that we need not check for
856 * destination buffer overflow in the loop.
857 */
858 if(destCapacity < srcLength) {
859 if(pDestLength != NULL) {
860 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
861 }
862 *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
863 return NULL;
864 }
865
866 if((pSrcLimit - pSrc) >= 4) {
867 pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
868
869 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
870 do {
871 ch = *pSrc++;
872 if(ch < 0xc0) {
873 /*
874 * ASCII, or a trail byte in lead position which is treated like
875 * a single-byte sequence for better character boundary
876 * resynchronization after illegal sequences.
877 */
878 *pDest++=(UChar)ch;
879 } else if(ch < 0xe0) { /* U+0080..U+07FF */
880 /* 0x3080 = (0xc0 << 6) + 0x80 */
881 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
882 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
883 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
884 /* 0x2080 = (0x80 << 6) + 0x80 */
885 ch = (ch << 12) + (*pSrc++ << 6);
886 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
887 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
888 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
889 ch = (ch << 18) + (*pSrc++ << 12);
890 ch += *pSrc++ << 6;
891 ch += *pSrc++ - 0x3c82080;
892 *(pDest++) = U16_LEAD(ch);
893 *(pDest++) = U16_TRAIL(ch);
894 }
895 } while(pSrc < pSrcLimit);
896
897 pSrcLimit += 3; /* restore original pSrcLimit */
898 }
899
900 while(pSrc < pSrcLimit) {
901 ch = *pSrc++;
902 if(ch < 0xc0) {
903 /*
904 * ASCII, or a trail byte in lead position which is treated like
905 * a single-byte sequence for better character boundary
906 * resynchronization after illegal sequences.
907 */
908 *pDest++=(UChar)ch;
909 continue;
910 } else if(ch < 0xe0) { /* U+0080..U+07FF */
911 if(pSrc < pSrcLimit) {
912 /* 0x3080 = (0xc0 << 6) + 0x80 */
913 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
914 continue;
915 }
916 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
917 if((pSrcLimit - pSrc) >= 2) {
918 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
919 /* 0x2080 = (0x80 << 6) + 0x80 */
920 ch = (ch << 12) + (*pSrc++ << 6);
921 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
922 pSrc += 3;
923 continue;
924 }
925 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
926 if((pSrcLimit - pSrc) >= 3) {
927 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
928 ch = (ch << 18) + (*pSrc++ << 12);
929 ch += *pSrc++ << 6;
930 ch += *pSrc++ - 0x3c82080;
931 *(pDest++) = U16_LEAD(ch);
932 *(pDest++) = U16_TRAIL(ch);
933 pSrc += 4;
934 continue;
935 }
936 }
937
938 /* truncated character at the end */
939 *pDest++ = 0xfffd;
940 break;
941 }
942 }
943
944 reqLength+=(int32_t)(pDest - dest);
945
946 if(pDestLength){
947 *pDestLength = reqLength;
948 }
949
950 /* Terminate the buffer */
951 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
952
953 return dest;
954 }
955
956 static inline uint8_t *
957 _appendUTF8(uint8_t *pDest, UChar32 c) {
958 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
959 if((c)<=0x7f) {
960 *pDest++=(uint8_t)c;
961 } else if(c<=0x7ff) {
962 *pDest++=(uint8_t)((c>>6)|0xc0);
963 *pDest++=(uint8_t)((c&0x3f)|0x80);
964 } else if(c<=0xffff) {
965 *pDest++=(uint8_t)((c>>12)|0xe0);
966 *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
967 *pDest++=(uint8_t)(((c)&0x3f)|0x80);
968 } else /* if((uint32_t)(c)<=0x10ffff) */ {
969 *pDest++=(uint8_t)(((c)>>18)|0xf0);
970 *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
971 *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
972 *pDest++=(uint8_t)(((c)&0x3f)|0x80);
973 }
974 return pDest;
975 }
976
977
978 U_CAPI char* U_EXPORT2
979 u_strToUTF8WithSub(char *dest,
980 int32_t destCapacity,
981 int32_t *pDestLength,
982 const UChar *pSrc,
983 int32_t srcLength,
984 UChar32 subchar, int32_t *pNumSubstitutions,
985 UErrorCode *pErrorCode){
986 int32_t reqLength=0;
987 uint32_t ch=0,ch2=0;
988 uint8_t *pDest = (uint8_t *)dest;
989 uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
990 int32_t numSubstitutions;
991
992 /* args check */
993 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
994 return NULL;
995 }
996
997 if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
998 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
999 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1000 ) {
1001 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1002 return NULL;
1003 }
1004
1005 if(pNumSubstitutions!=NULL) {
1006 *pNumSubstitutions=0;
1007 }
1008 numSubstitutions=0;
1009
1010 if(srcLength==-1) {
1011 while((ch=*pSrc)!=0) {
1012 ++pSrc;
1013 if(ch <= 0x7f) {
1014 if(pDest<pDestLimit) {
1015 *pDest++ = (uint8_t)ch;
1016 } else {
1017 reqLength = 1;
1018 break;
1019 }
1020 } else if(ch <= 0x7ff) {
1021 if((pDestLimit - pDest) >= 2) {
1022 *pDest++=(uint8_t)((ch>>6)|0xc0);
1023 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1024 } else {
1025 reqLength = 2;
1026 break;
1027 }
1028 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1029 if((pDestLimit - pDest) >= 3) {
1030 *pDest++=(uint8_t)((ch>>12)|0xe0);
1031 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1032 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1033 } else {
1034 reqLength = 3;
1035 break;
1036 }
1037 } else /* ch is a surrogate */ {
1038 int32_t length;
1039
1040 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
1041 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1042 ++pSrc;
1043 ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1044 } else if(subchar>=0) {
1045 ch=subchar;
1046 ++numSubstitutions;
1047 } else {
1048 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1049 *pErrorCode = U_INVALID_CHAR_FOUND;
1050 return NULL;
1051 }
1052
1053 length = U8_LENGTH(ch);
1054 if((pDestLimit - pDest) >= length) {
1055 /* convert and append*/
1056 pDest=_appendUTF8(pDest, ch);
1057 } else {
1058 reqLength = length;
1059 break;
1060 }
1061 }
1062 }
1063 while((ch=*pSrc++)!=0) {
1064 if(ch<=0x7f) {
1065 ++reqLength;
1066 } else if(ch<=0x7ff) {
1067 reqLength+=2;
1068 } else if(!U16_IS_SURROGATE(ch)) {
1069 reqLength+=3;
1070 } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1071 ++pSrc;
1072 reqLength+=4;
1073 } else if(subchar>=0) {
1074 reqLength+=U8_LENGTH(subchar);
1075 ++numSubstitutions;
1076 } else {
1077 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1078 *pErrorCode = U_INVALID_CHAR_FOUND;
1079 return NULL;
1080 }
1081 }
1082 } else {
1083 const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
1084 int32_t count;
1085
1086 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1087 for(;;) {
1088 /*
1089 * Each iteration of the inner loop progresses by at most 3 UTF-8
1090 * bytes and one UChar, for most characters.
1091 * For supplementary code points (4 & 2), which are rare,
1092 * there is an additional adjustment.
1093 */
1094 count = (int32_t)((pDestLimit - pDest) / 3);
1095 srcLength = (int32_t)(pSrcLimit - pSrc);
1096 if(count > srcLength) {
1097 count = srcLength; /* min(remaining dest/3, remaining src) */
1098 }
1099 if(count < 3) {
1100 /*
1101 * Too much overhead if we get near the end of the string,
1102 * continue with the next loop.
1103 */
1104 break;
1105 }
1106 do {
1107 ch=*pSrc++;
1108 if(ch <= 0x7f) {
1109 *pDest++ = (uint8_t)ch;
1110 } else if(ch <= 0x7ff) {
1111 *pDest++=(uint8_t)((ch>>6)|0xc0);
1112 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1113 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1114 *pDest++=(uint8_t)((ch>>12)|0xe0);
1115 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1116 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1117 } else /* ch is a surrogate */ {
1118 /*
1119 * We will read two UChars and probably output four bytes,
1120 * which we didn't account for with computing count,
1121 * so we adjust it here.
1122 */
1123 if(--count == 0) {
1124 --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
1125 break; /* recompute count */
1126 }
1127
1128 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1129 ++pSrc;
1130 ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1131
1132 /* writing 4 bytes per 2 UChars is ok */
1133 *pDest++=(uint8_t)((ch>>18)|0xf0);
1134 *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
1135 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1136 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1137 } else {
1138 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1139 if(subchar>=0) {
1140 ch=subchar;
1141 ++numSubstitutions;
1142 } else {
1143 *pErrorCode = U_INVALID_CHAR_FOUND;
1144 return NULL;
1145 }
1146
1147 /* convert and append*/
1148 pDest=_appendUTF8(pDest, ch);
1149 }
1150 }
1151 } while(--count > 0);
1152 }
1153
1154 while(pSrc<pSrcLimit) {
1155 ch=*pSrc++;
1156 if(ch <= 0x7f) {
1157 if(pDest<pDestLimit) {
1158 *pDest++ = (uint8_t)ch;
1159 } else {
1160 reqLength = 1;
1161 break;
1162 }
1163 } else if(ch <= 0x7ff) {
1164 if((pDestLimit - pDest) >= 2) {
1165 *pDest++=(uint8_t)((ch>>6)|0xc0);
1166 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1167 } else {
1168 reqLength = 2;
1169 break;
1170 }
1171 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1172 if((pDestLimit - pDest) >= 3) {
1173 *pDest++=(uint8_t)((ch>>12)|0xe0);
1174 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1175 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1176 } else {
1177 reqLength = 3;
1178 break;
1179 }
1180 } else /* ch is a surrogate */ {
1181 int32_t length;
1182
1183 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1184 ++pSrc;
1185 ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1186 } else if(subchar>=0) {
1187 ch=subchar;
1188 ++numSubstitutions;
1189 } else {
1190 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1191 *pErrorCode = U_INVALID_CHAR_FOUND;
1192 return NULL;
1193 }
1194
1195 length = U8_LENGTH(ch);
1196 if((pDestLimit - pDest) >= length) {
1197 /* convert and append*/
1198 pDest=_appendUTF8(pDest, ch);
1199 } else {
1200 reqLength = length;
1201 break;
1202 }
1203 }
1204 }
1205 while(pSrc<pSrcLimit) {
1206 ch=*pSrc++;
1207 if(ch<=0x7f) {
1208 ++reqLength;
1209 } else if(ch<=0x7ff) {
1210 reqLength+=2;
1211 } else if(!U16_IS_SURROGATE(ch)) {
1212 reqLength+=3;
1213 } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1214 ++pSrc;
1215 reqLength+=4;
1216 } else if(subchar>=0) {
1217 reqLength+=U8_LENGTH(subchar);
1218 ++numSubstitutions;
1219 } else {
1220 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1221 *pErrorCode = U_INVALID_CHAR_FOUND;
1222 return NULL;
1223 }
1224 }
1225 }
1226
1227 reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1228
1229 if(pNumSubstitutions!=NULL) {
1230 *pNumSubstitutions=numSubstitutions;
1231 }
1232
1233 if(pDestLength){
1234 *pDestLength = reqLength;
1235 }
1236
1237 /* Terminate the buffer */
1238 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1239 return dest;
1240 }
1241
1242 U_CAPI char* U_EXPORT2
1243 u_strToUTF8(char *dest,
1244 int32_t destCapacity,
1245 int32_t *pDestLength,
1246 const UChar *pSrc,
1247 int32_t srcLength,
1248 UErrorCode *pErrorCode){
1249 return u_strToUTF8WithSub(
1250 dest, destCapacity, pDestLength,
1251 pSrc, srcLength,
1252 U_SENTINEL, NULL,
1253 pErrorCode);
1254 }
1255
1256 U_CAPI UChar* U_EXPORT2
1257 u_strFromJavaModifiedUTF8WithSub(
1258 UChar *dest,
1259 int32_t destCapacity,
1260 int32_t *pDestLength,
1261 const char *src,
1262 int32_t srcLength,
1263 UChar32 subchar, int32_t *pNumSubstitutions,
1264 UErrorCode *pErrorCode) {
1265 UChar *pDest = dest;
1266 UChar *pDestLimit = dest+destCapacity;
1267 UChar32 ch;
1268 int32_t reqLength = 0;
1269 const uint8_t* pSrc = (const uint8_t*) src;
1270 const uint8_t *pSrcLimit;
1271 int32_t count;
1272 uint8_t t1, t2; /* trail bytes */
1273 int32_t numSubstitutions;
1274
1275 /* args check */
1276 if(U_FAILURE(*pErrorCode)){
1277 return NULL;
1278 }
1279 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1280 (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1281 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1282 ) {
1283 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1284 return NULL;
1285 }
1286
1287 if(pNumSubstitutions!=NULL) {
1288 *pNumSubstitutions=0;
1289 }
1290 numSubstitutions=0;
1291
1292 if(srcLength < 0) {
1293 /*
1294 * Transform a NUL-terminated ASCII string.
1295 * Handle non-ASCII strings with slower code.
1296 */
1297 while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
1298 *pDest++=(UChar)ch;
1299 ++pSrc;
1300 }
1301 if(ch == 0) {
1302 reqLength=(int32_t)(pDest - dest);
1303 if(pDestLength) {
1304 *pDestLength = reqLength;
1305 }
1306
1307 /* Terminate the buffer */
1308 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1309 return dest;
1310 }
1311 srcLength = uprv_strlen((const char *)pSrc);
1312 }
1313
1314 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1315 pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
1316 for(;;) {
1317 count = (int32_t)(pDestLimit - pDest);
1318 srcLength = (int32_t)(pSrcLimit - pSrc);
1319 if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
1320 /* fast ASCII loop */
1321 const uint8_t *prevSrc = pSrc;
1322 int32_t delta;
1323 while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
1324 *pDest++=(UChar)ch;
1325 ++pSrc;
1326 }
1327 delta = (int32_t)(pSrc - prevSrc);
1328 count -= delta;
1329 srcLength -= delta;
1330 }
1331 /*
1332 * Each iteration of the inner loop progresses by at most 3 UTF-8
1333 * bytes and one UChar.
1334 */
1335 srcLength /= 3;
1336 if(count > srcLength) {
1337 count = srcLength; /* min(remaining dest, remaining src/3) */
1338 }
1339 if(count < 3) {
1340 /*
1341 * Too much overhead if we get near the end of the string,
1342 * continue with the next loop.
1343 */
1344 break;
1345 }
1346 do {
1347 ch = *pSrc;
1348 if(ch <= 0x7f){
1349 *pDest++=(UChar)ch;
1350 ++pSrc;
1351 } else {
1352 if(ch >= 0xe0) {
1353 if( /* handle U+0000..U+FFFF inline */
1354 ch <= 0xef &&
1355 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1356 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1357 ) {
1358 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1359 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1360 pSrc += 3;
1361 continue;
1362 }
1363 } else {
1364 if( /* handle U+0000..U+07FF inline */
1365 ch >= 0xc0 &&
1366 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1367 ) {
1368 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1369 pSrc += 2;
1370 continue;
1371 }
1372 }
1373
1374 if(subchar < 0) {
1375 *pErrorCode = U_INVALID_CHAR_FOUND;
1376 return NULL;
1377 } else if(subchar > 0xffff && --count == 0) {
1378 /*
1379 * We need to write two UChars, adjusted count for that,
1380 * and ran out of space.
1381 */
1382 break;
1383 } else {
1384 /* function call for error cases */
1385 ++pSrc; /* continue after the lead byte */
1386 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1387 ++numSubstitutions;
1388 if(subchar<=0xFFFF) {
1389 *(pDest++)=(UChar)subchar;
1390 } else {
1391 *(pDest++)=U16_LEAD(subchar);
1392 *(pDest++)=U16_TRAIL(subchar);
1393 }
1394 }
1395 }
1396 } while(--count > 0);
1397 }
1398
1399 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
1400 ch = *pSrc;
1401 if(ch <= 0x7f){
1402 *pDest++=(UChar)ch;
1403 ++pSrc;
1404 } else {
1405 if(ch >= 0xe0) {
1406 if( /* handle U+0000..U+FFFF inline */
1407 ch <= 0xef &&
1408 ((pSrcLimit - pSrc) >= 3) &&
1409 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1410 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1411 ) {
1412 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1413 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1414 pSrc += 3;
1415 continue;
1416 }
1417 } else {
1418 if( /* handle U+0000..U+07FF inline */
1419 ch >= 0xc0 &&
1420 ((pSrcLimit - pSrc) >= 2) &&
1421 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1422 ) {
1423 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1424 pSrc += 2;
1425 continue;
1426 }
1427 }
1428
1429 if(subchar < 0) {
1430 *pErrorCode = U_INVALID_CHAR_FOUND;
1431 return NULL;
1432 } else {
1433 /* function call for error cases */
1434 ++pSrc; /* continue after the lead byte */
1435 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1436 ++numSubstitutions;
1437 if(subchar<=0xFFFF) {
1438 *(pDest++)=(UChar)subchar;
1439 } else {
1440 *(pDest++)=U16_LEAD(subchar);
1441 if(pDest<pDestLimit) {
1442 *(pDest++)=U16_TRAIL(subchar);
1443 } else {
1444 reqLength++;
1445 break;
1446 }
1447 }
1448 }
1449 }
1450 }
1451
1452 /* do not fill the dest buffer just count the UChars needed */
1453 while(pSrc < pSrcLimit){
1454 ch = *pSrc;
1455 if(ch <= 0x7f) {
1456 reqLength++;
1457 ++pSrc;
1458 } else {
1459 if(ch >= 0xe0) {
1460 if( /* handle U+0000..U+FFFF inline */
1461 ch <= 0xef &&
1462 ((pSrcLimit - pSrc) >= 3) &&
1463 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
1464 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
1465 ) {
1466 reqLength++;
1467 pSrc += 3;
1468 continue;
1469 }
1470 } else {
1471 if( /* handle U+0000..U+07FF inline */
1472 ch >= 0xc0 &&
1473 ((pSrcLimit - pSrc) >= 2) &&
1474 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
1475 ) {
1476 reqLength++;
1477 pSrc += 2;
1478 continue;
1479 }
1480 }
1481
1482 if(subchar < 0) {
1483 *pErrorCode = U_INVALID_CHAR_FOUND;
1484 return NULL;
1485 } else {
1486 /* function call for error cases */
1487 ++pSrc; /* continue after the lead byte */
1488 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1489 ++numSubstitutions;
1490 reqLength+=U16_LENGTH(ch);
1491 }
1492 }
1493 }
1494
1495 if(pNumSubstitutions!=NULL) {
1496 *pNumSubstitutions=numSubstitutions;
1497 }
1498
1499 reqLength+=(int32_t)(pDest - dest);
1500 if(pDestLength) {
1501 *pDestLength = reqLength;
1502 }
1503
1504 /* Terminate the buffer */
1505 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1506 return dest;
1507 }
1508
1509 U_CAPI char* U_EXPORT2
1510 u_strToJavaModifiedUTF8(
1511 char *dest,
1512 int32_t destCapacity,
1513 int32_t *pDestLength,
1514 const UChar *src,
1515 int32_t srcLength,
1516 UErrorCode *pErrorCode) {
1517 int32_t reqLength=0;
1518 uint32_t ch=0;
1519 uint8_t *pDest = (uint8_t *)dest;
1520 uint8_t *pDestLimit = pDest + destCapacity;
1521 const UChar *pSrcLimit;
1522 int32_t count;
1523
1524 /* args check */
1525 if(U_FAILURE(*pErrorCode)){
1526 return NULL;
1527 }
1528 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1529 (dest==NULL && destCapacity!=0) || destCapacity<0
1530 ) {
1531 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1532 return NULL;
1533 }
1534
1535 if(srcLength==-1) {
1536 /* Convert NUL-terminated ASCII, then find the string length. */
1537 while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1538 *pDest++ = (uint8_t)ch;
1539 ++src;
1540 }
1541 if(ch == 0) {
1542 reqLength=(int32_t)(pDest - (uint8_t *)dest);
1543 if(pDestLength) {
1544 *pDestLength = reqLength;
1545 }
1546
1547 /* Terminate the buffer */
1548 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1549 return dest;
1550 }
1551 srcLength = u_strlen(src);
1552 }
1553
1554 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1555 pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
1556 for(;;) {
1557 count = (int32_t)(pDestLimit - pDest);
1558 srcLength = (int32_t)(pSrcLimit - src);
1559 if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1560 /* fast ASCII loop */
1561 const UChar *prevSrc = src;
1562 int32_t delta;
1563 while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1564 *pDest++=(uint8_t)ch;
1565 ++src;
1566 }
1567 delta = (int32_t)(src - prevSrc);
1568 count -= delta;
1569 srcLength -= delta;
1570 }
1571 /*
1572 * Each iteration of the inner loop progresses by at most 3 UTF-8
1573 * bytes and one UChar.
1574 */
1575 count /= 3;
1576 if(count > srcLength) {
1577 count = srcLength; /* min(remaining dest/3, remaining src) */
1578 }
1579 if(count < 3) {
1580 /*
1581 * Too much overhead if we get near the end of the string,
1582 * continue with the next loop.
1583 */
1584 break;
1585 }
1586 do {
1587 ch=*src++;
1588 if(ch <= 0x7f && ch != 0) {
1589 *pDest++ = (uint8_t)ch;
1590 } else if(ch <= 0x7ff) {
1591 *pDest++=(uint8_t)((ch>>6)|0xc0);
1592 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1593 } else {
1594 *pDest++=(uint8_t)((ch>>12)|0xe0);
1595 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1596 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1597 }
1598 } while(--count > 0);
1599 }
1600
1601 while(src<pSrcLimit) {
1602 ch=*src++;
1603 if(ch <= 0x7f && ch != 0) {
1604 if(pDest<pDestLimit) {
1605 *pDest++ = (uint8_t)ch;
1606 } else {
1607 reqLength = 1;
1608 break;
1609 }
1610 } else if(ch <= 0x7ff) {
1611 if((pDestLimit - pDest) >= 2) {
1612 *pDest++=(uint8_t)((ch>>6)|0xc0);
1613 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1614 } else {
1615 reqLength = 2;
1616 break;
1617 }
1618 } else {
1619 if((pDestLimit - pDest) >= 3) {
1620 *pDest++=(uint8_t)((ch>>12)|0xe0);
1621 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1622 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1623 } else {
1624 reqLength = 3;
1625 break;
1626 }
1627 }
1628 }
1629 while(src<pSrcLimit) {
1630 ch=*src++;
1631 if(ch <= 0x7f && ch != 0) {
1632 ++reqLength;
1633 } else if(ch<=0x7ff) {
1634 reqLength+=2;
1635 } else {
1636 reqLength+=3;
1637 }
1638 }
1639
1640 reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1641 if(pDestLength){
1642 *pDestLength = reqLength;
1643 }
1644
1645 /* Terminate the buffer */
1646 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1647 return dest;
1648 }