]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ustrtrns.cpp
ICU-491.11.3.tar.gz
[apple/icu.git] / icuSources / common / ustrtrns.cpp
1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2001-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 *
9 * File ustrtrns.cpp
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 9/10/2001 Ram Creation.
15 ******************************************************************************
16 */
17
18 /*******************************************************************************
19 *
20 * u_strTo* and u_strFrom* APIs
21 * WCS functions moved to ustr_wcs.c for better modularization
22 *
23 *******************************************************************************
24 */
25
26
27 #include "unicode/putil.h"
28 #include "unicode/ustring.h"
29 #include "unicode/utf.h"
30 #include "unicode/utf8.h"
31 #include "unicode/utf16.h"
32 #include "cstring.h"
33 #include "cmemory.h"
34 #include "ustr_imp.h"
35 #include "uassert.h"
36
37 U_CAPI UChar* U_EXPORT2
38 u_strFromUTF32WithSub(UChar *dest,
39 int32_t destCapacity,
40 int32_t *pDestLength,
41 const UChar32 *src,
42 int32_t srcLength,
43 UChar32 subchar, int32_t *pNumSubstitutions,
44 UErrorCode *pErrorCode) {
45 const UChar32 *srcLimit;
46 UChar32 ch;
47 UChar *destLimit;
48 UChar *pDest;
49 int32_t reqLength;
50 int32_t numSubstitutions;
51
52 /* args check */
53 if(U_FAILURE(*pErrorCode)){
54 return NULL;
55 }
56 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
57 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
58 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
59 ) {
60 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
61 return NULL;
62 }
63
64 if(pNumSubstitutions != NULL) {
65 *pNumSubstitutions = 0;
66 }
67
68 pDest = dest;
69 destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
70 reqLength = 0;
71 numSubstitutions = 0;
72
73 if(srcLength < 0) {
74 /* simple loop for conversion of a NUL-terminated BMP string */
75 while((ch=*src) != 0 &&
76 ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
77 ++src;
78 if(pDest < destLimit) {
79 *pDest++ = (UChar)ch;
80 } else {
81 ++reqLength;
82 }
83 }
84 srcLimit = src;
85 if(ch != 0) {
86 /* "complicated" case, find the end of the remaining string */
87 while(*++srcLimit != 0) {}
88 }
89 } else {
90 srcLimit = (src!=NULL)?(src + srcLength):NULL;
91 }
92
93 /* convert with length */
94 while(src < srcLimit) {
95 ch = *src++;
96 do {
97 /* usually "loops" once; twice only for writing subchar */
98 if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
99 if(pDest < destLimit) {
100 *pDest++ = (UChar)ch;
101 } else {
102 ++reqLength;
103 }
104 break;
105 } else if(0x10000 <= ch && ch <= 0x10ffff) {
106 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
107 *pDest++ = U16_LEAD(ch);
108 *pDest++ = U16_TRAIL(ch);
109 } else {
110 reqLength += 2;
111 }
112 break;
113 } else if((ch = subchar) < 0) {
114 /* surrogate code point, or not a Unicode code point at all */
115 *pErrorCode = U_INVALID_CHAR_FOUND;
116 return NULL;
117 } else {
118 ++numSubstitutions;
119 }
120 } while(TRUE);
121 }
122
123 reqLength += (int32_t)(pDest - dest);
124 if(pDestLength) {
125 *pDestLength = reqLength;
126 }
127 if(pNumSubstitutions != NULL) {
128 *pNumSubstitutions = numSubstitutions;
129 }
130
131 /* Terminate the buffer */
132 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
133
134 return dest;
135 }
136
137 U_CAPI UChar* U_EXPORT2
138 u_strFromUTF32(UChar *dest,
139 int32_t destCapacity,
140 int32_t *pDestLength,
141 const UChar32 *src,
142 int32_t srcLength,
143 UErrorCode *pErrorCode) {
144 return u_strFromUTF32WithSub(
145 dest, destCapacity, pDestLength,
146 src, srcLength,
147 U_SENTINEL, NULL,
148 pErrorCode);
149 }
150
151 U_CAPI UChar32* U_EXPORT2
152 u_strToUTF32WithSub(UChar32 *dest,
153 int32_t destCapacity,
154 int32_t *pDestLength,
155 const UChar *src,
156 int32_t srcLength,
157 UChar32 subchar, int32_t *pNumSubstitutions,
158 UErrorCode *pErrorCode) {
159 const UChar *srcLimit;
160 UChar32 ch;
161 UChar ch2;
162 UChar32 *destLimit;
163 UChar32 *pDest;
164 int32_t reqLength;
165 int32_t numSubstitutions;
166
167 /* args check */
168 if(U_FAILURE(*pErrorCode)){
169 return NULL;
170 }
171 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
172 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
173 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
174 ) {
175 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
176 return NULL;
177 }
178
179 if(pNumSubstitutions != NULL) {
180 *pNumSubstitutions = 0;
181 }
182
183 pDest = dest;
184 destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
185 reqLength = 0;
186 numSubstitutions = 0;
187
188 if(srcLength < 0) {
189 /* simple loop for conversion of a NUL-terminated BMP string */
190 while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
191 ++src;
192 if(pDest < destLimit) {
193 *pDest++ = ch;
194 } else {
195 ++reqLength;
196 }
197 }
198 srcLimit = src;
199 if(ch != 0) {
200 /* "complicated" case, find the end of the remaining string */
201 while(*++srcLimit != 0) {}
202 }
203 } else {
204 srcLimit = (src!=NULL)?(src + srcLength):NULL;
205 }
206
207 /* convert with length */
208 while(src < srcLimit) {
209 ch = *src++;
210 if(!U16_IS_SURROGATE(ch)) {
211 /* write or count ch below */
212 } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
213 ++src;
214 ch = U16_GET_SUPPLEMENTARY(ch, ch2);
215 } else if((ch = subchar) < 0) {
216 /* unpaired surrogate */
217 *pErrorCode = U_INVALID_CHAR_FOUND;
218 return NULL;
219 } else {
220 ++numSubstitutions;
221 }
222 if(pDest < destLimit) {
223 *pDest++ = ch;
224 } else {
225 ++reqLength;
226 }
227 }
228
229 reqLength += (int32_t)(pDest - dest);
230 if(pDestLength) {
231 *pDestLength = reqLength;
232 }
233 if(pNumSubstitutions != NULL) {
234 *pNumSubstitutions = numSubstitutions;
235 }
236
237 /* Terminate the buffer */
238 u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
239
240 return dest;
241 }
242
243 U_CAPI UChar32* U_EXPORT2
244 u_strToUTF32(UChar32 *dest,
245 int32_t destCapacity,
246 int32_t *pDestLength,
247 const UChar *src,
248 int32_t srcLength,
249 UErrorCode *pErrorCode) {
250 return u_strToUTF32WithSub(
251 dest, destCapacity, pDestLength,
252 src, srcLength,
253 U_SENTINEL, NULL,
254 pErrorCode);
255 }
256
257 /* for utf8_nextCharSafeBodyTerminated() */
258 static const UChar32
259 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
260
261 /*
262 * Version of utf8_nextCharSafeBody() with the following differences:
263 * - checks for NUL termination instead of length
264 * - works with pointers instead of indexes
265 * - always strict (strict==-1)
266 *
267 * *ps points to after the lead byte and will be moved to after the last trail byte.
268 * c is the lead byte.
269 * @return the code point, or U_SENTINEL
270 */
271 static UChar32
272 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
273 const uint8_t *s=*ps;
274 uint8_t trail, illegal=0;
275 uint8_t count=U8_COUNT_TRAIL_BYTES(c);
276 U_ASSERT(count<6);
277 U8_MASK_LEAD_BYTE((c), count);
278 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
279 switch(count) {
280 /* each branch falls through to the next one */
281 case 5:
282 case 4:
283 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
284 illegal=1;
285 break;
286 case 3:
287 trail=(uint8_t)(*s++ - 0x80);
288 c=(c<<6)|trail;
289 if(trail>0x3f || c>=0x110) {
290 /* not a trail byte, or code point>0x10ffff (outside Unicode) */
291 illegal=1;
292 break;
293 }
294 case 2: /*fall through*/
295 trail=(uint8_t)(*s++ - 0x80);
296 if(trail>0x3f) {
297 /* not a trail byte */
298 illegal=1;
299 break;
300 }
301 c=(c<<6)|trail;
302 case 1: /*fall through*/
303 trail=(uint8_t)(*s++ - 0x80);
304 if(trail>0x3f) {
305 /* not a trail byte */
306 illegal=1;
307 }
308 c=(c<<6)|trail;
309 break;
310 case 0:
311 return U_SENTINEL;
312 /* no default branch to optimize switch() - all values are covered */
313 }
314
315 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
316 /* illegal is also set if count>=4 */
317 if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
318 /* error handling */
319 /* don't go beyond this sequence */
320 s=*ps;
321 while(count>0 && U8_IS_TRAIL(*s)) {
322 ++s;
323 --count;
324 }
325 c=U_SENTINEL;
326 }
327 *ps=s;
328 return c;
329 }
330
331 /*
332 * Version of utf8_nextCharSafeBody() with the following differences:
333 * - works with pointers instead of indexes
334 * - always strict (strict==-1)
335 *
336 * *ps points to after the lead byte and will be moved to after the last trail byte.
337 * c is the lead byte.
338 * @return the code point, or U_SENTINEL
339 */
340 static UChar32
341 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
342 const uint8_t *s=*ps;
343 uint8_t trail, illegal=0;
344 uint8_t count=U8_COUNT_TRAIL_BYTES(c);
345 if((limit-s)>=count) {
346 U8_MASK_LEAD_BYTE((c), count);
347 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
348 switch(count) {
349 /* each branch falls through to the next one */
350 case 5:
351 case 4:
352 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
353 illegal=1;
354 break;
355 case 3:
356 trail=*s++;
357 c=(c<<6)|(trail&0x3f);
358 if(c<0x110) {
359 illegal|=(trail&0xc0)^0x80;
360 } else {
361 /* code point>0x10ffff, outside Unicode */
362 illegal=1;
363 break;
364 }
365 case 2: /*fall through*/
366 trail=*s++;
367 c=(c<<6)|(trail&0x3f);
368 illegal|=(trail&0xc0)^0x80;
369 case 1: /*fall through*/
370 trail=*s++;
371 c=(c<<6)|(trail&0x3f);
372 illegal|=(trail&0xc0)^0x80;
373 break;
374 case 0:
375 return U_SENTINEL;
376 /* no default branch to optimize switch() - all values are covered */
377 }
378 } else {
379 illegal=1; /* too few bytes left */
380 }
381
382 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
383 /* illegal is also set if count>=4 */
384 U_ASSERT(count<sizeof(utf8_minLegal)/sizeof(utf8_minLegal[0]));
385 if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
386 /* error handling */
387 /* don't go beyond this sequence */
388 s=*ps;
389 while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
390 ++s;
391 --count;
392 }
393 c=U_SENTINEL;
394 }
395 *ps=s;
396 return c;
397 }
398
399 U_CAPI UChar* U_EXPORT2
400 u_strFromUTF8WithSub(UChar *dest,
401 int32_t destCapacity,
402 int32_t *pDestLength,
403 const char* src,
404 int32_t srcLength,
405 UChar32 subchar, int32_t *pNumSubstitutions,
406 UErrorCode *pErrorCode){
407 UChar *pDest = dest;
408 UChar *pDestLimit = dest+destCapacity;
409 UChar32 ch;
410 int32_t reqLength = 0;
411 const uint8_t* pSrc = (const uint8_t*) src;
412 uint8_t t1, t2; /* trail bytes */
413 int32_t numSubstitutions;
414
415 /* args check */
416 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
417 return NULL;
418 }
419
420 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
421 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
422 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
423 ) {
424 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
425 return NULL;
426 }
427
428 if(pNumSubstitutions!=NULL) {
429 *pNumSubstitutions=0;
430 }
431 numSubstitutions=0;
432
433 /*
434 * Inline processing of UTF-8 byte sequences:
435 *
436 * Byte sequences for the most common characters are handled inline in
437 * the conversion loops. In order to reduce the path lengths for those
438 * characters, the tests are arranged in a kind of binary search.
439 * ASCII (<=0x7f) is checked first, followed by the dividing point
440 * between 2- and 3-byte sequences (0xe0).
441 * The 3-byte branch is tested first to speed up CJK text.
442 * The compiler should combine the subtractions for the two tests for 0xe0.
443 * Each branch then tests for the other end of its range.
444 */
445
446 if(srcLength < 0){
447 /*
448 * Transform a NUL-terminated string.
449 * The code explicitly checks for NULs only in the lead byte position.
450 * A NUL byte in the trail byte position fails the trail byte range check anyway.
451 */
452 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
453 if(ch <= 0x7f){
454 *pDest++=(UChar)ch;
455 ++pSrc;
456 } else {
457 if(ch > 0xe0) {
458 if( /* handle U+1000..U+CFFF inline */
459 ch <= 0xec &&
460 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
461 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
462 ) {
463 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
464 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
465 pSrc += 3;
466 continue;
467 }
468 } else if(ch < 0xe0) {
469 if( /* handle U+0080..U+07FF inline */
470 ch >= 0xc2 &&
471 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
472 ) {
473 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
474 pSrc += 2;
475 continue;
476 }
477 }
478
479 /* function call for "complicated" and error cases */
480 ++pSrc; /* continue after the lead byte */
481 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
482 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
483 *pErrorCode = U_INVALID_CHAR_FOUND;
484 return NULL;
485 } else if(ch<=0xFFFF) {
486 *(pDest++)=(UChar)ch;
487 } else {
488 *(pDest++)=U16_LEAD(ch);
489 if(pDest<pDestLimit) {
490 *(pDest++)=U16_TRAIL(ch);
491 } else {
492 reqLength++;
493 break;
494 }
495 }
496 }
497 }
498
499 /* Pre-flight the rest of the string. */
500 while((ch = *pSrc) != 0) {
501 if(ch <= 0x7f){
502 ++reqLength;
503 ++pSrc;
504 } else {
505 if(ch > 0xe0) {
506 if( /* handle U+1000..U+CFFF inline */
507 ch <= 0xec &&
508 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
509 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
510 ) {
511 ++reqLength;
512 pSrc += 3;
513 continue;
514 }
515 } else if(ch < 0xe0) {
516 if( /* handle U+0080..U+07FF inline */
517 ch >= 0xc2 &&
518 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
519 ) {
520 ++reqLength;
521 pSrc += 2;
522 continue;
523 }
524 }
525
526 /* function call for "complicated" and error cases */
527 ++pSrc; /* continue after the lead byte */
528 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
529 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
530 *pErrorCode = U_INVALID_CHAR_FOUND;
531 return NULL;
532 }
533 reqLength += U16_LENGTH(ch);
534 }
535 }
536 } else /* srcLength >= 0 */ {
537 const uint8_t *pSrcLimit = pSrc + srcLength;
538 int32_t count;
539
540 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
541 for(;;) {
542 /*
543 * Each iteration of the inner loop progresses by at most 3 UTF-8
544 * bytes and one UChar, for most characters.
545 * For supplementary code points (4 & 2), which are rare,
546 * there is an additional adjustment.
547 */
548 count = (int32_t)(pDestLimit - pDest);
549 srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
550 if(count > srcLength) {
551 count = srcLength; /* min(remaining dest, remaining src/3) */
552 }
553 if(count < 3) {
554 /*
555 * Too much overhead if we get near the end of the string,
556 * continue with the next loop.
557 */
558 break;
559 }
560
561 do {
562 ch = *pSrc;
563 if(ch <= 0x7f){
564 *pDest++=(UChar)ch;
565 ++pSrc;
566 } else {
567 if(ch > 0xe0) {
568 if( /* handle U+1000..U+CFFF inline */
569 ch <= 0xec &&
570 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
571 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
572 ) {
573 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
574 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
575 pSrc += 3;
576 continue;
577 }
578 } else if(ch < 0xe0) {
579 if( /* handle U+0080..U+07FF inline */
580 ch >= 0xc2 &&
581 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
582 ) {
583 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
584 pSrc += 2;
585 continue;
586 }
587 }
588
589 if(ch >= 0xf0 || subchar > 0xffff) {
590 /*
591 * We may read up to six bytes and write up to two UChars,
592 * which we didn't account for with computing count,
593 * so we adjust it here.
594 */
595 if(--count == 0) {
596 break;
597 }
598 }
599
600 /* function call for "complicated" and error cases */
601 ++pSrc; /* continue after the lead byte */
602 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
603 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
604 *pErrorCode = U_INVALID_CHAR_FOUND;
605 return NULL;
606 }else if(ch<=0xFFFF){
607 *(pDest++)=(UChar)ch;
608 }else{
609 *(pDest++)=U16_LEAD(ch);
610 *(pDest++)=U16_TRAIL(ch);
611 }
612 }
613 } while(--count > 0);
614 }
615
616 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
617 ch = *pSrc;
618 if(ch <= 0x7f){
619 *pDest++=(UChar)ch;
620 ++pSrc;
621 } else {
622 if(ch > 0xe0) {
623 if( /* handle U+1000..U+CFFF inline */
624 ch <= 0xec &&
625 ((pSrcLimit - pSrc) >= 3) &&
626 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
627 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
628 ) {
629 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
630 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
631 pSrc += 3;
632 continue;
633 }
634 } else if(ch < 0xe0) {
635 if( /* handle U+0080..U+07FF inline */
636 ch >= 0xc2 &&
637 ((pSrcLimit - pSrc) >= 2) &&
638 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
639 ) {
640 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
641 pSrc += 2;
642 continue;
643 }
644 }
645
646 /* function call for "complicated" and error cases */
647 ++pSrc; /* continue after the lead byte */
648 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
649 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
650 *pErrorCode = U_INVALID_CHAR_FOUND;
651 return NULL;
652 }else if(ch<=0xFFFF){
653 *(pDest++)=(UChar)ch;
654 }else{
655 *(pDest++)=U16_LEAD(ch);
656 if(pDest<pDestLimit){
657 *(pDest++)=U16_TRAIL(ch);
658 }else{
659 reqLength++;
660 break;
661 }
662 }
663 }
664 }
665 /* do not fill the dest buffer just count the UChars needed */
666 while(pSrc < pSrcLimit){
667 ch = *pSrc;
668 if(ch <= 0x7f){
669 reqLength++;
670 ++pSrc;
671 } else {
672 if(ch > 0xe0) {
673 if( /* handle U+1000..U+CFFF inline */
674 ch <= 0xec &&
675 ((pSrcLimit - pSrc) >= 3) &&
676 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
677 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
678 ) {
679 reqLength++;
680 pSrc += 3;
681 continue;
682 }
683 } else if(ch < 0xe0) {
684 if( /* handle U+0080..U+07FF inline */
685 ch >= 0xc2 &&
686 ((pSrcLimit - pSrc) >= 2) &&
687 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
688 ) {
689 reqLength++;
690 pSrc += 2;
691 continue;
692 }
693 }
694
695 /* function call for "complicated" and error cases */
696 ++pSrc; /* continue after the lead byte */
697 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
698 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
699 *pErrorCode = U_INVALID_CHAR_FOUND;
700 return NULL;
701 }
702 reqLength+=U16_LENGTH(ch);
703 }
704 }
705 }
706
707 reqLength+=(int32_t)(pDest - dest);
708
709 if(pNumSubstitutions!=NULL) {
710 *pNumSubstitutions=numSubstitutions;
711 }
712
713 if(pDestLength){
714 *pDestLength = reqLength;
715 }
716
717 /* Terminate the buffer */
718 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
719
720 return dest;
721 }
722
723 U_CAPI UChar* U_EXPORT2
724 u_strFromUTF8(UChar *dest,
725 int32_t destCapacity,
726 int32_t *pDestLength,
727 const char* src,
728 int32_t srcLength,
729 UErrorCode *pErrorCode){
730 return u_strFromUTF8WithSub(
731 dest, destCapacity, pDestLength,
732 src, srcLength,
733 U_SENTINEL, NULL,
734 pErrorCode);
735 }
736
737 U_CAPI UChar * U_EXPORT2
738 u_strFromUTF8Lenient(UChar *dest,
739 int32_t destCapacity,
740 int32_t *pDestLength,
741 const char *src,
742 int32_t srcLength,
743 UErrorCode *pErrorCode) {
744 UChar *pDest = dest;
745 UChar32 ch;
746 int32_t reqLength = 0;
747 uint8_t* pSrc = (uint8_t*) src;
748
749 /* args check */
750 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
751 return NULL;
752 }
753
754 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
755 (destCapacity<0) || (dest == NULL && destCapacity > 0)
756 ) {
757 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
758 return NULL;
759 }
760
761 if(srcLength < 0) {
762 /* Transform a NUL-terminated string. */
763 UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
764 uint8_t t1, t2, t3; /* trail bytes */
765
766 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
767 if(ch < 0xc0) {
768 /*
769 * ASCII, or a trail byte in lead position which is treated like
770 * a single-byte sequence for better character boundary
771 * resynchronization after illegal sequences.
772 */
773 *pDest++=(UChar)ch;
774 ++pSrc;
775 continue;
776 } else if(ch < 0xe0) { /* U+0080..U+07FF */
777 if((t1 = pSrc[1]) != 0) {
778 /* 0x3080 = (0xc0 << 6) + 0x80 */
779 *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
780 pSrc += 2;
781 continue;
782 }
783 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
784 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
785 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
786 /* 0x2080 = (0x80 << 6) + 0x80 */
787 *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
788 pSrc += 3;
789 continue;
790 }
791 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
792 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
793 pSrc += 4;
794 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
795 ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
796 *(pDest++) = U16_LEAD(ch);
797 if(pDest < pDestLimit) {
798 *(pDest++) = U16_TRAIL(ch);
799 } else {
800 reqLength = 1;
801 break;
802 }
803 continue;
804 }
805 }
806
807 /* truncated character at the end */
808 *pDest++ = 0xfffd;
809 while(*++pSrc != 0) {}
810 break;
811 }
812
813 /* Pre-flight the rest of the string. */
814 while((ch = *pSrc) != 0) {
815 if(ch < 0xc0) {
816 /*
817 * ASCII, or a trail byte in lead position which is treated like
818 * a single-byte sequence for better character boundary
819 * resynchronization after illegal sequences.
820 */
821 ++reqLength;
822 ++pSrc;
823 continue;
824 } else if(ch < 0xe0) { /* U+0080..U+07FF */
825 if(pSrc[1] != 0) {
826 ++reqLength;
827 pSrc += 2;
828 continue;
829 }
830 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
831 if(pSrc[1] != 0 && pSrc[2] != 0) {
832 ++reqLength;
833 pSrc += 3;
834 continue;
835 }
836 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
837 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
838 reqLength += 2;
839 pSrc += 4;
840 continue;
841 }
842 }
843
844 /* truncated character at the end */
845 ++reqLength;
846 break;
847 }
848 } else /* srcLength >= 0 */ {
849 const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
850
851 /*
852 * This function requires that if srcLength is given, then it must be
853 * destCapatity >= srcLength so that we need not check for
854 * destination buffer overflow in the loop.
855 */
856 if(destCapacity < srcLength) {
857 if(pDestLength != NULL) {
858 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
859 }
860 *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
861 return NULL;
862 }
863
864 if((pSrcLimit - pSrc) >= 4) {
865 pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
866
867 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
868 do {
869 ch = *pSrc++;
870 if(ch < 0xc0) {
871 /*
872 * ASCII, or a trail byte in lead position which is treated like
873 * a single-byte sequence for better character boundary
874 * resynchronization after illegal sequences.
875 */
876 *pDest++=(UChar)ch;
877 } else if(ch < 0xe0) { /* U+0080..U+07FF */
878 /* 0x3080 = (0xc0 << 6) + 0x80 */
879 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
880 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
881 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
882 /* 0x2080 = (0x80 << 6) + 0x80 */
883 ch = (ch << 12) + (*pSrc++ << 6);
884 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
885 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
886 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
887 ch = (ch << 18) + (*pSrc++ << 12);
888 ch += *pSrc++ << 6;
889 ch += *pSrc++ - 0x3c82080;
890 *(pDest++) = U16_LEAD(ch);
891 *(pDest++) = U16_TRAIL(ch);
892 }
893 } while(pSrc < pSrcLimit);
894
895 pSrcLimit += 3; /* restore original pSrcLimit */
896 }
897
898 while(pSrc < pSrcLimit) {
899 ch = *pSrc++;
900 if(ch < 0xc0) {
901 /*
902 * ASCII, or a trail byte in lead position which is treated like
903 * a single-byte sequence for better character boundary
904 * resynchronization after illegal sequences.
905 */
906 *pDest++=(UChar)ch;
907 continue;
908 } else if(ch < 0xe0) { /* U+0080..U+07FF */
909 if(pSrc < pSrcLimit) {
910 /* 0x3080 = (0xc0 << 6) + 0x80 */
911 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
912 continue;
913 }
914 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
915 if((pSrcLimit - pSrc) >= 2) {
916 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
917 /* 0x2080 = (0x80 << 6) + 0x80 */
918 ch = (ch << 12) + (*pSrc++ << 6);
919 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
920 pSrc += 3;
921 continue;
922 }
923 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
924 if((pSrcLimit - pSrc) >= 3) {
925 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
926 ch = (ch << 18) + (*pSrc++ << 12);
927 ch += *pSrc++ << 6;
928 ch += *pSrc++ - 0x3c82080;
929 *(pDest++) = U16_LEAD(ch);
930 *(pDest++) = U16_TRAIL(ch);
931 pSrc += 4;
932 continue;
933 }
934 }
935
936 /* truncated character at the end */
937 *pDest++ = 0xfffd;
938 break;
939 }
940 }
941
942 reqLength+=(int32_t)(pDest - dest);
943
944 if(pDestLength){
945 *pDestLength = reqLength;
946 }
947
948 /* Terminate the buffer */
949 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
950
951 return dest;
952 }
953
954 static inline uint8_t *
955 _appendUTF8(uint8_t *pDest, UChar32 c) {
956 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
957 if((c)<=0x7f) {
958 *pDest++=(uint8_t)c;
959 } else if(c<=0x7ff) {
960 *pDest++=(uint8_t)((c>>6)|0xc0);
961 *pDest++=(uint8_t)((c&0x3f)|0x80);
962 } else if(c<=0xffff) {
963 *pDest++=(uint8_t)((c>>12)|0xe0);
964 *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
965 *pDest++=(uint8_t)(((c)&0x3f)|0x80);
966 } else /* if((uint32_t)(c)<=0x10ffff) */ {
967 *pDest++=(uint8_t)(((c)>>18)|0xf0);
968 *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
969 *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
970 *pDest++=(uint8_t)(((c)&0x3f)|0x80);
971 }
972 return pDest;
973 }
974
975
976 U_CAPI char* U_EXPORT2
977 u_strToUTF8WithSub(char *dest,
978 int32_t destCapacity,
979 int32_t *pDestLength,
980 const UChar *pSrc,
981 int32_t srcLength,
982 UChar32 subchar, int32_t *pNumSubstitutions,
983 UErrorCode *pErrorCode){
984 int32_t reqLength=0;
985 uint32_t ch=0,ch2=0;
986 uint8_t *pDest = (uint8_t *)dest;
987 uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
988 int32_t numSubstitutions;
989
990 /* args check */
991 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
992 return NULL;
993 }
994
995 if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
996 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
997 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
998 ) {
999 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1000 return NULL;
1001 }
1002
1003 if(pNumSubstitutions!=NULL) {
1004 *pNumSubstitutions=0;
1005 }
1006 numSubstitutions=0;
1007
1008 if(srcLength==-1) {
1009 while((ch=*pSrc)!=0) {
1010 ++pSrc;
1011 if(ch <= 0x7f) {
1012 if(pDest<pDestLimit) {
1013 *pDest++ = (uint8_t)ch;
1014 } else {
1015 reqLength = 1;
1016 break;
1017 }
1018 } else if(ch <= 0x7ff) {
1019 if((pDestLimit - pDest) >= 2) {
1020 *pDest++=(uint8_t)((ch>>6)|0xc0);
1021 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1022 } else {
1023 reqLength = 2;
1024 break;
1025 }
1026 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1027 if((pDestLimit - pDest) >= 3) {
1028 *pDest++=(uint8_t)((ch>>12)|0xe0);
1029 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1030 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1031 } else {
1032 reqLength = 3;
1033 break;
1034 }
1035 } else /* ch is a surrogate */ {
1036 int32_t length;
1037
1038 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
1039 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1040 ++pSrc;
1041 ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1042 } else if(subchar>=0) {
1043 ch=subchar;
1044 ++numSubstitutions;
1045 } else {
1046 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1047 *pErrorCode = U_INVALID_CHAR_FOUND;
1048 return NULL;
1049 }
1050
1051 length = U8_LENGTH(ch);
1052 if((pDestLimit - pDest) >= length) {
1053 /* convert and append*/
1054 pDest=_appendUTF8(pDest, ch);
1055 } else {
1056 reqLength = length;
1057 break;
1058 }
1059 }
1060 }
1061 while((ch=*pSrc++)!=0) {
1062 if(ch<=0x7f) {
1063 ++reqLength;
1064 } else if(ch<=0x7ff) {
1065 reqLength+=2;
1066 } else if(!U16_IS_SURROGATE(ch)) {
1067 reqLength+=3;
1068 } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1069 ++pSrc;
1070 reqLength+=4;
1071 } else if(subchar>=0) {
1072 reqLength+=U8_LENGTH(subchar);
1073 ++numSubstitutions;
1074 } else {
1075 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1076 *pErrorCode = U_INVALID_CHAR_FOUND;
1077 return NULL;
1078 }
1079 }
1080 } else {
1081 const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
1082 int32_t count;
1083
1084 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1085 for(;;) {
1086 /*
1087 * Each iteration of the inner loop progresses by at most 3 UTF-8
1088 * bytes and one UChar, for most characters.
1089 * For supplementary code points (4 & 2), which are rare,
1090 * there is an additional adjustment.
1091 */
1092 count = (int32_t)((pDestLimit - pDest) / 3);
1093 srcLength = (int32_t)(pSrcLimit - pSrc);
1094 if(count > srcLength) {
1095 count = srcLength; /* min(remaining dest/3, remaining src) */
1096 }
1097 if(count < 3) {
1098 /*
1099 * Too much overhead if we get near the end of the string,
1100 * continue with the next loop.
1101 */
1102 break;
1103 }
1104 do {
1105 ch=*pSrc++;
1106 if(ch <= 0x7f) {
1107 *pDest++ = (uint8_t)ch;
1108 } else if(ch <= 0x7ff) {
1109 *pDest++=(uint8_t)((ch>>6)|0xc0);
1110 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1111 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1112 *pDest++=(uint8_t)((ch>>12)|0xe0);
1113 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1114 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1115 } else /* ch is a surrogate */ {
1116 /*
1117 * We will read two UChars and probably output four bytes,
1118 * which we didn't account for with computing count,
1119 * so we adjust it here.
1120 */
1121 if(--count == 0) {
1122 --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
1123 break; /* recompute count */
1124 }
1125
1126 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1127 ++pSrc;
1128 ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1129
1130 /* writing 4 bytes per 2 UChars is ok */
1131 *pDest++=(uint8_t)((ch>>18)|0xf0);
1132 *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
1133 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1134 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1135 } else {
1136 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1137 if(subchar>=0) {
1138 ch=subchar;
1139 ++numSubstitutions;
1140 } else {
1141 *pErrorCode = U_INVALID_CHAR_FOUND;
1142 return NULL;
1143 }
1144
1145 /* convert and append*/
1146 pDest=_appendUTF8(pDest, ch);
1147 }
1148 }
1149 } while(--count > 0);
1150 }
1151
1152 while(pSrc<pSrcLimit) {
1153 ch=*pSrc++;
1154 if(ch <= 0x7f) {
1155 if(pDest<pDestLimit) {
1156 *pDest++ = (uint8_t)ch;
1157 } else {
1158 reqLength = 1;
1159 break;
1160 }
1161 } else if(ch <= 0x7ff) {
1162 if((pDestLimit - pDest) >= 2) {
1163 *pDest++=(uint8_t)((ch>>6)|0xc0);
1164 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1165 } else {
1166 reqLength = 2;
1167 break;
1168 }
1169 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1170 if((pDestLimit - pDest) >= 3) {
1171 *pDest++=(uint8_t)((ch>>12)|0xe0);
1172 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1173 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1174 } else {
1175 reqLength = 3;
1176 break;
1177 }
1178 } else /* ch is a surrogate */ {
1179 int32_t length;
1180
1181 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1182 ++pSrc;
1183 ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1184 } else if(subchar>=0) {
1185 ch=subchar;
1186 ++numSubstitutions;
1187 } else {
1188 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1189 *pErrorCode = U_INVALID_CHAR_FOUND;
1190 return NULL;
1191 }
1192
1193 length = U8_LENGTH(ch);
1194 if((pDestLimit - pDest) >= length) {
1195 /* convert and append*/
1196 pDest=_appendUTF8(pDest, ch);
1197 } else {
1198 reqLength = length;
1199 break;
1200 }
1201 }
1202 }
1203 while(pSrc<pSrcLimit) {
1204 ch=*pSrc++;
1205 if(ch<=0x7f) {
1206 ++reqLength;
1207 } else if(ch<=0x7ff) {
1208 reqLength+=2;
1209 } else if(!U16_IS_SURROGATE(ch)) {
1210 reqLength+=3;
1211 } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1212 ++pSrc;
1213 reqLength+=4;
1214 } else if(subchar>=0) {
1215 reqLength+=U8_LENGTH(subchar);
1216 ++numSubstitutions;
1217 } else {
1218 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1219 *pErrorCode = U_INVALID_CHAR_FOUND;
1220 return NULL;
1221 }
1222 }
1223 }
1224
1225 reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1226
1227 if(pNumSubstitutions!=NULL) {
1228 *pNumSubstitutions=numSubstitutions;
1229 }
1230
1231 if(pDestLength){
1232 *pDestLength = reqLength;
1233 }
1234
1235 /* Terminate the buffer */
1236 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1237 return dest;
1238 }
1239
1240 U_CAPI char* U_EXPORT2
1241 u_strToUTF8(char *dest,
1242 int32_t destCapacity,
1243 int32_t *pDestLength,
1244 const UChar *pSrc,
1245 int32_t srcLength,
1246 UErrorCode *pErrorCode){
1247 return u_strToUTF8WithSub(
1248 dest, destCapacity, pDestLength,
1249 pSrc, srcLength,
1250 U_SENTINEL, NULL,
1251 pErrorCode);
1252 }
1253
1254 U_CAPI UChar* U_EXPORT2
1255 u_strFromJavaModifiedUTF8WithSub(
1256 UChar *dest,
1257 int32_t destCapacity,
1258 int32_t *pDestLength,
1259 const char *src,
1260 int32_t srcLength,
1261 UChar32 subchar, int32_t *pNumSubstitutions,
1262 UErrorCode *pErrorCode) {
1263 UChar *pDest = dest;
1264 UChar *pDestLimit = dest+destCapacity;
1265 UChar32 ch;
1266 int32_t reqLength = 0;
1267 const uint8_t* pSrc = (const uint8_t*) src;
1268 const uint8_t *pSrcLimit;
1269 int32_t count;
1270 uint8_t t1, t2; /* trail bytes */
1271 int32_t numSubstitutions;
1272
1273 /* args check */
1274 if(U_FAILURE(*pErrorCode)){
1275 return NULL;
1276 }
1277 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1278 (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1279 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1280 ) {
1281 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1282 return NULL;
1283 }
1284
1285 if(pNumSubstitutions!=NULL) {
1286 *pNumSubstitutions=0;
1287 }
1288 numSubstitutions=0;
1289
1290 if(srcLength < 0) {
1291 /*
1292 * Transform a NUL-terminated ASCII string.
1293 * Handle non-ASCII strings with slower code.
1294 */
1295 while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
1296 *pDest++=(UChar)ch;
1297 ++pSrc;
1298 }
1299 if(ch == 0) {
1300 reqLength=(int32_t)(pDest - dest);
1301 if(pDestLength) {
1302 *pDestLength = reqLength;
1303 }
1304
1305 /* Terminate the buffer */
1306 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1307 return dest;
1308 }
1309 srcLength = uprv_strlen((const char *)pSrc);
1310 }
1311
1312 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1313 pSrcLimit = pSrc + srcLength;
1314 for(;;) {
1315 count = (int32_t)(pDestLimit - pDest);
1316 srcLength = (int32_t)(pSrcLimit - pSrc);
1317 if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
1318 /* fast ASCII loop */
1319 const uint8_t *prevSrc = pSrc;
1320 int32_t delta;
1321 while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
1322 *pDest++=(UChar)ch;
1323 ++pSrc;
1324 }
1325 delta = (int32_t)(pSrc - prevSrc);
1326 count -= delta;
1327 srcLength -= delta;
1328 }
1329 /*
1330 * Each iteration of the inner loop progresses by at most 3 UTF-8
1331 * bytes and one UChar.
1332 */
1333 srcLength /= 3;
1334 if(count > srcLength) {
1335 count = srcLength; /* min(remaining dest, remaining src/3) */
1336 }
1337 if(count < 3) {
1338 /*
1339 * Too much overhead if we get near the end of the string,
1340 * continue with the next loop.
1341 */
1342 break;
1343 }
1344 do {
1345 ch = *pSrc;
1346 if(ch <= 0x7f){
1347 *pDest++=(UChar)ch;
1348 ++pSrc;
1349 } else {
1350 if(ch >= 0xe0) {
1351 if( /* handle U+0000..U+FFFF inline */
1352 ch <= 0xef &&
1353 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1354 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1355 ) {
1356 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1357 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1358 pSrc += 3;
1359 continue;
1360 }
1361 } else {
1362 if( /* handle U+0000..U+07FF inline */
1363 ch >= 0xc0 &&
1364 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1365 ) {
1366 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1367 pSrc += 2;
1368 continue;
1369 }
1370 }
1371
1372 if(subchar < 0) {
1373 *pErrorCode = U_INVALID_CHAR_FOUND;
1374 return NULL;
1375 } else if(subchar > 0xffff && --count == 0) {
1376 /*
1377 * We need to write two UChars, adjusted count for that,
1378 * and ran out of space.
1379 */
1380 break;
1381 } else {
1382 /* function call for error cases */
1383 ++pSrc; /* continue after the lead byte */
1384 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1385 ++numSubstitutions;
1386 if(subchar<=0xFFFF) {
1387 *(pDest++)=(UChar)subchar;
1388 } else {
1389 *(pDest++)=U16_LEAD(subchar);
1390 *(pDest++)=U16_TRAIL(subchar);
1391 }
1392 }
1393 }
1394 } while(--count > 0);
1395 }
1396
1397 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
1398 ch = *pSrc;
1399 if(ch <= 0x7f){
1400 *pDest++=(UChar)ch;
1401 ++pSrc;
1402 } else {
1403 if(ch >= 0xe0) {
1404 if( /* handle U+0000..U+FFFF inline */
1405 ch <= 0xef &&
1406 ((pSrcLimit - pSrc) >= 3) &&
1407 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1408 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1409 ) {
1410 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1411 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1412 pSrc += 3;
1413 continue;
1414 }
1415 } else {
1416 if( /* handle U+0000..U+07FF inline */
1417 ch >= 0xc0 &&
1418 ((pSrcLimit - pSrc) >= 2) &&
1419 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1420 ) {
1421 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1422 pSrc += 2;
1423 continue;
1424 }
1425 }
1426
1427 if(subchar < 0) {
1428 *pErrorCode = U_INVALID_CHAR_FOUND;
1429 return NULL;
1430 } else {
1431 /* function call for error cases */
1432 ++pSrc; /* continue after the lead byte */
1433 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1434 ++numSubstitutions;
1435 if(subchar<=0xFFFF) {
1436 *(pDest++)=(UChar)subchar;
1437 } else {
1438 *(pDest++)=U16_LEAD(subchar);
1439 if(pDest<pDestLimit) {
1440 *(pDest++)=U16_TRAIL(subchar);
1441 } else {
1442 reqLength++;
1443 break;
1444 }
1445 }
1446 }
1447 }
1448 }
1449
1450 /* do not fill the dest buffer just count the UChars needed */
1451 while(pSrc < pSrcLimit){
1452 ch = *pSrc;
1453 if(ch <= 0x7f) {
1454 reqLength++;
1455 ++pSrc;
1456 } else {
1457 if(ch >= 0xe0) {
1458 if( /* handle U+0000..U+FFFF inline */
1459 ch <= 0xef &&
1460 ((pSrcLimit - pSrc) >= 3) &&
1461 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
1462 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
1463 ) {
1464 reqLength++;
1465 pSrc += 3;
1466 continue;
1467 }
1468 } else {
1469 if( /* handle U+0000..U+07FF inline */
1470 ch >= 0xc0 &&
1471 ((pSrcLimit - pSrc) >= 2) &&
1472 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
1473 ) {
1474 reqLength++;
1475 pSrc += 2;
1476 continue;
1477 }
1478 }
1479
1480 if(subchar < 0) {
1481 *pErrorCode = U_INVALID_CHAR_FOUND;
1482 return NULL;
1483 } else {
1484 /* function call for error cases */
1485 ++pSrc; /* continue after the lead byte */
1486 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1487 ++numSubstitutions;
1488 reqLength+=U16_LENGTH(ch);
1489 }
1490 }
1491 }
1492
1493 if(pNumSubstitutions!=NULL) {
1494 *pNumSubstitutions=numSubstitutions;
1495 }
1496
1497 reqLength+=(int32_t)(pDest - dest);
1498 if(pDestLength) {
1499 *pDestLength = reqLength;
1500 }
1501
1502 /* Terminate the buffer */
1503 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1504 return dest;
1505 }
1506
1507 U_CAPI char* U_EXPORT2
1508 u_strToJavaModifiedUTF8(
1509 char *dest,
1510 int32_t destCapacity,
1511 int32_t *pDestLength,
1512 const UChar *src,
1513 int32_t srcLength,
1514 UErrorCode *pErrorCode) {
1515 int32_t reqLength=0;
1516 uint32_t ch=0;
1517 uint8_t *pDest = (uint8_t *)dest;
1518 uint8_t *pDestLimit = pDest + destCapacity;
1519 const UChar *pSrcLimit;
1520 int32_t count;
1521
1522 /* args check */
1523 if(U_FAILURE(*pErrorCode)){
1524 return NULL;
1525 }
1526 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1527 (dest==NULL && destCapacity!=0) || destCapacity<0
1528 ) {
1529 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1530 return NULL;
1531 }
1532
1533 if(srcLength==-1) {
1534 /* Convert NUL-terminated ASCII, then find the string length. */
1535 while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1536 *pDest++ = (uint8_t)ch;
1537 ++src;
1538 }
1539 if(ch == 0) {
1540 reqLength=(int32_t)(pDest - (uint8_t *)dest);
1541 if(pDestLength) {
1542 *pDestLength = reqLength;
1543 }
1544
1545 /* Terminate the buffer */
1546 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1547 return dest;
1548 }
1549 srcLength = u_strlen(src);
1550 }
1551
1552 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1553 pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
1554 for(;;) {
1555 count = (int32_t)(pDestLimit - pDest);
1556 srcLength = (int32_t)(pSrcLimit - src);
1557 if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1558 /* fast ASCII loop */
1559 const UChar *prevSrc = src;
1560 int32_t delta;
1561 while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1562 *pDest++=(uint8_t)ch;
1563 ++src;
1564 }
1565 delta = (int32_t)(src - prevSrc);
1566 count -= delta;
1567 srcLength -= delta;
1568 }
1569 /*
1570 * Each iteration of the inner loop progresses by at most 3 UTF-8
1571 * bytes and one UChar.
1572 */
1573 count /= 3;
1574 if(count > srcLength) {
1575 count = srcLength; /* min(remaining dest/3, remaining src) */
1576 }
1577 if(count < 3) {
1578 /*
1579 * Too much overhead if we get near the end of the string,
1580 * continue with the next loop.
1581 */
1582 break;
1583 }
1584 do {
1585 ch=*src++;
1586 if(ch <= 0x7f && ch != 0) {
1587 *pDest++ = (uint8_t)ch;
1588 } else if(ch <= 0x7ff) {
1589 *pDest++=(uint8_t)((ch>>6)|0xc0);
1590 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1591 } else {
1592 *pDest++=(uint8_t)((ch>>12)|0xe0);
1593 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1594 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1595 }
1596 } while(--count > 0);
1597 }
1598
1599 while(src<pSrcLimit) {
1600 ch=*src++;
1601 if(ch <= 0x7f && ch != 0) {
1602 if(pDest<pDestLimit) {
1603 *pDest++ = (uint8_t)ch;
1604 } else {
1605 reqLength = 1;
1606 break;
1607 }
1608 } else if(ch <= 0x7ff) {
1609 if((pDestLimit - pDest) >= 2) {
1610 *pDest++=(uint8_t)((ch>>6)|0xc0);
1611 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1612 } else {
1613 reqLength = 2;
1614 break;
1615 }
1616 } else {
1617 if((pDestLimit - pDest) >= 3) {
1618 *pDest++=(uint8_t)((ch>>12)|0xe0);
1619 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1620 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1621 } else {
1622 reqLength = 3;
1623 break;
1624 }
1625 }
1626 }
1627 while(src<pSrcLimit) {
1628 ch=*src++;
1629 if(ch <= 0x7f && ch != 0) {
1630 ++reqLength;
1631 } else if(ch<=0x7ff) {
1632 reqLength+=2;
1633 } else {
1634 reqLength+=3;
1635 }
1636 }
1637
1638 reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1639 if(pDestLength){
1640 *pDestLength = reqLength;
1641 }
1642
1643 /* Terminate the buffer */
1644 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1645 return dest;
1646 }