]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ustrtrns.cpp
ICU-57166.0.1.tar.gz
[apple/icu.git] / icuSources / common / ustrtrns.cpp
CommitLineData
b75a7d8f
A
1/*
2******************************************************************************
3*
2ca993e8 4* Copyright (C) 2001-2016, International Business Machines
b75a7d8f
A
5* Corporation and others. All Rights Reserved.
6*
7******************************************************************************
8*
4388f060 9* File ustrtrns.cpp
b75a7d8f
A
10*
11* Modification History:
12*
13* Date Name Description
14* 9/10/2001 Ram Creation.
15******************************************************************************
16*/
17
18/*******************************************************************************
19 *
20 * u_strTo* and u_strFrom* APIs
374ca955 21 * WCS functions moved to ustr_wcs.c for better modularization
b75a7d8f
A
22 *
23 *******************************************************************************
24 */
25
26
27#include "unicode/putil.h"
b75a7d8f 28#include "unicode/ustring.h"
4388f060
A
29#include "unicode/utf.h"
30#include "unicode/utf8.h"
31#include "unicode/utf16.h"
b75a7d8f 32#include "cstring.h"
b75a7d8f
A
33#include "cmemory.h"
34#include "ustr_imp.h"
4388f060 35#include "uassert.h"
b75a7d8f 36
b75a7d8f 37U_CAPI UChar* U_EXPORT2
729e4ab9
A
38u_strFromUTF32WithSub(UChar *dest,
39 int32_t destCapacity,
b75a7d8f
A
40 int32_t *pDestLength,
41 const UChar32 *src,
42 int32_t srcLength,
729e4ab9
A
43 UChar32 subchar, int32_t *pNumSubstitutions,
44 UErrorCode *pErrorCode) {
45 const UChar32 *srcLimit;
46 UChar32 ch;
47 UChar *destLimit;
48 UChar *pDest;
49 int32_t reqLength;
50 int32_t numSubstitutions;
b75a7d8f
A
51
52 /* args check */
729e4ab9 53 if(U_FAILURE(*pErrorCode)){
b75a7d8f
A
54 return NULL;
55 }
729e4ab9
A
56 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
57 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
58 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
59 ) {
b75a7d8f
A
60 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
61 return NULL;
62 }
63
729e4ab9
A
64 if(pNumSubstitutions != NULL) {
65 *pNumSubstitutions = 0;
66 }
67
68 pDest = dest;
4388f060 69 destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
729e4ab9
A
70 reqLength = 0;
71 numSubstitutions = 0;
72
73 if(srcLength < 0) {
74 /* simple loop for conversion of a NUL-terminated BMP string */
75 while((ch=*src) != 0 &&
76 ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
77 ++src;
78 if(pDest < destLimit) {
79 *pDest++ = (UChar)ch;
80 } else {
81 ++reqLength;
b75a7d8f
A
82 }
83 }
729e4ab9
A
84 srcLimit = src;
85 if(ch != 0) {
86 /* "complicated" case, find the end of the remaining string */
87 while(*++srcLimit != 0) {}
b75a7d8f 88 }
729e4ab9 89 } else {
4388f060 90 srcLimit = (src!=NULL)?(src + srcLength):NULL;
729e4ab9
A
91 }
92
93 /* convert with length */
94 while(src < srcLimit) {
95 ch = *src++;
96 do {
97 /* usually "loops" once; twice only for writing subchar */
98 if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
99 if(pDest < destLimit) {
100 *pDest++ = (UChar)ch;
101 } else {
102 ++reqLength;
103 }
104 break;
105 } else if(0x10000 <= ch && ch <= 0x10ffff) {
4388f060 106 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
729e4ab9
A
107 *pDest++ = U16_LEAD(ch);
108 *pDest++ = U16_TRAIL(ch);
109 } else {
110 reqLength += 2;
b75a7d8f 111 }
729e4ab9
A
112 break;
113 } else if((ch = subchar) < 0) {
114 /* surrogate code point, or not a Unicode code point at all */
b75a7d8f
A
115 *pErrorCode = U_INVALID_CHAR_FOUND;
116 return NULL;
729e4ab9
A
117 } else {
118 ++numSubstitutions;
b75a7d8f 119 }
729e4ab9 120 } while(TRUE);
b75a7d8f
A
121 }
122
73c04bcf 123 reqLength += (int32_t)(pDest - dest);
729e4ab9 124 if(pDestLength) {
b75a7d8f
A
125 *pDestLength = reqLength;
126 }
729e4ab9
A
127 if(pNumSubstitutions != NULL) {
128 *pNumSubstitutions = numSubstitutions;
129 }
b75a7d8f
A
130
131 /* Terminate the buffer */
729e4ab9 132 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
b75a7d8f
A
133
134 return dest;
135}
136
729e4ab9
A
137U_CAPI UChar* U_EXPORT2
138u_strFromUTF32(UChar *dest,
139 int32_t destCapacity,
140 int32_t *pDestLength,
141 const UChar32 *src,
142 int32_t srcLength,
143 UErrorCode *pErrorCode) {
144 return u_strFromUTF32WithSub(
145 dest, destCapacity, pDestLength,
146 src, srcLength,
147 U_SENTINEL, NULL,
148 pErrorCode);
149}
b75a7d8f
A
150
151U_CAPI UChar32* U_EXPORT2
729e4ab9
A
152u_strToUTF32WithSub(UChar32 *dest,
153 int32_t destCapacity,
154 int32_t *pDestLength,
155 const UChar *src,
156 int32_t srcLength,
157 UChar32 subchar, int32_t *pNumSubstitutions,
158 UErrorCode *pErrorCode) {
159 const UChar *srcLimit;
160 UChar32 ch;
161 UChar ch2;
162 UChar32 *destLimit;
163 UChar32 *pDest;
164 int32_t reqLength;
165 int32_t numSubstitutions;
b75a7d8f
A
166
167 /* args check */
729e4ab9 168 if(U_FAILURE(*pErrorCode)){
b75a7d8f
A
169 return NULL;
170 }
729e4ab9
A
171 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
172 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
173 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
174 ) {
b75a7d8f
A
175 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
176 return NULL;
177 }
178
729e4ab9
A
179 if(pNumSubstitutions != NULL) {
180 *pNumSubstitutions = 0;
181 }
182
183 pDest = dest;
4388f060 184 destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
729e4ab9
A
185 reqLength = 0;
186 numSubstitutions = 0;
187
188 if(srcLength < 0) {
189 /* simple loop for conversion of a NUL-terminated BMP string */
190 while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
191 ++src;
192 if(pDest < destLimit) {
193 *pDest++ = ch;
194 } else {
195 ++reqLength;
b75a7d8f 196 }
b75a7d8f 197 }
729e4ab9
A
198 srcLimit = src;
199 if(ch != 0) {
200 /* "complicated" case, find the end of the remaining string */
201 while(*++srcLimit != 0) {}
b75a7d8f
A
202 }
203 } else {
4388f060 204 srcLimit = (src!=NULL)?(src + srcLength):NULL;
729e4ab9
A
205 }
206
207 /* convert with length */
208 while(src < srcLimit) {
209 ch = *src++;
210 if(!U16_IS_SURROGATE(ch)) {
211 /* write or count ch below */
212 } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
213 ++src;
214 ch = U16_GET_SUPPLEMENTARY(ch, ch2);
215 } else if((ch = subchar) < 0) {
216 /* unpaired surrogate */
217 *pErrorCode = U_INVALID_CHAR_FOUND;
218 return NULL;
219 } else {
220 ++numSubstitutions;
b75a7d8f 221 }
729e4ab9
A
222 if(pDest < destLimit) {
223 *pDest++ = ch;
224 } else {
b75a7d8f
A
225 ++reqLength;
226 }
227 }
228
729e4ab9
A
229 reqLength += (int32_t)(pDest - dest);
230 if(pDestLength) {
b75a7d8f
A
231 *pDestLength = reqLength;
232 }
729e4ab9
A
233 if(pNumSubstitutions != NULL) {
234 *pNumSubstitutions = numSubstitutions;
235 }
b75a7d8f
A
236
237 /* Terminate the buffer */
729e4ab9 238 u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
b75a7d8f
A
239
240 return dest;
241}
242
729e4ab9
A
243U_CAPI UChar32* U_EXPORT2
244u_strToUTF32(UChar32 *dest,
245 int32_t destCapacity,
246 int32_t *pDestLength,
247 const UChar *src,
248 int32_t srcLength,
249 UErrorCode *pErrorCode) {
250 return u_strToUTF32WithSub(
251 dest, destCapacity, pDestLength,
252 src, srcLength,
253 U_SENTINEL, NULL,
254 pErrorCode);
255}
256
73c04bcf
A
257/* for utf8_nextCharSafeBodyTerminated() */
258static const UChar32
259utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
260
261/*
262 * Version of utf8_nextCharSafeBody() with the following differences:
263 * - checks for NUL termination instead of length
264 * - works with pointers instead of indexes
265 * - always strict (strict==-1)
266 *
267 * *ps points to after the lead byte and will be moved to after the last trail byte.
268 * c is the lead byte.
269 * @return the code point, or U_SENTINEL
270 */
271static UChar32
272utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
273 const uint8_t *s=*ps;
274 uint8_t trail, illegal=0;
4388f060
A
275 uint8_t count=U8_COUNT_TRAIL_BYTES(c);
276 U_ASSERT(count<6);
277 U8_MASK_LEAD_BYTE((c), count);
73c04bcf
A
278 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
279 switch(count) {
280 /* each branch falls through to the next one */
281 case 5:
282 case 4:
283 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
284 illegal=1;
285 break;
286 case 3:
287 trail=(uint8_t)(*s++ - 0x80);
288 c=(c<<6)|trail;
289 if(trail>0x3f || c>=0x110) {
290 /* not a trail byte, or code point>0x10ffff (outside Unicode) */
291 illegal=1;
292 break;
293 }
2ca993e8
A
294 U_FALLTHROUGH;
295 case 2:
73c04bcf
A
296 trail=(uint8_t)(*s++ - 0x80);
297 if(trail>0x3f) {
298 /* not a trail byte */
299 illegal=1;
300 break;
301 }
302 c=(c<<6)|trail;
2ca993e8
A
303 U_FALLTHROUGH;
304 case 1:
73c04bcf
A
305 trail=(uint8_t)(*s++ - 0x80);
306 if(trail>0x3f) {
307 /* not a trail byte */
308 illegal=1;
309 }
310 c=(c<<6)|trail;
311 break;
312 case 0:
313 return U_SENTINEL;
314 /* no default branch to optimize switch() - all values are covered */
315 }
316
317 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
318 /* illegal is also set if count>=4 */
4388f060 319 if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
73c04bcf
A
320 /* error handling */
321 /* don't go beyond this sequence */
322 s=*ps;
4388f060 323 while(count>0 && U8_IS_TRAIL(*s)) {
73c04bcf
A
324 ++s;
325 --count;
326 }
327 c=U_SENTINEL;
328 }
329 *ps=s;
330 return c;
331}
332
333/*
334 * Version of utf8_nextCharSafeBody() with the following differences:
335 * - works with pointers instead of indexes
336 * - always strict (strict==-1)
337 *
338 * *ps points to after the lead byte and will be moved to after the last trail byte.
339 * c is the lead byte.
340 * @return the code point, or U_SENTINEL
341 */
342static UChar32
343utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
344 const uint8_t *s=*ps;
345 uint8_t trail, illegal=0;
4388f060 346 uint8_t count=U8_COUNT_TRAIL_BYTES(c);
73c04bcf 347 if((limit-s)>=count) {
4388f060 348 U8_MASK_LEAD_BYTE((c), count);
73c04bcf
A
349 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
350 switch(count) {
351 /* each branch falls through to the next one */
352 case 5:
353 case 4:
354 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
355 illegal=1;
356 break;
357 case 3:
358 trail=*s++;
359 c=(c<<6)|(trail&0x3f);
360 if(c<0x110) {
361 illegal|=(trail&0xc0)^0x80;
362 } else {
363 /* code point>0x10ffff, outside Unicode */
364 illegal=1;
365 break;
366 }
2ca993e8
A
367 U_FALLTHROUGH;
368 case 2:
73c04bcf
A
369 trail=*s++;
370 c=(c<<6)|(trail&0x3f);
371 illegal|=(trail&0xc0)^0x80;
2ca993e8
A
372 U_FALLTHROUGH;
373 case 1:
73c04bcf
A
374 trail=*s++;
375 c=(c<<6)|(trail&0x3f);
376 illegal|=(trail&0xc0)^0x80;
377 break;
378 case 0:
379 return U_SENTINEL;
380 /* no default branch to optimize switch() - all values are covered */
381 }
382 } else {
383 illegal=1; /* too few bytes left */
384 }
385
386 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
387 /* illegal is also set if count>=4 */
b331163b 388 U_ASSERT(illegal || count<UPRV_LENGTHOF(utf8_minLegal));
4388f060 389 if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
73c04bcf
A
390 /* error handling */
391 /* don't go beyond this sequence */
392 s=*ps;
4388f060 393 while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
73c04bcf
A
394 ++s;
395 --count;
396 }
397 c=U_SENTINEL;
398 }
399 *ps=s;
400 return c;
401}
402
b75a7d8f 403U_CAPI UChar* U_EXPORT2
73c04bcf 404u_strFromUTF8WithSub(UChar *dest,
b75a7d8f
A
405 int32_t destCapacity,
406 int32_t *pDestLength,
73c04bcf 407 const char* src,
b75a7d8f 408 int32_t srcLength,
73c04bcf 409 UChar32 subchar, int32_t *pNumSubstitutions,
b75a7d8f 410 UErrorCode *pErrorCode){
b75a7d8f
A
411 UChar *pDest = dest;
412 UChar *pDestLimit = dest+destCapacity;
73c04bcf 413 UChar32 ch;
b75a7d8f 414 int32_t reqLength = 0;
73c04bcf
A
415 const uint8_t* pSrc = (const uint8_t*) src;
416 uint8_t t1, t2; /* trail bytes */
417 int32_t numSubstitutions;
b75a7d8f
A
418
419 /* args check */
420 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
421 return NULL;
422 }
423
729e4ab9
A
424 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
425 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
73c04bcf
A
426 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
427 ) {
b75a7d8f
A
428 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
429 return NULL;
430 }
431
729e4ab9
A
432 if(pNumSubstitutions!=NULL) {
433 *pNumSubstitutions=0;
434 }
73c04bcf
A
435 numSubstitutions=0;
436
437 /*
438 * Inline processing of UTF-8 byte sequences:
439 *
440 * Byte sequences for the most common characters are handled inline in
441 * the conversion loops. In order to reduce the path lengths for those
442 * characters, the tests are arranged in a kind of binary search.
443 * ASCII (<=0x7f) is checked first, followed by the dividing point
444 * between 2- and 3-byte sequences (0xe0).
445 * The 3-byte branch is tested first to speed up CJK text.
446 * The compiler should combine the subtractions for the two tests for 0xe0.
447 * Each branch then tests for the other end of its range.
448 */
449
450 if(srcLength < 0){
451 /*
452 * Transform a NUL-terminated string.
453 * The code explicitly checks for NULs only in the lead byte position.
454 * A NUL byte in the trail byte position fails the trail byte range check anyway.
455 */
456 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
457 if(ch <= 0x7f){
458 *pDest++=(UChar)ch;
459 ++pSrc;
460 } else {
461 if(ch > 0xe0) {
462 if( /* handle U+1000..U+CFFF inline */
463 ch <= 0xec &&
464 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
465 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
466 ) {
467 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
468 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
469 pSrc += 3;
470 continue;
471 }
472 } else if(ch < 0xe0) {
473 if( /* handle U+0080..U+07FF inline */
474 ch >= 0xc2 &&
475 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
476 ) {
477 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
478 pSrc += 2;
479 continue;
480 }
481 }
482
483 /* function call for "complicated" and error cases */
484 ++pSrc; /* continue after the lead byte */
485 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
486 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
487 *pErrorCode = U_INVALID_CHAR_FOUND;
488 return NULL;
489 } else if(ch<=0xFFFF) {
490 *(pDest++)=(UChar)ch;
491 } else {
4388f060 492 *(pDest++)=U16_LEAD(ch);
73c04bcf 493 if(pDest<pDestLimit) {
4388f060 494 *(pDest++)=U16_TRAIL(ch);
73c04bcf
A
495 } else {
496 reqLength++;
497 break;
498 }
499 }
500 }
501 }
502
503 /* Pre-flight the rest of the string. */
504 while((ch = *pSrc) != 0) {
505 if(ch <= 0x7f){
506 ++reqLength;
507 ++pSrc;
508 } else {
509 if(ch > 0xe0) {
510 if( /* handle U+1000..U+CFFF inline */
511 ch <= 0xec &&
512 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
513 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
514 ) {
515 ++reqLength;
516 pSrc += 3;
517 continue;
518 }
519 } else if(ch < 0xe0) {
520 if( /* handle U+0080..U+07FF inline */
521 ch >= 0xc2 &&
522 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
523 ) {
524 ++reqLength;
525 pSrc += 2;
526 continue;
527 }
528 }
529
530 /* function call for "complicated" and error cases */
531 ++pSrc; /* continue after the lead byte */
532 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
533 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
534 *pErrorCode = U_INVALID_CHAR_FOUND;
535 return NULL;
536 }
537 reqLength += U16_LENGTH(ch);
538 }
539 }
540 } else /* srcLength >= 0 */ {
541 const uint8_t *pSrcLimit = pSrc + srcLength;
542 int32_t count;
543
544 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
545 for(;;) {
546 /*
547 * Each iteration of the inner loop progresses by at most 3 UTF-8
548 * bytes and one UChar, for most characters.
549 * For supplementary code points (4 & 2), which are rare,
550 * there is an additional adjustment.
551 */
552 count = (int32_t)(pDestLimit - pDest);
553 srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
554 if(count > srcLength) {
555 count = srcLength; /* min(remaining dest, remaining src/3) */
556 }
557 if(count < 3) {
558 /*
559 * Too much overhead if we get near the end of the string,
560 * continue with the next loop.
561 */
562 break;
563 }
564
565 do {
566 ch = *pSrc;
567 if(ch <= 0x7f){
568 *pDest++=(UChar)ch;
569 ++pSrc;
570 } else {
571 if(ch > 0xe0) {
572 if( /* handle U+1000..U+CFFF inline */
573 ch <= 0xec &&
574 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
575 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
576 ) {
577 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
578 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
579 pSrc += 3;
580 continue;
581 }
582 } else if(ch < 0xe0) {
583 if( /* handle U+0080..U+07FF inline */
584 ch >= 0xc2 &&
585 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
586 ) {
587 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
588 pSrc += 2;
589 continue;
590 }
591 }
592
593 if(ch >= 0xf0 || subchar > 0xffff) {
594 /*
595 * We may read up to six bytes and write up to two UChars,
596 * which we didn't account for with computing count,
597 * so we adjust it here.
598 */
599 if(--count == 0) {
600 break;
601 }
602 }
603
604 /* function call for "complicated" and error cases */
605 ++pSrc; /* continue after the lead byte */
606 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
607 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
608 *pErrorCode = U_INVALID_CHAR_FOUND;
609 return NULL;
610 }else if(ch<=0xFFFF){
611 *(pDest++)=(UChar)ch;
612 }else{
4388f060
A
613 *(pDest++)=U16_LEAD(ch);
614 *(pDest++)=U16_TRAIL(ch);
73c04bcf
A
615 }
616 }
617 } while(--count > 0);
618 }
619
620 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
621 ch = *pSrc;
622 if(ch <= 0x7f){
623 *pDest++=(UChar)ch;
624 ++pSrc;
625 } else {
626 if(ch > 0xe0) {
627 if( /* handle U+1000..U+CFFF inline */
628 ch <= 0xec &&
629 ((pSrcLimit - pSrc) >= 3) &&
630 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
631 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
632 ) {
633 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
634 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
635 pSrc += 3;
636 continue;
637 }
638 } else if(ch < 0xe0) {
639 if( /* handle U+0080..U+07FF inline */
640 ch >= 0xc2 &&
641 ((pSrcLimit - pSrc) >= 2) &&
642 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
643 ) {
644 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
645 pSrc += 2;
646 continue;
647 }
648 }
649
650 /* function call for "complicated" and error cases */
651 ++pSrc; /* continue after the lead byte */
652 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
653 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
654 *pErrorCode = U_INVALID_CHAR_FOUND;
655 return NULL;
656 }else if(ch<=0xFFFF){
657 *(pDest++)=(UChar)ch;
b75a7d8f 658 }else{
4388f060 659 *(pDest++)=U16_LEAD(ch);
73c04bcf 660 if(pDest<pDestLimit){
4388f060 661 *(pDest++)=U16_TRAIL(ch);
73c04bcf
A
662 }else{
663 reqLength++;
664 break;
665 }
666 }
667 }
668 }
729e4ab9 669 /* do not fill the dest buffer just count the UChars needed */
73c04bcf
A
670 while(pSrc < pSrcLimit){
671 ch = *pSrc;
672 if(ch <= 0x7f){
673 reqLength++;
674 ++pSrc;
675 } else {
676 if(ch > 0xe0) {
677 if( /* handle U+1000..U+CFFF inline */
678 ch <= 0xec &&
679 ((pSrcLimit - pSrc) >= 3) &&
680 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
681 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
682 ) {
683 reqLength++;
684 pSrc += 3;
685 continue;
686 }
687 } else if(ch < 0xe0) {
688 if( /* handle U+0080..U+07FF inline */
689 ch >= 0xc2 &&
690 ((pSrcLimit - pSrc) >= 2) &&
691 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
692 ) {
693 reqLength++;
694 pSrc += 2;
695 continue;
696 }
b75a7d8f 697 }
73c04bcf
A
698
699 /* function call for "complicated" and error cases */
700 ++pSrc; /* continue after the lead byte */
701 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
702 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
703 *pErrorCode = U_INVALID_CHAR_FOUND;
704 return NULL;
705 }
4388f060 706 reqLength+=U16_LENGTH(ch);
b75a7d8f
A
707 }
708 }
709 }
73c04bcf
A
710
711 reqLength+=(int32_t)(pDest - dest);
712
713 if(pNumSubstitutions!=NULL) {
714 *pNumSubstitutions=numSubstitutions;
715 }
716
717 if(pDestLength){
718 *pDestLength = reqLength;
719 }
720
721 /* Terminate the buffer */
722 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
723
724 return dest;
725}
726
727U_CAPI UChar* U_EXPORT2
728u_strFromUTF8(UChar *dest,
729 int32_t destCapacity,
730 int32_t *pDestLength,
731 const char* src,
732 int32_t srcLength,
733 UErrorCode *pErrorCode){
734 return u_strFromUTF8WithSub(
735 dest, destCapacity, pDestLength,
736 src, srcLength,
737 U_SENTINEL, NULL,
738 pErrorCode);
739}
740
741U_CAPI UChar * U_EXPORT2
742u_strFromUTF8Lenient(UChar *dest,
743 int32_t destCapacity,
744 int32_t *pDestLength,
745 const char *src,
746 int32_t srcLength,
747 UErrorCode *pErrorCode) {
73c04bcf
A
748 UChar *pDest = dest;
749 UChar32 ch;
750 int32_t reqLength = 0;
751 uint8_t* pSrc = (uint8_t*) src;
752
753 /* args check */
754 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
755 return NULL;
756 }
757
729e4ab9
A
758 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
759 (destCapacity<0) || (dest == NULL && destCapacity > 0)
760 ) {
73c04bcf
A
761 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
762 return NULL;
763 }
764
765 if(srcLength < 0) {
766 /* Transform a NUL-terminated string. */
4388f060 767 UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
73c04bcf
A
768 uint8_t t1, t2, t3; /* trail bytes */
769
770 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
771 if(ch < 0xc0) {
772 /*
773 * ASCII, or a trail byte in lead position which is treated like
774 * a single-byte sequence for better character boundary
775 * resynchronization after illegal sequences.
776 */
777 *pDest++=(UChar)ch;
778 ++pSrc;
779 continue;
780 } else if(ch < 0xe0) { /* U+0080..U+07FF */
781 if((t1 = pSrc[1]) != 0) {
782 /* 0x3080 = (0xc0 << 6) + 0x80 */
783 *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
784 pSrc += 2;
785 continue;
786 }
787 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
788 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
789 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
790 /* 0x2080 = (0x80 << 6) + 0x80 */
791 *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
792 pSrc += 3;
793 continue;
794 }
795 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
796 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
797 pSrc += 4;
798 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
799 ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
800 *(pDest++) = U16_LEAD(ch);
801 if(pDest < pDestLimit) {
802 *(pDest++) = U16_TRAIL(ch);
803 } else {
804 reqLength = 1;
805 break;
806 }
807 continue;
808 }
b75a7d8f 809 }
73c04bcf
A
810
811 /* truncated character at the end */
812 *pDest++ = 0xfffd;
813 while(*++pSrc != 0) {}
814 break;
815 }
816
817 /* Pre-flight the rest of the string. */
818 while((ch = *pSrc) != 0) {
819 if(ch < 0xc0) {
820 /*
821 * ASCII, or a trail byte in lead position which is treated like
822 * a single-byte sequence for better character boundary
823 * resynchronization after illegal sequences.
824 */
825 ++reqLength;
826 ++pSrc;
827 continue;
828 } else if(ch < 0xe0) { /* U+0080..U+07FF */
829 if(pSrc[1] != 0) {
830 ++reqLength;
831 pSrc += 2;
832 continue;
833 }
834 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
835 if(pSrc[1] != 0 && pSrc[2] != 0) {
836 ++reqLength;
837 pSrc += 3;
838 continue;
839 }
840 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
841 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
842 reqLength += 2;
843 pSrc += 4;
844 continue;
845 }
846 }
847
848 /* truncated character at the end */
849 ++reqLength;
850 break;
851 }
852 } else /* srcLength >= 0 */ {
4388f060 853 const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
73c04bcf
A
854
855 /*
856 * This function requires that if srcLength is given, then it must be
857 * destCapatity >= srcLength so that we need not check for
858 * destination buffer overflow in the loop.
859 */
860 if(destCapacity < srcLength) {
861 if(pDestLength != NULL) {
862 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
863 }
864 *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
865 return NULL;
866 }
867
868 if((pSrcLimit - pSrc) >= 4) {
869 pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
870
871 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
872 do {
873 ch = *pSrc++;
874 if(ch < 0xc0) {
875 /*
876 * ASCII, or a trail byte in lead position which is treated like
877 * a single-byte sequence for better character boundary
878 * resynchronization after illegal sequences.
879 */
880 *pDest++=(UChar)ch;
881 } else if(ch < 0xe0) { /* U+0080..U+07FF */
882 /* 0x3080 = (0xc0 << 6) + 0x80 */
883 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
884 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
885 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
886 /* 0x2080 = (0x80 << 6) + 0x80 */
887 ch = (ch << 12) + (*pSrc++ << 6);
888 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
889 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
890 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
891 ch = (ch << 18) + (*pSrc++ << 12);
892 ch += *pSrc++ << 6;
893 ch += *pSrc++ - 0x3c82080;
894 *(pDest++) = U16_LEAD(ch);
895 *(pDest++) = U16_TRAIL(ch);
896 }
897 } while(pSrc < pSrcLimit);
898
899 pSrcLimit += 3; /* restore original pSrcLimit */
900 }
901
902 while(pSrc < pSrcLimit) {
903 ch = *pSrc++;
904 if(ch < 0xc0) {
905 /*
906 * ASCII, or a trail byte in lead position which is treated like
907 * a single-byte sequence for better character boundary
908 * resynchronization after illegal sequences.
909 */
910 *pDest++=(UChar)ch;
911 continue;
912 } else if(ch < 0xe0) { /* U+0080..U+07FF */
913 if(pSrc < pSrcLimit) {
914 /* 0x3080 = (0xc0 << 6) + 0x80 */
46f4442e 915 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
73c04bcf
A
916 continue;
917 }
918 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
919 if((pSrcLimit - pSrc) >= 2) {
920 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
921 /* 0x2080 = (0x80 << 6) + 0x80 */
922 ch = (ch << 12) + (*pSrc++ << 6);
923 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
924 pSrc += 3;
925 continue;
926 }
927 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
928 if((pSrcLimit - pSrc) >= 3) {
929 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
930 ch = (ch << 18) + (*pSrc++ << 12);
931 ch += *pSrc++ << 6;
932 ch += *pSrc++ - 0x3c82080;
933 *(pDest++) = U16_LEAD(ch);
934 *(pDest++) = U16_TRAIL(ch);
935 pSrc += 4;
936 continue;
937 }
938 }
939
940 /* truncated character at the end */
941 *pDest++ = 0xfffd;
942 break;
b75a7d8f
A
943 }
944 }
945
73c04bcf 946 reqLength+=(int32_t)(pDest - dest);
b75a7d8f
A
947
948 if(pDestLength){
949 *pDestLength = reqLength;
950 }
951
952 /* Terminate the buffer */
953 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
954
955 return dest;
956}
957
4388f060 958static inline uint8_t *
b75a7d8f 959_appendUTF8(uint8_t *pDest, UChar32 c) {
73c04bcf
A
960 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
961 if((c)<=0x7f) {
962 *pDest++=(uint8_t)c;
963 } else if(c<=0x7ff) {
b75a7d8f
A
964 *pDest++=(uint8_t)((c>>6)|0xc0);
965 *pDest++=(uint8_t)((c&0x3f)|0x80);
73c04bcf 966 } else if(c<=0xffff) {
b75a7d8f
A
967 *pDest++=(uint8_t)((c>>12)|0xe0);
968 *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
969 *pDest++=(uint8_t)(((c)&0x3f)|0x80);
970 } else /* if((uint32_t)(c)<=0x10ffff) */ {
971 *pDest++=(uint8_t)(((c)>>18)|0xf0);
972 *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
973 *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
974 *pDest++=(uint8_t)(((c)&0x3f)|0x80);
975 }
976 return pDest;
977}
978
979
980U_CAPI char* U_EXPORT2
73c04bcf 981u_strToUTF8WithSub(char *dest,
b75a7d8f
A
982 int32_t destCapacity,
983 int32_t *pDestLength,
73c04bcf 984 const UChar *pSrc,
b75a7d8f 985 int32_t srcLength,
73c04bcf 986 UChar32 subchar, int32_t *pNumSubstitutions,
b75a7d8f 987 UErrorCode *pErrorCode){
b75a7d8f 988 int32_t reqLength=0;
b75a7d8f
A
989 uint32_t ch=0,ch2=0;
990 uint8_t *pDest = (uint8_t *)dest;
4388f060 991 uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
73c04bcf 992 int32_t numSubstitutions;
b75a7d8f
A
993
994 /* args check */
995 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
996 return NULL;
997 }
998
729e4ab9
A
999 if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
1000 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
73c04bcf
A
1001 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1002 ) {
b75a7d8f
A
1003 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1004 return NULL;
1005 }
1006
729e4ab9
A
1007 if(pNumSubstitutions!=NULL) {
1008 *pNumSubstitutions=0;
1009 }
73c04bcf
A
1010 numSubstitutions=0;
1011
b75a7d8f 1012 if(srcLength==-1) {
73c04bcf 1013 while((ch=*pSrc)!=0) {
b75a7d8f
A
1014 ++pSrc;
1015 if(ch <= 0x7f) {
73c04bcf 1016 if(pDest<pDestLimit) {
729e4ab9 1017 *pDest++ = (uint8_t)ch;
73c04bcf
A
1018 } else {
1019 reqLength = 1;
1020 break;
1021 }
1022 } else if(ch <= 0x7ff) {
1023 if((pDestLimit - pDest) >= 2) {
1024 *pDest++=(uint8_t)((ch>>6)|0xc0);
1025 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1026 } else {
1027 reqLength = 2;
1028 break;
1029 }
1030 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1031 if((pDestLimit - pDest) >= 3) {
1032 *pDest++=(uint8_t)((ch>>12)|0xe0);
1033 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1034 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1035 } else {
1036 reqLength = 3;
1037 break;
1038 }
1039 } else /* ch is a surrogate */ {
1040 int32_t length;
b75a7d8f 1041
4388f060
A
1042 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
1043 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
b75a7d8f 1044 ++pSrc;
4388f060 1045 ch=U16_GET_SUPPLEMENTARY(ch, ch2);
73c04bcf
A
1046 } else if(subchar>=0) {
1047 ch=subchar;
1048 ++numSubstitutions;
b75a7d8f
A
1049 } else {
1050 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1051 *pErrorCode = U_INVALID_CHAR_FOUND;
1052 return NULL;
1053 }
73c04bcf
A
1054
1055 length = U8_LENGTH(ch);
1056 if((pDestLimit - pDest) >= length) {
1057 /* convert and append*/
1058 pDest=_appendUTF8(pDest, ch);
1059 } else {
1060 reqLength = length;
1061 break;
1062 }
b75a7d8f 1063 }
b75a7d8f
A
1064 }
1065 while((ch=*pSrc++)!=0) {
1066 if(ch<=0x7f) {
1067 ++reqLength;
1068 } else if(ch<=0x7ff) {
1069 reqLength+=2;
4388f060 1070 } else if(!U16_IS_SURROGATE(ch)) {
b75a7d8f 1071 reqLength+=3;
4388f060 1072 } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
b75a7d8f
A
1073 ++pSrc;
1074 reqLength+=4;
73c04bcf
A
1075 } else if(subchar>=0) {
1076 reqLength+=U8_LENGTH(subchar);
1077 ++numSubstitutions;
b75a7d8f
A
1078 } else {
1079 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1080 *pErrorCode = U_INVALID_CHAR_FOUND;
1081 return NULL;
1082 }
1083 }
1084 } else {
4388f060 1085 const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
73c04bcf
A
1086 int32_t count;
1087
1088 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1089 for(;;) {
1090 /*
1091 * Each iteration of the inner loop progresses by at most 3 UTF-8
1092 * bytes and one UChar, for most characters.
1093 * For supplementary code points (4 & 2), which are rare,
1094 * there is an additional adjustment.
1095 */
1096 count = (int32_t)((pDestLimit - pDest) / 3);
1097 srcLength = (int32_t)(pSrcLimit - pSrc);
1098 if(count > srcLength) {
1099 count = srcLength; /* min(remaining dest/3, remaining src) */
1100 }
1101 if(count < 3) {
1102 /*
1103 * Too much overhead if we get near the end of the string,
1104 * continue with the next loop.
1105 */
1106 break;
1107 }
1108 do {
1109 ch=*pSrc++;
1110 if(ch <= 0x7f) {
729e4ab9 1111 *pDest++ = (uint8_t)ch;
73c04bcf
A
1112 } else if(ch <= 0x7ff) {
1113 *pDest++=(uint8_t)((ch>>6)|0xc0);
1114 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1115 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1116 *pDest++=(uint8_t)((ch>>12)|0xe0);
1117 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1118 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1119 } else /* ch is a surrogate */ {
1120 /*
1121 * We will read two UChars and probably output four bytes,
1122 * which we didn't account for with computing count,
1123 * so we adjust it here.
1124 */
1125 if(--count == 0) {
1126 --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
1127 break; /* recompute count */
1128 }
1129
4388f060 1130 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
73c04bcf 1131 ++pSrc;
4388f060 1132 ch=U16_GET_SUPPLEMENTARY(ch, ch2);
73c04bcf
A
1133
1134 /* writing 4 bytes per 2 UChars is ok */
1135 *pDest++=(uint8_t)((ch>>18)|0xf0);
1136 *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
1137 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1138 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1139 } else {
1140 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1141 if(subchar>=0) {
1142 ch=subchar;
1143 ++numSubstitutions;
1144 } else {
1145 *pErrorCode = U_INVALID_CHAR_FOUND;
1146 return NULL;
1147 }
1148
1149 /* convert and append*/
1150 pDest=_appendUTF8(pDest, ch);
1151 }
1152 }
1153 } while(--count > 0);
1154 }
1155
1156 while(pSrc<pSrcLimit) {
b75a7d8f
A
1157 ch=*pSrc++;
1158 if(ch <= 0x7f) {
73c04bcf 1159 if(pDest<pDestLimit) {
729e4ab9 1160 *pDest++ = (uint8_t)ch;
73c04bcf
A
1161 } else {
1162 reqLength = 1;
1163 break;
1164 }
1165 } else if(ch <= 0x7ff) {
1166 if((pDestLimit - pDest) >= 2) {
1167 *pDest++=(uint8_t)((ch>>6)|0xc0);
1168 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1169 } else {
1170 reqLength = 2;
1171 break;
1172 }
1173 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1174 if((pDestLimit - pDest) >= 3) {
1175 *pDest++=(uint8_t)((ch>>12)|0xe0);
1176 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1177 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1178 } else {
1179 reqLength = 3;
1180 break;
1181 }
1182 } else /* ch is a surrogate */ {
1183 int32_t length;
b75a7d8f 1184
4388f060 1185 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
b75a7d8f 1186 ++pSrc;
4388f060 1187 ch=U16_GET_SUPPLEMENTARY(ch, ch2);
73c04bcf
A
1188 } else if(subchar>=0) {
1189 ch=subchar;
1190 ++numSubstitutions;
b75a7d8f
A
1191 } else {
1192 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1193 *pErrorCode = U_INVALID_CHAR_FOUND;
1194 return NULL;
1195 }
73c04bcf
A
1196
1197 length = U8_LENGTH(ch);
1198 if((pDestLimit - pDest) >= length) {
1199 /* convert and append*/
1200 pDest=_appendUTF8(pDest, ch);
1201 } else {
1202 reqLength = length;
1203 break;
1204 }
b75a7d8f 1205 }
b75a7d8f
A
1206 }
1207 while(pSrc<pSrcLimit) {
1208 ch=*pSrc++;
1209 if(ch<=0x7f) {
1210 ++reqLength;
1211 } else if(ch<=0x7ff) {
1212 reqLength+=2;
4388f060 1213 } else if(!U16_IS_SURROGATE(ch)) {
b75a7d8f 1214 reqLength+=3;
4388f060 1215 } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
b75a7d8f
A
1216 ++pSrc;
1217 reqLength+=4;
73c04bcf
A
1218 } else if(subchar>=0) {
1219 reqLength+=U8_LENGTH(subchar);
1220 ++numSubstitutions;
b75a7d8f
A
1221 } else {
1222 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1223 *pErrorCode = U_INVALID_CHAR_FOUND;
1224 return NULL;
1225 }
1226 }
1227 }
1228
73c04bcf
A
1229 reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1230
1231 if(pNumSubstitutions!=NULL) {
1232 *pNumSubstitutions=numSubstitutions;
1233 }
1234
b75a7d8f
A
1235 if(pDestLength){
1236 *pDestLength = reqLength;
1237 }
1238
1239 /* Terminate the buffer */
729e4ab9
A
1240 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1241 return dest;
b75a7d8f 1242}
73c04bcf
A
1243
1244U_CAPI char* U_EXPORT2
1245u_strToUTF8(char *dest,
1246 int32_t destCapacity,
1247 int32_t *pDestLength,
1248 const UChar *pSrc,
1249 int32_t srcLength,
1250 UErrorCode *pErrorCode){
1251 return u_strToUTF8WithSub(
1252 dest, destCapacity, pDestLength,
1253 pSrc, srcLength,
1254 U_SENTINEL, NULL,
1255 pErrorCode);
1256}
729e4ab9
A
1257
1258U_CAPI UChar* U_EXPORT2
1259u_strFromJavaModifiedUTF8WithSub(
1260 UChar *dest,
1261 int32_t destCapacity,
1262 int32_t *pDestLength,
1263 const char *src,
1264 int32_t srcLength,
1265 UChar32 subchar, int32_t *pNumSubstitutions,
1266 UErrorCode *pErrorCode) {
1267 UChar *pDest = dest;
1268 UChar *pDestLimit = dest+destCapacity;
1269 UChar32 ch;
1270 int32_t reqLength = 0;
1271 const uint8_t* pSrc = (const uint8_t*) src;
1272 const uint8_t *pSrcLimit;
1273 int32_t count;
1274 uint8_t t1, t2; /* trail bytes */
1275 int32_t numSubstitutions;
1276
1277 /* args check */
1278 if(U_FAILURE(*pErrorCode)){
1279 return NULL;
1280 }
1281 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1282 (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1283 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1284 ) {
1285 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1286 return NULL;
1287 }
1288
1289 if(pNumSubstitutions!=NULL) {
1290 *pNumSubstitutions=0;
1291 }
1292 numSubstitutions=0;
1293
1294 if(srcLength < 0) {
1295 /*
1296 * Transform a NUL-terminated ASCII string.
1297 * Handle non-ASCII strings with slower code.
1298 */
1299 while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
1300 *pDest++=(UChar)ch;
1301 ++pSrc;
1302 }
1303 if(ch == 0) {
1304 reqLength=(int32_t)(pDest - dest);
1305 if(pDestLength) {
1306 *pDestLength = reqLength;
1307 }
1308
1309 /* Terminate the buffer */
1310 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1311 return dest;
1312 }
1313 srcLength = uprv_strlen((const char *)pSrc);
1314 }
1315
1316 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
51004dcb 1317 pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
729e4ab9
A
1318 for(;;) {
1319 count = (int32_t)(pDestLimit - pDest);
1320 srcLength = (int32_t)(pSrcLimit - pSrc);
1321 if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
1322 /* fast ASCII loop */
1323 const uint8_t *prevSrc = pSrc;
1324 int32_t delta;
1325 while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
1326 *pDest++=(UChar)ch;
1327 ++pSrc;
1328 }
1329 delta = (int32_t)(pSrc - prevSrc);
1330 count -= delta;
1331 srcLength -= delta;
1332 }
1333 /*
1334 * Each iteration of the inner loop progresses by at most 3 UTF-8
1335 * bytes and one UChar.
1336 */
1337 srcLength /= 3;
1338 if(count > srcLength) {
1339 count = srcLength; /* min(remaining dest, remaining src/3) */
1340 }
1341 if(count < 3) {
1342 /*
1343 * Too much overhead if we get near the end of the string,
1344 * continue with the next loop.
1345 */
1346 break;
1347 }
1348 do {
1349 ch = *pSrc;
1350 if(ch <= 0x7f){
1351 *pDest++=(UChar)ch;
1352 ++pSrc;
1353 } else {
1354 if(ch >= 0xe0) {
1355 if( /* handle U+0000..U+FFFF inline */
1356 ch <= 0xef &&
1357 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1358 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1359 ) {
1360 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1361 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1362 pSrc += 3;
1363 continue;
1364 }
1365 } else {
1366 if( /* handle U+0000..U+07FF inline */
1367 ch >= 0xc0 &&
1368 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1369 ) {
1370 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1371 pSrc += 2;
1372 continue;
1373 }
1374 }
1375
1376 if(subchar < 0) {
1377 *pErrorCode = U_INVALID_CHAR_FOUND;
1378 return NULL;
1379 } else if(subchar > 0xffff && --count == 0) {
1380 /*
1381 * We need to write two UChars, adjusted count for that,
1382 * and ran out of space.
1383 */
1384 break;
1385 } else {
1386 /* function call for error cases */
1387 ++pSrc; /* continue after the lead byte */
1388 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1389 ++numSubstitutions;
1390 if(subchar<=0xFFFF) {
1391 *(pDest++)=(UChar)subchar;
1392 } else {
1393 *(pDest++)=U16_LEAD(subchar);
1394 *(pDest++)=U16_TRAIL(subchar);
1395 }
1396 }
1397 }
1398 } while(--count > 0);
1399 }
1400
1401 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
1402 ch = *pSrc;
1403 if(ch <= 0x7f){
1404 *pDest++=(UChar)ch;
1405 ++pSrc;
1406 } else {
1407 if(ch >= 0xe0) {
1408 if( /* handle U+0000..U+FFFF inline */
1409 ch <= 0xef &&
1410 ((pSrcLimit - pSrc) >= 3) &&
1411 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1412 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1413 ) {
1414 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1415 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1416 pSrc += 3;
1417 continue;
1418 }
1419 } else {
1420 if( /* handle U+0000..U+07FF inline */
1421 ch >= 0xc0 &&
1422 ((pSrcLimit - pSrc) >= 2) &&
1423 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1424 ) {
1425 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1426 pSrc += 2;
1427 continue;
1428 }
1429 }
1430
1431 if(subchar < 0) {
1432 *pErrorCode = U_INVALID_CHAR_FOUND;
1433 return NULL;
1434 } else {
1435 /* function call for error cases */
1436 ++pSrc; /* continue after the lead byte */
1437 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1438 ++numSubstitutions;
1439 if(subchar<=0xFFFF) {
1440 *(pDest++)=(UChar)subchar;
1441 } else {
1442 *(pDest++)=U16_LEAD(subchar);
1443 if(pDest<pDestLimit) {
1444 *(pDest++)=U16_TRAIL(subchar);
1445 } else {
1446 reqLength++;
1447 break;
1448 }
1449 }
1450 }
1451 }
1452 }
1453
1454 /* do not fill the dest buffer just count the UChars needed */
1455 while(pSrc < pSrcLimit){
1456 ch = *pSrc;
1457 if(ch <= 0x7f) {
1458 reqLength++;
1459 ++pSrc;
1460 } else {
1461 if(ch >= 0xe0) {
1462 if( /* handle U+0000..U+FFFF inline */
1463 ch <= 0xef &&
1464 ((pSrcLimit - pSrc) >= 3) &&
1465 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
1466 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
1467 ) {
1468 reqLength++;
1469 pSrc += 3;
1470 continue;
1471 }
1472 } else {
1473 if( /* handle U+0000..U+07FF inline */
1474 ch >= 0xc0 &&
1475 ((pSrcLimit - pSrc) >= 2) &&
1476 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
1477 ) {
1478 reqLength++;
1479 pSrc += 2;
1480 continue;
1481 }
1482 }
1483
1484 if(subchar < 0) {
1485 *pErrorCode = U_INVALID_CHAR_FOUND;
1486 return NULL;
1487 } else {
1488 /* function call for error cases */
1489 ++pSrc; /* continue after the lead byte */
1490 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1491 ++numSubstitutions;
1492 reqLength+=U16_LENGTH(ch);
1493 }
1494 }
1495 }
1496
1497 if(pNumSubstitutions!=NULL) {
1498 *pNumSubstitutions=numSubstitutions;
1499 }
1500
1501 reqLength+=(int32_t)(pDest - dest);
1502 if(pDestLength) {
1503 *pDestLength = reqLength;
1504 }
1505
1506 /* Terminate the buffer */
1507 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1508 return dest;
1509}
1510
1511U_CAPI char* U_EXPORT2
1512u_strToJavaModifiedUTF8(
1513 char *dest,
1514 int32_t destCapacity,
1515 int32_t *pDestLength,
1516 const UChar *src,
1517 int32_t srcLength,
1518 UErrorCode *pErrorCode) {
1519 int32_t reqLength=0;
1520 uint32_t ch=0;
1521 uint8_t *pDest = (uint8_t *)dest;
1522 uint8_t *pDestLimit = pDest + destCapacity;
1523 const UChar *pSrcLimit;
1524 int32_t count;
1525
1526 /* args check */
1527 if(U_FAILURE(*pErrorCode)){
1528 return NULL;
1529 }
1530 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1531 (dest==NULL && destCapacity!=0) || destCapacity<0
1532 ) {
1533 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1534 return NULL;
1535 }
1536
1537 if(srcLength==-1) {
1538 /* Convert NUL-terminated ASCII, then find the string length. */
1539 while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1540 *pDest++ = (uint8_t)ch;
1541 ++src;
1542 }
1543 if(ch == 0) {
1544 reqLength=(int32_t)(pDest - (uint8_t *)dest);
1545 if(pDestLength) {
1546 *pDestLength = reqLength;
1547 }
1548
1549 /* Terminate the buffer */
1550 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1551 return dest;
1552 }
1553 srcLength = u_strlen(src);
1554 }
1555
1556 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
4388f060 1557 pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
729e4ab9
A
1558 for(;;) {
1559 count = (int32_t)(pDestLimit - pDest);
1560 srcLength = (int32_t)(pSrcLimit - src);
1561 if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1562 /* fast ASCII loop */
1563 const UChar *prevSrc = src;
1564 int32_t delta;
1565 while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1566 *pDest++=(uint8_t)ch;
1567 ++src;
1568 }
1569 delta = (int32_t)(src - prevSrc);
1570 count -= delta;
1571 srcLength -= delta;
1572 }
1573 /*
1574 * Each iteration of the inner loop progresses by at most 3 UTF-8
1575 * bytes and one UChar.
1576 */
1577 count /= 3;
1578 if(count > srcLength) {
1579 count = srcLength; /* min(remaining dest/3, remaining src) */
1580 }
1581 if(count < 3) {
1582 /*
1583 * Too much overhead if we get near the end of the string,
1584 * continue with the next loop.
1585 */
1586 break;
1587 }
1588 do {
1589 ch=*src++;
1590 if(ch <= 0x7f && ch != 0) {
1591 *pDest++ = (uint8_t)ch;
1592 } else if(ch <= 0x7ff) {
1593 *pDest++=(uint8_t)((ch>>6)|0xc0);
1594 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1595 } else {
1596 *pDest++=(uint8_t)((ch>>12)|0xe0);
1597 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1598 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1599 }
1600 } while(--count > 0);
1601 }
1602
1603 while(src<pSrcLimit) {
1604 ch=*src++;
1605 if(ch <= 0x7f && ch != 0) {
1606 if(pDest<pDestLimit) {
1607 *pDest++ = (uint8_t)ch;
1608 } else {
1609 reqLength = 1;
1610 break;
1611 }
1612 } else if(ch <= 0x7ff) {
1613 if((pDestLimit - pDest) >= 2) {
1614 *pDest++=(uint8_t)((ch>>6)|0xc0);
1615 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1616 } else {
1617 reqLength = 2;
1618 break;
1619 }
1620 } else {
1621 if((pDestLimit - pDest) >= 3) {
1622 *pDest++=(uint8_t)((ch>>12)|0xe0);
1623 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1624 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1625 } else {
1626 reqLength = 3;
1627 break;
1628 }
1629 }
1630 }
1631 while(src<pSrcLimit) {
1632 ch=*src++;
1633 if(ch <= 0x7f && ch != 0) {
1634 ++reqLength;
1635 } else if(ch<=0x7ff) {
1636 reqLength+=2;
1637 } else {
1638 reqLength+=3;
1639 }
1640 }
1641
1642 reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1643 if(pDestLength){
1644 *pDestLength = reqLength;
1645 }
1646
1647 /* Terminate the buffer */
1648 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1649 return dest;
1650}