]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnv_u8.cpp
ICU-64252.0.1.tar.gz
[apple/icu.git] / icuSources / common / ucnv_u8.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f
A
3/*
4**********************************************************************
2ca993e8 5* Copyright (C) 2002-2016, International Business Machines
b75a7d8f
A
6* Corporation and others. All Rights Reserved.
7**********************************************************************
8* file name: ucnv_u8.cpp
f3c0d7a5 9* encoding: UTF-8
b75a7d8f
A
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2002jul01
14* created by: Markus W. Scherer
15*
16* UTF-8 converter implementation. Used to be in ucnv_utf.c.
17*
18* Also, CESU-8 implementation, see UTR 26.
19* The CESU-8 converter uses all the same functions as the
20* UTF-8 converter, with a branch for converting supplementary code points.
21*/
22
23#include "unicode/utypes.h"
374ca955
A
24
25#if !UCONFIG_NO_CONVERSION
26
b75a7d8f 27#include "unicode/ucnv.h"
4388f060
A
28#include "unicode/utf.h"
29#include "unicode/utf8.h"
30#include "unicode/utf16.h"
0f5d89e8 31#include "uassert.h"
b75a7d8f
A
32#include "ucnv_bld.h"
33#include "ucnv_cnv.h"
34#include "cmemory.h"
0f5d89e8 35#include "ustr_imp.h"
b75a7d8f
A
36
37/* Prototypes --------------------------------------------------------------- */
38
39/* Keep these here to make finicky compilers happy */
40
374ca955 41U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
b75a7d8f 42 UErrorCode *err);
374ca955 43U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
b75a7d8f 44 UErrorCode *err);
b75a7d8f
A
45
46
47/* UTF-8 -------------------------------------------------------------------- */
48
/* Largest code point that fits in a single 16-bit code unit. */
#define MAXIMUM_UCS2 0xFFFF
b75a7d8f 50
0f5d89e8
A
/*
 * Indexed by the total byte length of a UTF-8 sequence (1..4; slot 0 unused).
 * The decoders accumulate a sequence as ch = (ch << 6) + nextByte without
 * masking, so the lead-byte prefix and every 0x80 trail marker end up in the
 * sum. Subtracting the entry for the sequence length removes them again.
 */
static const uint32_t offsetsFromUTF8[5] = {
    0,
    (uint32_t)0,
    (uint32_t)((0xC0u << 6) + 0x80u),
    (uint32_t)((0xE0u << 12) + (0x80u << 6) + 0x80u),
    (uint32_t)((0xF0u << 18) + (0x80u << 12) + (0x80u << 6) + 0x80u)
};
55
b331163b
A
56static UBool hasCESU8Data(const UConverter *cnv)
57{
58#if UCONFIG_ONLY_HTML_CONVERSION
59 return FALSE;
60#else
61 return (UBool)(cnv->sharedData == &_CESU8Data);
62#endif
63}
f3c0d7a5
A
64U_CDECL_BEGIN
65static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
b75a7d8f
A
66 UErrorCode * err)
67{
46f4442e 68 UConverter *cnv = args->converter;
b75a7d8f
A
69 const unsigned char *mySource = (unsigned char *) args->source;
70 UChar *myTarget = args->target;
71 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
72 const UChar *targetLimit = args->targetLimit;
46f4442e 73 unsigned char *toUBytes = cnv->toUBytes;
b331163b 74 UBool isCESU8 = hasCESU8Data(cnv);
b75a7d8f
A
75 uint32_t ch, ch2 = 0;
76 int32_t i, inBytes;
b331163b 77
b75a7d8f 78 /* Restore size of current sequence */
0f5d89e8 79 if (cnv->toULength > 0 && myTarget < targetLimit)
b75a7d8f 80 {
46f4442e
A
81 inBytes = cnv->mode; /* restore # of bytes to consume */
82 i = cnv->toULength; /* restore # of bytes consumed */
83 cnv->toULength = 0;
b75a7d8f 84
46f4442e
A
85 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
86 cnv->toUnicodeStatus = 0;
b75a7d8f
A
87 goto morebytes;
88 }
89
90
91 while (mySource < sourceLimit && myTarget < targetLimit)
92 {
93 ch = *(mySource++);
0f5d89e8 94 if (U8_IS_SINGLE(ch)) /* Simple case */
b75a7d8f
A
95 {
96 *(myTarget++) = (UChar) ch;
97 }
98 else
99 {
100 /* store the first char */
101 toUBytes[0] = (char)ch;
0f5d89e8 102 inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */
b75a7d8f
A
103 i = 1;
104
105morebytes:
106 while (i < inBytes)
107 {
108 if (mySource < sourceLimit)
109 {
110 toUBytes[i] = (char) (ch2 = *mySource);
3d1f044b 111 if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) &&
0f5d89e8 112 !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
b75a7d8f
A
113 {
114 break; /* i < inBytes */
115 }
116 ch = (ch << 6) + ch2;
117 ++mySource;
118 i++;
119 }
120 else
121 {
374ca955 122 /* stores a partially calculated target*/
46f4442e
A
123 cnv->toUnicodeStatus = ch;
124 cnv->mode = inBytes;
125 cnv->toULength = (int8_t) i;
b75a7d8f
A
126 goto donefornow;
127 }
128 }
129
0f5d89e8
A
130 // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
131 if (i == inBytes && (!isCESU8 || i <= 3))
b75a7d8f 132 {
0f5d89e8
A
133 /* Remove the accumulated high bits */
134 ch -= offsetsFromUTF8[inBytes];
135
b75a7d8f
A
136 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
137 if (ch <= MAXIMUM_UCS2)
138 {
139 /* fits in 16 bits */
140 *(myTarget++) = (UChar) ch;
141 }
142 else
143 {
144 /* write out the surrogates */
0f5d89e8
A
145 *(myTarget++) = U16_LEAD(ch);
146 ch = U16_TRAIL(ch);
b75a7d8f
A
147 if (myTarget < targetLimit)
148 {
149 *(myTarget++) = (UChar)ch;
150 }
151 else
152 {
153 /* Put in overflow buffer (not handled here) */
46f4442e
A
154 cnv->UCharErrorBuffer[0] = (UChar) ch;
155 cnv->UCharErrorBufferLength = 1;
b75a7d8f
A
156 *err = U_BUFFER_OVERFLOW_ERROR;
157 break;
158 }
159 }
160 }
161 else
162 {
46f4442e 163 cnv->toULength = (int8_t)i;
374ca955
A
164 *err = U_ILLEGAL_CHAR_FOUND;
165 break;
b75a7d8f
A
166 }
167 }
168 }
169
170donefornow:
171 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
172 {
173 /* End of target buffer */
174 *err = U_BUFFER_OVERFLOW_ERROR;
175 }
176
177 args->target = myTarget;
178 args->source = (const char *) mySource;
179}
180
f3c0d7a5 181static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
b75a7d8f
A
182 UErrorCode * err)
183{
46f4442e 184 UConverter *cnv = args->converter;
b75a7d8f
A
185 const unsigned char *mySource = (unsigned char *) args->source;
186 UChar *myTarget = args->target;
187 int32_t *myOffsets = args->offsets;
188 int32_t offsetNum = 0;
189 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
190 const UChar *targetLimit = args->targetLimit;
46f4442e 191 unsigned char *toUBytes = cnv->toUBytes;
b331163b 192 UBool isCESU8 = hasCESU8Data(cnv);
b75a7d8f
A
193 uint32_t ch, ch2 = 0;
194 int32_t i, inBytes;
195
196 /* Restore size of current sequence */
0f5d89e8 197 if (cnv->toULength > 0 && myTarget < targetLimit)
b75a7d8f 198 {
46f4442e
A
199 inBytes = cnv->mode; /* restore # of bytes to consume */
200 i = cnv->toULength; /* restore # of bytes consumed */
201 cnv->toULength = 0;
b75a7d8f 202
46f4442e
A
203 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
204 cnv->toUnicodeStatus = 0;
b75a7d8f
A
205 goto morebytes;
206 }
207
208 while (mySource < sourceLimit && myTarget < targetLimit)
209 {
210 ch = *(mySource++);
0f5d89e8 211 if (U8_IS_SINGLE(ch)) /* Simple case */
b75a7d8f
A
212 {
213 *(myTarget++) = (UChar) ch;
214 *(myOffsets++) = offsetNum++;
215 }
216 else
217 {
218 toUBytes[0] = (char)ch;
0f5d89e8 219 inBytes = U8_COUNT_BYTES_NON_ASCII(ch);
b75a7d8f
A
220 i = 1;
221
222morebytes:
223 while (i < inBytes)
224 {
225 if (mySource < sourceLimit)
226 {
227 toUBytes[i] = (char) (ch2 = *mySource);
3d1f044b 228 if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) &&
0f5d89e8 229 !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
b75a7d8f
A
230 {
231 break; /* i < inBytes */
232 }
233 ch = (ch << 6) + ch2;
234 ++mySource;
235 i++;
236 }
237 else
238 {
46f4442e
A
239 cnv->toUnicodeStatus = ch;
240 cnv->mode = inBytes;
241 cnv->toULength = (int8_t)i;
b75a7d8f
A
242 goto donefornow;
243 }
244 }
245
0f5d89e8
A
246 // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
247 if (i == inBytes && (!isCESU8 || i <= 3))
b75a7d8f 248 {
0f5d89e8
A
249 /* Remove the accumulated high bits */
250 ch -= offsetsFromUTF8[inBytes];
251
b75a7d8f
A
252 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
253 if (ch <= MAXIMUM_UCS2)
254 {
255 /* fits in 16 bits */
256 *(myTarget++) = (UChar) ch;
257 *(myOffsets++) = offsetNum;
258 }
259 else
260 {
261 /* write out the surrogates */
0f5d89e8 262 *(myTarget++) = U16_LEAD(ch);
b75a7d8f 263 *(myOffsets++) = offsetNum;
0f5d89e8 264 ch = U16_TRAIL(ch);
b75a7d8f
A
265 if (myTarget < targetLimit)
266 {
267 *(myTarget++) = (UChar)ch;
268 *(myOffsets++) = offsetNum;
269 }
270 else
271 {
46f4442e
A
272 cnv->UCharErrorBuffer[0] = (UChar) ch;
273 cnv->UCharErrorBufferLength = 1;
b75a7d8f
A
274 *err = U_BUFFER_OVERFLOW_ERROR;
275 }
276 }
277 offsetNum += i;
278 }
279 else
280 {
46f4442e 281 cnv->toULength = (int8_t)i;
374ca955
A
282 *err = U_ILLEGAL_CHAR_FOUND;
283 break;
b75a7d8f
A
284 }
285 }
286 }
287
288donefornow:
289 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
290 { /* End of target buffer */
291 *err = U_BUFFER_OVERFLOW_ERROR;
292 }
293
294 args->target = myTarget;
295 args->source = (const char *) mySource;
296 args->offsets = myOffsets;
297}
f3c0d7a5 298U_CDECL_END
b75a7d8f 299
f3c0d7a5 300U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
b75a7d8f
A
301 UErrorCode * err)
302{
303 UConverter *cnv = args->converter;
304 const UChar *mySource = args->source;
b75a7d8f 305 const UChar *sourceLimit = args->sourceLimit;
46f4442e
A
306 uint8_t *myTarget = (uint8_t *) args->target;
307 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
308 uint8_t *tempPtr;
73c04bcf 309 UChar32 ch;
46f4442e
A
310 uint8_t tempBuf[4];
311 int32_t indexToWrite;
b331163b 312 UBool isNotCESU8 = !hasCESU8Data(cnv);
b75a7d8f 313
374ca955 314 if (cnv->fromUChar32 && myTarget < targetLimit)
b75a7d8f 315 {
374ca955
A
316 ch = cnv->fromUChar32;
317 cnv->fromUChar32 = 0;
b75a7d8f
A
318 goto lowsurrogate;
319 }
320
321 while (mySource < sourceLimit && myTarget < targetLimit)
322 {
323 ch = *(mySource++);
324
325 if (ch < 0x80) /* Single byte */
326 {
46f4442e 327 *(myTarget++) = (uint8_t) ch;
b75a7d8f
A
328 }
329 else if (ch < 0x800) /* Double byte */
330 {
46f4442e 331 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
b75a7d8f
A
332 if (myTarget < targetLimit)
333 {
46f4442e 334 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
b75a7d8f
A
335 }
336 else
337 {
46f4442e 338 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
b75a7d8f
A
339 cnv->charErrorBufferLength = 1;
340 *err = U_BUFFER_OVERFLOW_ERROR;
341 }
342 }
46f4442e
A
343 else {
344 /* Check for surrogates */
4388f060 345 if(U16_IS_SURROGATE(ch) && isNotCESU8) {
b75a7d8f 346lowsurrogate:
46f4442e
A
347 if (mySource < sourceLimit) {
348 /* test both code units */
4388f060 349 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
46f4442e 350 /* convert and consume this supplementary code point */
4388f060 351 ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
46f4442e
A
352 ++mySource;
353 /* exit this condition tree */
354 }
355 else {
356 /* this is an unpaired trail or lead code unit */
357 /* callback(illegal) */
374ca955 358 cnv->fromUChar32 = ch;
46f4442e 359 *err = U_ILLEGAL_CHAR_FOUND;
b75a7d8f
A
360 break;
361 }
46f4442e
A
362 }
363 else {
364 /* no more input */
374ca955 365 cnv->fromUChar32 = ch;
374ca955 366 break;
b75a7d8f
A
367 }
368 }
369
46f4442e
A
370 /* Do we write the buffer directly for speed,
371 or do we have to be careful about target buffer space? */
372 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
373
374 if (ch <= MAXIMUM_UCS2) {
b75a7d8f 375 indexToWrite = 2;
46f4442e 376 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
b75a7d8f 377 }
46f4442e 378 else {
b75a7d8f 379 indexToWrite = 3;
46f4442e
A
380 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
381 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
b75a7d8f 382 }
46f4442e
A
383 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
384 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
b75a7d8f 385
46f4442e
A
386 if (tempPtr == myTarget) {
387 /* There was enough space to write the codepoint directly. */
388 myTarget += (indexToWrite + 1);
389 }
390 else {
391 /* We might run out of room soon. Write it slowly. */
392 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
393 if (myTarget < targetLimit) {
394 *(myTarget++) = *tempPtr;
395 }
396 else {
397 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
398 *err = U_BUFFER_OVERFLOW_ERROR;
399 }
b75a7d8f
A
400 }
401 }
402 }
403 }
404
405 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
406 {
407 *err = U_BUFFER_OVERFLOW_ERROR;
408 }
b75a7d8f
A
409
410 args->target = (char *) myTarget;
411 args->source = mySource;
412}
413
f3c0d7a5 414U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
b75a7d8f
A
415 UErrorCode * err)
416{
417 UConverter *cnv = args->converter;
418 const UChar *mySource = args->source;
b75a7d8f
A
419 int32_t *myOffsets = args->offsets;
420 const UChar *sourceLimit = args->sourceLimit;
46f4442e
A
421 uint8_t *myTarget = (uint8_t *) args->target;
422 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
423 uint8_t *tempPtr;
73c04bcf 424 UChar32 ch;
b75a7d8f 425 int32_t offsetNum, nextSourceIndex;
46f4442e
A
426 int32_t indexToWrite;
427 uint8_t tempBuf[4];
b331163b 428 UBool isNotCESU8 = !hasCESU8Data(cnv);
b75a7d8f 429
374ca955 430 if (cnv->fromUChar32 && myTarget < targetLimit)
b75a7d8f 431 {
374ca955
A
432 ch = cnv->fromUChar32;
433 cnv->fromUChar32 = 0;
b75a7d8f
A
434 offsetNum = -1;
435 nextSourceIndex = 0;
436 goto lowsurrogate;
437 } else {
438 offsetNum = 0;
439 }
440
441 while (mySource < sourceLimit && myTarget < targetLimit)
442 {
443 ch = *(mySource++);
444
445 if (ch < 0x80) /* Single byte */
446 {
447 *(myOffsets++) = offsetNum++;
448 *(myTarget++) = (char) ch;
449 }
450 else if (ch < 0x800) /* Double byte */
451 {
452 *(myOffsets++) = offsetNum;
46f4442e 453 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
b75a7d8f
A
454 if (myTarget < targetLimit)
455 {
456 *(myOffsets++) = offsetNum++;
46f4442e 457 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
b75a7d8f
A
458 }
459 else
460 {
46f4442e 461 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
b75a7d8f
A
462 cnv->charErrorBufferLength = 1;
463 *err = U_BUFFER_OVERFLOW_ERROR;
464 }
465 }
466 else
467 /* Check for surrogates */
468 {
469 nextSourceIndex = offsetNum + 1;
470
4388f060 471 if(U16_IS_SURROGATE(ch) && isNotCESU8) {
b75a7d8f 472lowsurrogate:
46f4442e
A
473 if (mySource < sourceLimit) {
474 /* test both code units */
4388f060 475 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
46f4442e 476 /* convert and consume this supplementary code point */
4388f060 477 ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
46f4442e
A
478 ++mySource;
479 ++nextSourceIndex;
480 /* exit this condition tree */
481 }
482 else {
483 /* this is an unpaired trail or lead code unit */
484 /* callback(illegal) */
374ca955 485 cnv->fromUChar32 = ch;
46f4442e 486 *err = U_ILLEGAL_CHAR_FOUND;
b75a7d8f
A
487 break;
488 }
46f4442e
A
489 }
490 else {
491 /* no more input */
374ca955 492 cnv->fromUChar32 = ch;
374ca955 493 break;
b75a7d8f
A
494 }
495 }
496
46f4442e
A
497 /* Do we write the buffer directly for speed,
498 or do we have to be careful about target buffer space? */
499 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
500
501 if (ch <= MAXIMUM_UCS2) {
b75a7d8f 502 indexToWrite = 2;
46f4442e 503 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
b75a7d8f 504 }
46f4442e 505 else {
b75a7d8f 506 indexToWrite = 3;
46f4442e
A
507 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
508 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
b75a7d8f 509 }
46f4442e
A
510 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
511 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
512
513 if (tempPtr == myTarget) {
514 /* There was enough space to write the codepoint directly. */
515 myTarget += (indexToWrite + 1);
516 myOffsets[0] = offsetNum;
517 myOffsets[1] = offsetNum;
518 myOffsets[2] = offsetNum;
519 if (indexToWrite >= 3) {
520 myOffsets[3] = offsetNum;
b75a7d8f 521 }
46f4442e
A
522 myOffsets += (indexToWrite + 1);
523 }
524 else {
525 /* We might run out of room soon. Write it slowly. */
526 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
527 if (myTarget < targetLimit)
528 {
529 *(myOffsets++) = offsetNum;
530 *(myTarget++) = *tempPtr;
531 }
532 else
533 {
534 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
535 *err = U_BUFFER_OVERFLOW_ERROR;
536 }
b75a7d8f
A
537 }
538 }
539 offsetNum = nextSourceIndex;
540 }
541 }
542
543 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
544 {
545 *err = U_BUFFER_OVERFLOW_ERROR;
546 }
b75a7d8f
A
547
548 args->target = (char *) myTarget;
549 args->source = mySource;
550 args->offsets = myOffsets;
551}
552
f3c0d7a5
A
553U_CDECL_BEGIN
554static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
b75a7d8f 555 UErrorCode *err) {
374ca955
A
556 UConverter *cnv;
557 const uint8_t *sourceInitial;
b75a7d8f 558 const uint8_t *source;
b75a7d8f
A
559 uint8_t myByte;
560 UChar32 ch;
0f5d89e8 561 int8_t i;
b75a7d8f 562
374ca955
A
563 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
564
565 cnv = args->converter;
566 sourceInitial = source = (const uint8_t *)args->source;
567 if (source >= (const uint8_t *)args->sourceLimit)
b75a7d8f 568 {
374ca955
A
569 /* no input */
570 *err = U_INDEX_OUTOFBOUNDS_ERROR;
571 return 0xffff;
572 }
b75a7d8f 573
374ca955 574 myByte = (uint8_t)*(source++);
0f5d89e8 575 if (U8_IS_SINGLE(myByte))
374ca955
A
576 {
577 args->source = (const char *)source;
578 return (UChar32)myByte;
579 }
b75a7d8f 580
0f5d89e8
A
581 uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte);
582 if (countTrailBytes == 0) {
374ca955
A
583 cnv->toUBytes[0] = myByte;
584 cnv->toULength = 1;
585 *err = U_ILLEGAL_CHAR_FOUND;
b75a7d8f 586 args->source = (const char *)source;
374ca955
A
587 return 0xffff;
588 }
b75a7d8f 589
374ca955 590 /*The byte sequence is longer than the buffer area passed*/
0f5d89e8 591 if (((const char *)source + countTrailBytes) > args->sourceLimit)
374ca955
A
592 {
593 /* check if all of the remaining bytes are trail bytes */
0f5d89e8 594 uint16_t extraBytesToWrite = countTrailBytes + 1;
374ca955
A
595 cnv->toUBytes[0] = myByte;
596 i = 1;
597 *err = U_TRUNCATED_CHAR_FOUND;
598 while(source < (const uint8_t *)args->sourceLimit) {
0f5d89e8
A
599 uint8_t b = *source;
600 if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) {
601 cnv->toUBytes[i++] = b;
374ca955 602 ++source;
b75a7d8f 603 } else {
374ca955
A
604 /* error even before we run out of input */
605 *err = U_ILLEGAL_CHAR_FOUND;
606 break;
b75a7d8f
A
607 }
608 }
374ca955
A
609 cnv->toULength = i;
610 args->source = (const char *)source;
611 return 0xffff;
612 }
b75a7d8f 613
374ca955 614 ch = myByte << 6;
0f5d89e8
A
615 if(countTrailBytes == 2) {
616 uint8_t t1 = *source, t2;
617 if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) {
618 args->source = (const char *)(source + 1);
619 return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3];
b75a7d8f 620 }
0f5d89e8
A
621 } else if(countTrailBytes == 1) {
622 uint8_t t1 = *source;
623 if(U8_IS_TRAIL(t1)) {
624 args->source = (const char *)(source + 1);
625 return (ch + t1) - offsetsFromUTF8[2];
374ca955 626 }
0f5d89e8
A
627 } else { // countTrailBytes == 3
628 uint8_t t1 = *source, t2, t3;
629 if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) &&
630 U8_IS_TRAIL(t3 = *++source)) {
631 args->source = (const char *)(source + 1);
632 return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4];
374ca955 633 }
b75a7d8f 634 }
0f5d89e8 635 args->source = (const char *)source;
b75a7d8f 636
374ca955
A
637 for(i = 0; sourceInitial < source; ++i) {
638 cnv->toUBytes[i] = *sourceInitial++;
639 }
640 cnv->toULength = i;
641 *err = U_ILLEGAL_CHAR_FOUND;
b75a7d8f
A
642 return 0xffff;
643}
f3c0d7a5 644U_CDECL_END
b75a7d8f 645
46f4442e
A
646/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
647
f3c0d7a5 648U_CDECL_BEGIN
46f4442e 649/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
f3c0d7a5 650static void U_CALLCONV
46f4442e
A
651ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
652 UConverterToUnicodeArgs *pToUArgs,
653 UErrorCode *pErrorCode) {
4388f060 654 UConverter *utf8;
46f4442e
A
655 const uint8_t *source, *sourceLimit;
656 uint8_t *target;
657 int32_t targetCapacity;
658 int32_t count;
659
660 int8_t oldToULength, toULength, toULimit;
661
662 UChar32 c;
663 uint8_t b, t1, t2;
664
665 /* set up the local pointers */
666 utf8=pToUArgs->converter;
46f4442e
A
667 source=(uint8_t *)pToUArgs->source;
668 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
669 target=(uint8_t *)pFromUArgs->target;
670 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
671
672 /* get the converter state from the UTF-8 UConverter */
0f5d89e8 673 if(utf8->toULength > 0) {
46f4442e
A
674 toULength=oldToULength=utf8->toULength;
675 toULimit=(int8_t)utf8->mode;
0f5d89e8 676 c=(UChar32)utf8->toUnicodeStatus;
46f4442e
A
677 } else {
678 toULength=oldToULength=toULimit=0;
0f5d89e8 679 c = 0;
46f4442e
A
680 }
681
682 count=(int32_t)(sourceLimit-source)+oldToULength;
683 if(count<toULimit) {
684 /*
685 * Not enough input to complete the partial character.
686 * Jump to moreBytes below - it will not output to target.
687 */
688 } else if(targetCapacity<toULimit) {
689 /*
690 * Not enough target capacity to output the partial character.
691 * Let the standard converter handle this.
692 */
693 *pErrorCode=U_USING_DEFAULT_WARNING;
694 return;
695 } else {
0f5d89e8
A
696 // Use a single counter for source and target, counting the minimum of
697 // the source length and the target capacity.
698 // Let the standard converter handle edge cases.
46f4442e
A
699 if(count>targetCapacity) {
700 count=targetCapacity;
701 }
702
0f5d89e8
A
703 // The conversion loop checks count>0 only once per character.
704 // If the buffer ends with a truncated sequence,
705 // then we reduce the count to stop before that,
706 // and collect the remaining bytes after the conversion loop.
707
708 // Do not go back into the bytes that will be read for finishing a partial
709 // sequence from the previous buffer.
710 int32_t length=count-toULimit;
711 U8_TRUNCATE_IF_INCOMPLETE(source, 0, length);
712 count=toULimit+length;
46f4442e
A
713 }
714
715 if(c!=0) {
716 utf8->toUnicodeStatus=0;
717 utf8->toULength=0;
718 goto moreBytes;
719 /* See note in ucnv_SBCSFromUTF8() about this goto. */
720 }
721
722 /* conversion loop */
723 while(count>0) {
724 b=*source++;
0f5d89e8 725 if(U8_IS_SINGLE(b)) {
46f4442e
A
726 /* convert ASCII */
727 *target++=b;
728 --count;
729 continue;
730 } else {
0f5d89e8
A
731 if(b>=0xe0) {
732 if( /* handle U+0800..U+FFFF inline */
733 b<0xf0 &&
734 U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
735 U8_IS_TRAIL(t2=source[1])
46f4442e
A
736 ) {
737 source+=2;
738 *target++=b;
739 *target++=t1;
740 *target++=t2;
741 count-=3;
742 continue;
743 }
0f5d89e8 744 } else {
46f4442e
A
745 if( /* handle U+0080..U+07FF inline */
746 b>=0xc2 &&
0f5d89e8 747 U8_IS_TRAIL(t1=*source)
46f4442e
A
748 ) {
749 ++source;
750 *target++=b;
751 *target++=t1;
752 count-=2;
753 continue;
754 }
46f4442e
A
755 }
756
757 /* handle "complicated" and error cases, and continuing partial characters */
758 oldToULength=0;
759 toULength=1;
0f5d89e8 760 toULimit=U8_COUNT_BYTES_NON_ASCII(b);
46f4442e
A
761 c=b;
762moreBytes:
763 while(toULength<toULimit) {
764 if(source<sourceLimit) {
765 b=*source;
0f5d89e8 766 if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
46f4442e
A
767 ++source;
768 ++toULength;
769 c=(c<<6)+b;
770 } else {
771 break; /* sequence too short, stop with toULength<toULimit */
772 }
773 } else {
774 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
775 source-=(toULength-oldToULength);
776 while(oldToULength<toULength) {
777 utf8->toUBytes[oldToULength++]=*source++;
778 }
779 utf8->toUnicodeStatus=c;
780 utf8->toULength=toULength;
781 utf8->mode=toULimit;
782 pToUArgs->source=(char *)source;
783 pFromUArgs->target=(char *)target;
784 return;
785 }
786 }
787
0f5d89e8 788 if(toULength!=toULimit) {
46f4442e
A
789 /* error handling: illegal UTF-8 byte sequence */
790 source-=(toULength-oldToULength);
791 while(oldToULength<toULength) {
792 utf8->toUBytes[oldToULength++]=*source++;
793 }
794 utf8->toULength=toULength;
795 pToUArgs->source=(char *)source;
796 pFromUArgs->target=(char *)target;
797 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
798 return;
799 }
800
801 /* copy the legal byte sequence to the target */
802 {
803 int8_t i;
804
805 for(i=0; i<oldToULength; ++i) {
806 *target++=utf8->toUBytes[i];
807 }
808 source-=(toULength-oldToULength);
809 for(; i<toULength; ++i) {
810 *target++=*source++;
811 }
812 count-=toULength;
813 }
814 }
815 }
0f5d89e8 816 U_ASSERT(count>=0);
46f4442e
A
817
818 if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
819 if(target==(const uint8_t *)pFromUArgs->targetLimit) {
820 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
821 } else {
822 b=*source;
0f5d89e8 823 toULimit=U8_COUNT_BYTES(b);
46f4442e
A
824 if(toULimit>(sourceLimit-source)) {
825 /* collect a truncated byte sequence */
826 toULength=0;
827 c=b;
828 for(;;) {
829 utf8->toUBytes[toULength++]=b;
830 if(++source==sourceLimit) {
831 /* partial byte sequence at end of source */
832 utf8->toUnicodeStatus=c;
833 utf8->toULength=toULength;
834 utf8->mode=toULimit;
835 break;
0f5d89e8 836 } else if(!icu::UTF8::isValidTrail(c, b=*source, toULength, toULimit)) {
46f4442e
A
837 utf8->toULength=toULength;
838 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
839 break;
840 }
841 c=(c<<6)+b;
842 }
843 } else {
844 /* partial-sequence target overflow: fall back to the pivoting implementation */
845 *pErrorCode=U_USING_DEFAULT_WARNING;
846 }
847 }
848 }
849
850 /* write back the updated pointers */
851 pToUArgs->source=(char *)source;
852 pFromUArgs->target=(char *)target;
853}
854
f3c0d7a5
A
855U_CDECL_END
856
b75a7d8f
A
857/* UTF-8 converter data ----------------------------------------------------- */
858
859static const UConverterImpl _UTF8Impl={
860 UCNV_UTF8,
861
862 NULL,
863 NULL,
864
865 NULL,
866 NULL,
867 NULL,
868
374ca955
A
869 ucnv_toUnicode_UTF8,
870 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
871 ucnv_fromUnicode_UTF8,
872 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
873 ucnv_getNextUChar_UTF8,
b75a7d8f
A
874
875 NULL,
876 NULL,
877 NULL,
878 NULL,
46f4442e
A
879 ucnv_getNonSurrogateUnicodeSet,
880
881 ucnv_UTF8FromUTF8,
882 ucnv_UTF8FromUTF8
b75a7d8f
A
883};
884
885/* The 1208 CCSID refers to any version of Unicode of UTF-8 */
886static const UConverterStaticData _UTF8StaticData={
887 sizeof(UConverterStaticData),
888 "UTF-8",
374ca955
A
889 1208, UCNV_IBM, UCNV_UTF8,
890 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
b75a7d8f
A
891 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
892 0,
893 0,
894 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
895};
896
897
2ca993e8
A
898const UConverterSharedData _UTF8Data=
899 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
b75a7d8f
A
900
901/* CESU-8 converter data ---------------------------------------------------- */
902
374ca955
A
903static const UConverterImpl _CESU8Impl={
904 UCNV_CESU8,
905
906 NULL,
907 NULL,
908
909 NULL,
910 NULL,
911 NULL,
912
913 ucnv_toUnicode_UTF8,
914 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
915 ucnv_fromUnicode_UTF8,
916 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
917 NULL,
918
919 NULL,
920 NULL,
921 NULL,
922 NULL,
f3c0d7a5
A
923 ucnv_getCompleteUnicodeSet,
924
925 NULL,
926 NULL
374ca955
A
927};
928
b75a7d8f
A
929static const UConverterStaticData _CESU8StaticData={
930 sizeof(UConverterStaticData),
931 "CESU-8",
73c04bcf
A
932 9400, /* CCSID for CESU-8 */
933 UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
b75a7d8f
A
934 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
935 0,
936 0,
937 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
938};
939
940
2ca993e8
A
941const UConverterSharedData _CESU8Data=
942 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
374ca955
A
943
944#endif