/*
 * Retrieved from git.saurik.com, repository apple/icu.git,
 * blob icuSources/common/ucnv_u8.c (ICU-57166.0.1.tar.gz).
 */
/*
**********************************************************************
*   Copyright (C) 2002-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnv_u8.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jul01
*   created by: Markus W. Scherer
*
*   UTF-8 converter implementation. Used to be in ucnv_utf.c.
*
*   Also, CESU-8 implementation, see UTR 26.
*   The CESU-8 converter uses all the same functions as the
*   UTF-8 converter, with a branch for converting supplementary code points.
*/
20
21 #include "unicode/utypes.h"
22
23 #if !UCONFIG_NO_CONVERSION
24
25 #include "unicode/ucnv.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf8.h"
28 #include "unicode/utf16.h"
29 #include "ucnv_bld.h"
30 #include "ucnv_cnv.h"
31 #include "cmemory.h"
32
33 /* Prototypes --------------------------------------------------------------- */
34
35 /* Keep these here to make finicky compilers happy */
36
37 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
38 UErrorCode *err);
39 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
40 UErrorCode *err);
41
42
/* UTF-8 -------------------------------------------------------------------- */

/* UTF-8 Conversion DATA
 *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
 */
/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
#define MAXIMUM_UCS2            0x0000FFFF  /* largest value that fits in one 16-bit unit */
#define MAXIMUM_UTF             0x0010FFFF  /* largest code point accepted by the decoders below */
#define MAXIMUM_UCS4            0x7FFFFFFF
#define HALF_SHIFT              10          /* bits carried by each half of a surrogate pair */
#define HALF_BASE               0x0010000   /* subtracted before splitting into surrogates */
#define HALF_MASK               0x3FF       /* low HALF_SHIFT bits */
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* -SURROGATE_LOW_START + HALF_BASE  (0x10000 - 0xDC00 == 9216) */
#define SURROGATE_LOW_BASE      9216

/*
 * offsetsFromUTF8[n] is subtracted from the raw accumulator of an n-byte
 * sequence.  The decoders in this file accumulate with ch = (ch << 6) + byte
 * without masking, so the marker bits of the lead byte (0xC0/0xE0/...) and of
 * every trail byte (0x80) pile up in the sum; this table holds exactly that
 * pile for each length.  Index 0 is a placeholder used when the lead byte is
 * not a valid lead (bytesFromUTF8[] == 0).
 * Example: C3 A9 -> (0xC3 << 6) + 0xA9 - 0x3080 == 0xE9.
 */
static const uint32_t offsetsFromUTF8[7] = {0,
  (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
  (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
};

/* END OF UTF-8 Conversion DATA */

/*
 * bytesFromUTF8[b] is the total length (lead byte included) of the sequence
 * that byte b starts, or 0 if b cannot start a sequence:
 *   0x00..0x7F -> 1, 0x80..0xBF (trail bytes) -> 0, 0xC0..0xDF -> 2,
 *   0xE0..0xEF -> 3, 0xF0..0xF7 -> 4, 0xF8..0xFB -> 5, 0xFC..0xFD -> 6,
 *   0xFE..0xFF -> 0.
 * Lengths 5 and 6 are kept so the decoders can consume such sequences as a
 * unit before rejecting them (see utf8_minChar32).
 */
static const int8_t bytesFromUTF8[256] = {
    /* 0x00..0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20..0x3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40..0x5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60..0x7F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0x9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0..0xBF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xFF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};

/*
 * Starting with Unicode 3.0.1:
 * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
 * byte sequences with more than 4 bytes are illegal in UTF-8,
 * which is tested with impossible values for them
 * (0xffffffff can never be <= the decoded value once that value has also
 * passed the "<= MAXIMUM_UTF" test, so lengths 5 and 6 always fail).
 */
static const uint32_t
utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };

90 static UBool hasCESU8Data(const UConverter *cnv)
91 {
92 #if UCONFIG_ONLY_HTML_CONVERSION
93 return FALSE;
94 #else
95 return (UBool)(cnv->sharedData == &_CESU8Data);
96 #endif
97 }
98
99 static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
100 UErrorCode * err)
101 {
102 UConverter *cnv = args->converter;
103 const unsigned char *mySource = (unsigned char *) args->source;
104 UChar *myTarget = args->target;
105 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
106 const UChar *targetLimit = args->targetLimit;
107 unsigned char *toUBytes = cnv->toUBytes;
108 UBool isCESU8 = hasCESU8Data(cnv);
109 uint32_t ch, ch2 = 0;
110 int32_t i, inBytes;
111
112 /* Restore size of current sequence */
113 if (cnv->toUnicodeStatus && myTarget < targetLimit)
114 {
115 inBytes = cnv->mode; /* restore # of bytes to consume */
116 i = cnv->toULength; /* restore # of bytes consumed */
117 cnv->toULength = 0;
118
119 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
120 cnv->toUnicodeStatus = 0;
121 goto morebytes;
122 }
123
124
125 while (mySource < sourceLimit && myTarget < targetLimit)
126 {
127 ch = *(mySource++);
128 if (ch < 0x80) /* Simple case */
129 {
130 *(myTarget++) = (UChar) ch;
131 }
132 else
133 {
134 /* store the first char */
135 toUBytes[0] = (char)ch;
136 inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
137 i = 1;
138
139 morebytes:
140 while (i < inBytes)
141 {
142 if (mySource < sourceLimit)
143 {
144 toUBytes[i] = (char) (ch2 = *mySource);
145 if (!U8_IS_TRAIL(ch2))
146 {
147 break; /* i < inBytes */
148 }
149 ch = (ch << 6) + ch2;
150 ++mySource;
151 i++;
152 }
153 else
154 {
155 /* stores a partially calculated target*/
156 cnv->toUnicodeStatus = ch;
157 cnv->mode = inBytes;
158 cnv->toULength = (int8_t) i;
159 goto donefornow;
160 }
161 }
162
163 /* Remove the accumulated high bits */
164 ch -= offsetsFromUTF8[inBytes];
165
166 /*
167 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
168 * - use only trail bytes after a lead byte (checked above)
169 * - use the right number of trail bytes for a given lead byte
170 * - encode a code point <= U+10ffff
171 * - use the fewest possible number of bytes for their code points
172 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
173 *
174 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
175 * There are no irregular sequences any more.
176 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
177 */
178 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
179 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
180 {
181 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
182 if (ch <= MAXIMUM_UCS2)
183 {
184 /* fits in 16 bits */
185 *(myTarget++) = (UChar) ch;
186 }
187 else
188 {
189 /* write out the surrogates */
190 ch -= HALF_BASE;
191 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
192 ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
193 if (myTarget < targetLimit)
194 {
195 *(myTarget++) = (UChar)ch;
196 }
197 else
198 {
199 /* Put in overflow buffer (not handled here) */
200 cnv->UCharErrorBuffer[0] = (UChar) ch;
201 cnv->UCharErrorBufferLength = 1;
202 *err = U_BUFFER_OVERFLOW_ERROR;
203 break;
204 }
205 }
206 }
207 else
208 {
209 cnv->toULength = (int8_t)i;
210 *err = U_ILLEGAL_CHAR_FOUND;
211 break;
212 }
213 }
214 }
215
216 donefornow:
217 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
218 {
219 /* End of target buffer */
220 *err = U_BUFFER_OVERFLOW_ERROR;
221 }
222
223 args->target = myTarget;
224 args->source = (const char *) mySource;
225 }
226
227 static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
228 UErrorCode * err)
229 {
230 UConverter *cnv = args->converter;
231 const unsigned char *mySource = (unsigned char *) args->source;
232 UChar *myTarget = args->target;
233 int32_t *myOffsets = args->offsets;
234 int32_t offsetNum = 0;
235 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
236 const UChar *targetLimit = args->targetLimit;
237 unsigned char *toUBytes = cnv->toUBytes;
238 UBool isCESU8 = hasCESU8Data(cnv);
239 uint32_t ch, ch2 = 0;
240 int32_t i, inBytes;
241
242 /* Restore size of current sequence */
243 if (cnv->toUnicodeStatus && myTarget < targetLimit)
244 {
245 inBytes = cnv->mode; /* restore # of bytes to consume */
246 i = cnv->toULength; /* restore # of bytes consumed */
247 cnv->toULength = 0;
248
249 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
250 cnv->toUnicodeStatus = 0;
251 goto morebytes;
252 }
253
254 while (mySource < sourceLimit && myTarget < targetLimit)
255 {
256 ch = *(mySource++);
257 if (ch < 0x80) /* Simple case */
258 {
259 *(myTarget++) = (UChar) ch;
260 *(myOffsets++) = offsetNum++;
261 }
262 else
263 {
264 toUBytes[0] = (char)ch;
265 inBytes = bytesFromUTF8[ch];
266 i = 1;
267
268 morebytes:
269 while (i < inBytes)
270 {
271 if (mySource < sourceLimit)
272 {
273 toUBytes[i] = (char) (ch2 = *mySource);
274 if (!U8_IS_TRAIL(ch2))
275 {
276 break; /* i < inBytes */
277 }
278 ch = (ch << 6) + ch2;
279 ++mySource;
280 i++;
281 }
282 else
283 {
284 cnv->toUnicodeStatus = ch;
285 cnv->mode = inBytes;
286 cnv->toULength = (int8_t)i;
287 goto donefornow;
288 }
289 }
290
291 /* Remove the accumulated high bits */
292 ch -= offsetsFromUTF8[inBytes];
293
294 /*
295 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
296 * - use only trail bytes after a lead byte (checked above)
297 * - use the right number of trail bytes for a given lead byte
298 * - encode a code point <= U+10ffff
299 * - use the fewest possible number of bytes for their code points
300 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
301 *
302 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
303 * There are no irregular sequences any more.
304 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
305 */
306 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
307 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
308 {
309 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
310 if (ch <= MAXIMUM_UCS2)
311 {
312 /* fits in 16 bits */
313 *(myTarget++) = (UChar) ch;
314 *(myOffsets++) = offsetNum;
315 }
316 else
317 {
318 /* write out the surrogates */
319 ch -= HALF_BASE;
320 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
321 *(myOffsets++) = offsetNum;
322 ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
323 if (myTarget < targetLimit)
324 {
325 *(myTarget++) = (UChar)ch;
326 *(myOffsets++) = offsetNum;
327 }
328 else
329 {
330 cnv->UCharErrorBuffer[0] = (UChar) ch;
331 cnv->UCharErrorBufferLength = 1;
332 *err = U_BUFFER_OVERFLOW_ERROR;
333 }
334 }
335 offsetNum += i;
336 }
337 else
338 {
339 cnv->toULength = (int8_t)i;
340 *err = U_ILLEGAL_CHAR_FOUND;
341 break;
342 }
343 }
344 }
345
346 donefornow:
347 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
348 { /* End of target buffer */
349 *err = U_BUFFER_OVERFLOW_ERROR;
350 }
351
352 args->target = myTarget;
353 args->source = (const char *) mySource;
354 args->offsets = myOffsets;
355 }
356
357 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
358 UErrorCode * err)
359 {
360 UConverter *cnv = args->converter;
361 const UChar *mySource = args->source;
362 const UChar *sourceLimit = args->sourceLimit;
363 uint8_t *myTarget = (uint8_t *) args->target;
364 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
365 uint8_t *tempPtr;
366 UChar32 ch;
367 uint8_t tempBuf[4];
368 int32_t indexToWrite;
369 UBool isNotCESU8 = !hasCESU8Data(cnv);
370
371 if (cnv->fromUChar32 && myTarget < targetLimit)
372 {
373 ch = cnv->fromUChar32;
374 cnv->fromUChar32 = 0;
375 goto lowsurrogate;
376 }
377
378 while (mySource < sourceLimit && myTarget < targetLimit)
379 {
380 ch = *(mySource++);
381
382 if (ch < 0x80) /* Single byte */
383 {
384 *(myTarget++) = (uint8_t) ch;
385 }
386 else if (ch < 0x800) /* Double byte */
387 {
388 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
389 if (myTarget < targetLimit)
390 {
391 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
392 }
393 else
394 {
395 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
396 cnv->charErrorBufferLength = 1;
397 *err = U_BUFFER_OVERFLOW_ERROR;
398 }
399 }
400 else {
401 /* Check for surrogates */
402 if(U16_IS_SURROGATE(ch) && isNotCESU8) {
403 lowsurrogate:
404 if (mySource < sourceLimit) {
405 /* test both code units */
406 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
407 /* convert and consume this supplementary code point */
408 ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
409 ++mySource;
410 /* exit this condition tree */
411 }
412 else {
413 /* this is an unpaired trail or lead code unit */
414 /* callback(illegal) */
415 cnv->fromUChar32 = ch;
416 *err = U_ILLEGAL_CHAR_FOUND;
417 break;
418 }
419 }
420 else {
421 /* no more input */
422 cnv->fromUChar32 = ch;
423 break;
424 }
425 }
426
427 /* Do we write the buffer directly for speed,
428 or do we have to be careful about target buffer space? */
429 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
430
431 if (ch <= MAXIMUM_UCS2) {
432 indexToWrite = 2;
433 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
434 }
435 else {
436 indexToWrite = 3;
437 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
438 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
439 }
440 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
441 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
442
443 if (tempPtr == myTarget) {
444 /* There was enough space to write the codepoint directly. */
445 myTarget += (indexToWrite + 1);
446 }
447 else {
448 /* We might run out of room soon. Write it slowly. */
449 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
450 if (myTarget < targetLimit) {
451 *(myTarget++) = *tempPtr;
452 }
453 else {
454 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
455 *err = U_BUFFER_OVERFLOW_ERROR;
456 }
457 }
458 }
459 }
460 }
461
462 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
463 {
464 *err = U_BUFFER_OVERFLOW_ERROR;
465 }
466
467 args->target = (char *) myTarget;
468 args->source = mySource;
469 }
470
471 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
472 UErrorCode * err)
473 {
474 UConverter *cnv = args->converter;
475 const UChar *mySource = args->source;
476 int32_t *myOffsets = args->offsets;
477 const UChar *sourceLimit = args->sourceLimit;
478 uint8_t *myTarget = (uint8_t *) args->target;
479 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
480 uint8_t *tempPtr;
481 UChar32 ch;
482 int32_t offsetNum, nextSourceIndex;
483 int32_t indexToWrite;
484 uint8_t tempBuf[4];
485 UBool isNotCESU8 = !hasCESU8Data(cnv);
486
487 if (cnv->fromUChar32 && myTarget < targetLimit)
488 {
489 ch = cnv->fromUChar32;
490 cnv->fromUChar32 = 0;
491 offsetNum = -1;
492 nextSourceIndex = 0;
493 goto lowsurrogate;
494 } else {
495 offsetNum = 0;
496 }
497
498 while (mySource < sourceLimit && myTarget < targetLimit)
499 {
500 ch = *(mySource++);
501
502 if (ch < 0x80) /* Single byte */
503 {
504 *(myOffsets++) = offsetNum++;
505 *(myTarget++) = (char) ch;
506 }
507 else if (ch < 0x800) /* Double byte */
508 {
509 *(myOffsets++) = offsetNum;
510 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
511 if (myTarget < targetLimit)
512 {
513 *(myOffsets++) = offsetNum++;
514 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
515 }
516 else
517 {
518 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
519 cnv->charErrorBufferLength = 1;
520 *err = U_BUFFER_OVERFLOW_ERROR;
521 }
522 }
523 else
524 /* Check for surrogates */
525 {
526 nextSourceIndex = offsetNum + 1;
527
528 if(U16_IS_SURROGATE(ch) && isNotCESU8) {
529 lowsurrogate:
530 if (mySource < sourceLimit) {
531 /* test both code units */
532 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
533 /* convert and consume this supplementary code point */
534 ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
535 ++mySource;
536 ++nextSourceIndex;
537 /* exit this condition tree */
538 }
539 else {
540 /* this is an unpaired trail or lead code unit */
541 /* callback(illegal) */
542 cnv->fromUChar32 = ch;
543 *err = U_ILLEGAL_CHAR_FOUND;
544 break;
545 }
546 }
547 else {
548 /* no more input */
549 cnv->fromUChar32 = ch;
550 break;
551 }
552 }
553
554 /* Do we write the buffer directly for speed,
555 or do we have to be careful about target buffer space? */
556 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
557
558 if (ch <= MAXIMUM_UCS2) {
559 indexToWrite = 2;
560 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
561 }
562 else {
563 indexToWrite = 3;
564 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
565 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
566 }
567 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
568 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
569
570 if (tempPtr == myTarget) {
571 /* There was enough space to write the codepoint directly. */
572 myTarget += (indexToWrite + 1);
573 myOffsets[0] = offsetNum;
574 myOffsets[1] = offsetNum;
575 myOffsets[2] = offsetNum;
576 if (indexToWrite >= 3) {
577 myOffsets[3] = offsetNum;
578 }
579 myOffsets += (indexToWrite + 1);
580 }
581 else {
582 /* We might run out of room soon. Write it slowly. */
583 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
584 if (myTarget < targetLimit)
585 {
586 *(myOffsets++) = offsetNum;
587 *(myTarget++) = *tempPtr;
588 }
589 else
590 {
591 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
592 *err = U_BUFFER_OVERFLOW_ERROR;
593 }
594 }
595 }
596 offsetNum = nextSourceIndex;
597 }
598 }
599
600 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
601 {
602 *err = U_BUFFER_OVERFLOW_ERROR;
603 }
604
605 args->target = (char *) myTarget;
606 args->source = mySource;
607 args->offsets = myOffsets;
608 }
609
610 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
611 UErrorCode *err) {
612 UConverter *cnv;
613 const uint8_t *sourceInitial;
614 const uint8_t *source;
615 uint16_t extraBytesToWrite;
616 uint8_t myByte;
617 UChar32 ch;
618 int8_t i, isLegalSequence;
619
620 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
621
622 cnv = args->converter;
623 sourceInitial = source = (const uint8_t *)args->source;
624 if (source >= (const uint8_t *)args->sourceLimit)
625 {
626 /* no input */
627 *err = U_INDEX_OUTOFBOUNDS_ERROR;
628 return 0xffff;
629 }
630
631 myByte = (uint8_t)*(source++);
632 if (myByte < 0x80)
633 {
634 args->source = (const char *)source;
635 return (UChar32)myByte;
636 }
637
638 extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
639 if (extraBytesToWrite == 0) {
640 cnv->toUBytes[0] = myByte;
641 cnv->toULength = 1;
642 *err = U_ILLEGAL_CHAR_FOUND;
643 args->source = (const char *)source;
644 return 0xffff;
645 }
646
647 /*The byte sequence is longer than the buffer area passed*/
648 if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
649 {
650 /* check if all of the remaining bytes are trail bytes */
651 cnv->toUBytes[0] = myByte;
652 i = 1;
653 *err = U_TRUNCATED_CHAR_FOUND;
654 while(source < (const uint8_t *)args->sourceLimit) {
655 if(U8_IS_TRAIL(myByte = *source)) {
656 cnv->toUBytes[i++] = myByte;
657 ++source;
658 } else {
659 /* error even before we run out of input */
660 *err = U_ILLEGAL_CHAR_FOUND;
661 break;
662 }
663 }
664 cnv->toULength = i;
665 args->source = (const char *)source;
666 return 0xffff;
667 }
668
669 isLegalSequence = 1;
670 ch = myByte << 6;
671 switch(extraBytesToWrite)
672 {
673 /* note: code falls through cases! (sic)*/
674 case 6:
675 ch += (myByte = *source);
676 ch <<= 6;
677 if (!U8_IS_TRAIL(myByte))
678 {
679 isLegalSequence = 0;
680 break;
681 }
682 ++source;
683 U_FALLTHROUGH;
684 case 5:
685 ch += (myByte = *source);
686 ch <<= 6;
687 if (!U8_IS_TRAIL(myByte))
688 {
689 isLegalSequence = 0;
690 break;
691 }
692 ++source;
693 U_FALLTHROUGH;
694 case 4:
695 ch += (myByte = *source);
696 ch <<= 6;
697 if (!U8_IS_TRAIL(myByte))
698 {
699 isLegalSequence = 0;
700 break;
701 }
702 ++source;
703 U_FALLTHROUGH;
704 case 3:
705 ch += (myByte = *source);
706 ch <<= 6;
707 if (!U8_IS_TRAIL(myByte))
708 {
709 isLegalSequence = 0;
710 break;
711 }
712 ++source;
713 U_FALLTHROUGH;
714 case 2:
715 ch += (myByte = *source);
716 if (!U8_IS_TRAIL(myByte))
717 {
718 isLegalSequence = 0;
719 break;
720 }
721 ++source;
722 };
723 ch -= offsetsFromUTF8[extraBytesToWrite];
724 args->source = (const char *)source;
725
726 /*
727 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
728 * - use only trail bytes after a lead byte (checked above)
729 * - use the right number of trail bytes for a given lead byte
730 * - encode a code point <= U+10ffff
731 * - use the fewest possible number of bytes for their code points
732 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
733 *
734 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
735 * There are no irregular sequences any more.
736 */
737 if (isLegalSequence &&
738 (uint32_t)ch <= MAXIMUM_UTF &&
739 (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
740 !U_IS_SURROGATE(ch)
741 ) {
742 return ch; /* return the code point */
743 }
744
745 for(i = 0; sourceInitial < source; ++i) {
746 cnv->toUBytes[i] = *sourceInitial++;
747 }
748 cnv->toULength = i;
749 *err = U_ILLEGAL_CHAR_FOUND;
750 return 0xffff;
751 }
752
753 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
754
755 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
756 static const UChar32
757 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
758
759 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
760 static const UChar32
761 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
762
763 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
764 static void
765 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
766 UConverterToUnicodeArgs *pToUArgs,
767 UErrorCode *pErrorCode) {
768 UConverter *utf8;
769 const uint8_t *source, *sourceLimit;
770 uint8_t *target;
771 int32_t targetCapacity;
772 int32_t count;
773
774 int8_t oldToULength, toULength, toULimit;
775
776 UChar32 c;
777 uint8_t b, t1, t2;
778
779 /* set up the local pointers */
780 utf8=pToUArgs->converter;
781 source=(uint8_t *)pToUArgs->source;
782 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
783 target=(uint8_t *)pFromUArgs->target;
784 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
785
786 /* get the converter state from the UTF-8 UConverter */
787 c=(UChar32)utf8->toUnicodeStatus;
788 if(c!=0) {
789 toULength=oldToULength=utf8->toULength;
790 toULimit=(int8_t)utf8->mode;
791 } else {
792 toULength=oldToULength=toULimit=0;
793 }
794
795 count=(int32_t)(sourceLimit-source)+oldToULength;
796 if(count<toULimit) {
797 /*
798 * Not enough input to complete the partial character.
799 * Jump to moreBytes below - it will not output to target.
800 */
801 } else if(targetCapacity<toULimit) {
802 /*
803 * Not enough target capacity to output the partial character.
804 * Let the standard converter handle this.
805 */
806 *pErrorCode=U_USING_DEFAULT_WARNING;
807 return;
808 } else {
809 /*
810 * Use a single counter for source and target, counting the minimum of
811 * the source length and the target capacity.
812 * As a result, the source length is checked only once per multi-byte
813 * character instead of twice.
814 *
815 * Make sure that the last byte sequence is complete, or else
816 * stop just before it.
817 * (The longest legal byte sequence has 3 trail bytes.)
818 * Count oldToULength (number of source bytes from a previous buffer)
819 * into the source length but reduce the source index by toULimit
820 * while going back over trail bytes in order to not go back into
821 * the bytes that will be read for finishing a partial
822 * sequence from the previous buffer.
823 * Let the standard converter handle edge cases.
824 */
825 int32_t i;
826
827 if(count>targetCapacity) {
828 count=targetCapacity;
829 }
830
831 i=0;
832 while(i<3 && i<(count-toULimit)) {
833 b=source[count-oldToULength-i-1];
834 if(U8_IS_TRAIL(b)) {
835 ++i;
836 } else {
837 if(i<U8_COUNT_TRAIL_BYTES(b)) {
838 /* stop converting before the lead byte if there are not enough trail bytes for it */
839 count-=i+1;
840 }
841 break;
842 }
843 }
844 }
845
846 if(c!=0) {
847 utf8->toUnicodeStatus=0;
848 utf8->toULength=0;
849 goto moreBytes;
850 /* See note in ucnv_SBCSFromUTF8() about this goto. */
851 }
852
853 /* conversion loop */
854 while(count>0) {
855 b=*source++;
856 if((int8_t)b>=0) {
857 /* convert ASCII */
858 *target++=b;
859 --count;
860 continue;
861 } else {
862 if(b>0xe0) {
863 if( /* handle U+1000..U+D7FF inline */
864 (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
865 (b==0xed && (t1 <= 0x9f))) &&
866 (t2=source[1]) >= 0x80 && t2 <= 0xbf
867 ) {
868 source+=2;
869 *target++=b;
870 *target++=t1;
871 *target++=t2;
872 count-=3;
873 continue;
874 }
875 } else if(b<0xe0) {
876 if( /* handle U+0080..U+07FF inline */
877 b>=0xc2 &&
878 (t1=*source) >= 0x80 && t1 <= 0xbf
879 ) {
880 ++source;
881 *target++=b;
882 *target++=t1;
883 count-=2;
884 continue;
885 }
886 } else if(b==0xe0) {
887 if( /* handle U+0800..U+0FFF inline */
888 (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
889 (t2=source[1]) >= 0x80 && t2 <= 0xbf
890 ) {
891 source+=2;
892 *target++=b;
893 *target++=t1;
894 *target++=t2;
895 count-=3;
896 continue;
897 }
898 }
899
900 /* handle "complicated" and error cases, and continuing partial characters */
901 oldToULength=0;
902 toULength=1;
903 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
904 c=b;
905 moreBytes:
906 while(toULength<toULimit) {
907 if(source<sourceLimit) {
908 b=*source;
909 if(U8_IS_TRAIL(b)) {
910 ++source;
911 ++toULength;
912 c=(c<<6)+b;
913 } else {
914 break; /* sequence too short, stop with toULength<toULimit */
915 }
916 } else {
917 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
918 source-=(toULength-oldToULength);
919 while(oldToULength<toULength) {
920 utf8->toUBytes[oldToULength++]=*source++;
921 }
922 utf8->toUnicodeStatus=c;
923 utf8->toULength=toULength;
924 utf8->mode=toULimit;
925 pToUArgs->source=(char *)source;
926 pFromUArgs->target=(char *)target;
927 return;
928 }
929 }
930
931 if( toULength==toULimit && /* consumed all trail bytes */
932 (toULength==3 || toULength==2) && /* BMP */
933 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
934 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */
935 ) {
936 /* legal byte sequence for BMP code point */
937 } else if(
938 toULength==toULimit && toULength==4 &&
939 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
940 ) {
941 /* legal byte sequence for supplementary code point */
942 } else {
943 /* error handling: illegal UTF-8 byte sequence */
944 source-=(toULength-oldToULength);
945 while(oldToULength<toULength) {
946 utf8->toUBytes[oldToULength++]=*source++;
947 }
948 utf8->toULength=toULength;
949 pToUArgs->source=(char *)source;
950 pFromUArgs->target=(char *)target;
951 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
952 return;
953 }
954
955 /* copy the legal byte sequence to the target */
956 {
957 int8_t i;
958
959 for(i=0; i<oldToULength; ++i) {
960 *target++=utf8->toUBytes[i];
961 }
962 source-=(toULength-oldToULength);
963 for(; i<toULength; ++i) {
964 *target++=*source++;
965 }
966 count-=toULength;
967 }
968 }
969 }
970
971 if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
972 if(target==(const uint8_t *)pFromUArgs->targetLimit) {
973 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
974 } else {
975 b=*source;
976 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
977 if(toULimit>(sourceLimit-source)) {
978 /* collect a truncated byte sequence */
979 toULength=0;
980 c=b;
981 for(;;) {
982 utf8->toUBytes[toULength++]=b;
983 if(++source==sourceLimit) {
984 /* partial byte sequence at end of source */
985 utf8->toUnicodeStatus=c;
986 utf8->toULength=toULength;
987 utf8->mode=toULimit;
988 break;
989 } else if(!U8_IS_TRAIL(b=*source)) {
990 /* lead byte in trail byte position */
991 utf8->toULength=toULength;
992 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
993 break;
994 }
995 c=(c<<6)+b;
996 }
997 } else {
998 /* partial-sequence target overflow: fall back to the pivoting implementation */
999 *pErrorCode=U_USING_DEFAULT_WARNING;
1000 }
1001 }
1002 }
1003
1004 /* write back the updated pointers */
1005 pToUArgs->source=(char *)source;
1006 pFromUArgs->target=(char *)target;
1007 }
1008
1009 /* UTF-8 converter data ----------------------------------------------------- */
1010
1011 static const UConverterImpl _UTF8Impl={
1012 UCNV_UTF8,
1013
1014 NULL,
1015 NULL,
1016
1017 NULL,
1018 NULL,
1019 NULL,
1020
1021 ucnv_toUnicode_UTF8,
1022 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1023 ucnv_fromUnicode_UTF8,
1024 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1025 ucnv_getNextUChar_UTF8,
1026
1027 NULL,
1028 NULL,
1029 NULL,
1030 NULL,
1031 ucnv_getNonSurrogateUnicodeSet,
1032
1033 ucnv_UTF8FromUTF8,
1034 ucnv_UTF8FromUTF8
1035 };
1036
1037 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */
1038 static const UConverterStaticData _UTF8StaticData={
1039 sizeof(UConverterStaticData),
1040 "UTF-8",
1041 1208, UCNV_IBM, UCNV_UTF8,
1042 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
1043 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1044 0,
1045 0,
1046 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1047 };
1048
1049
1050 const UConverterSharedData _UTF8Data=
1051 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
1052
1053 /* CESU-8 converter data ---------------------------------------------------- */
1054
1055 static const UConverterImpl _CESU8Impl={
1056 UCNV_CESU8,
1057
1058 NULL,
1059 NULL,
1060
1061 NULL,
1062 NULL,
1063 NULL,
1064
1065 ucnv_toUnicode_UTF8,
1066 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1067 ucnv_fromUnicode_UTF8,
1068 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1069 NULL,
1070
1071 NULL,
1072 NULL,
1073 NULL,
1074 NULL,
1075 ucnv_getCompleteUnicodeSet
1076 };
1077
1078 static const UConverterStaticData _CESU8StaticData={
1079 sizeof(UConverterStaticData),
1080 "CESU-8",
1081 9400, /* CCSID for CESU-8 */
1082 UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
1083 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1084 0,
1085 0,
1086 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1087 };
1088
1089
1090 const UConverterSharedData _CESU8Data=
1091 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
1092
1093 #endif