]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucnv_u8.c
ICU-8.11.2.tar.gz
[apple/icu.git] / icuSources / common / ucnv_u8.c
/*
**********************************************************************
*   Copyright (C) 2002-2006, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnv_u8.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jul01
*   created by: Markus W. Scherer
*
*   UTF-8 converter implementation. Used to be in ucnv_utf.c.
*
*   Also, CESU-8 implementation, see UTR 26.
*   The CESU-8 converter uses all the same functions as the
*   UTF-8 converter, with a branch for converting supplementary code points.
*/
20
21 #include "unicode/utypes.h"
22
23 #if !UCONFIG_NO_CONVERSION
24
25 #include "unicode/ucnv.h"
26 #include "ucnv_bld.h"
27 #include "ucnv_cnv.h"
28 #include "cmemory.h"
29
30 /* Prototypes --------------------------------------------------------------- */
31
32 /* Keep these here to make finicky compilers happy */
33
34 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
35 UErrorCode *err);
36 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
37 UErrorCode *err);
38
39
/* UTF-8 -------------------------------------------------------------------- */

/* UTF-8 Conversion DATA
 *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
 */
/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
#define MAXIMUM_UCS2            0x0000FFFF  /* largest code point that fits in one 16-bit unit */
#define MAXIMUM_UTF             0x0010FFFF  /* largest code point accepted by the converters */
#define MAXIMUM_UCS4            0x7FFFFFFF
#define HALF_SHIFT              10          /* payload bits carried by each surrogate */
#define HALF_BASE               0x0010000   /* first supplementary code point */
#define HALF_MASK               0x3FF       /* mask for the low HALF_SHIFT bits */
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* -SURROGATE_LOW_START + HALF_BASE  (== 0x2400 == 9216) */
#define SURROGATE_LOW_BASE      (HALF_BASE - SURROGATE_LOW_START)
59
/*
 * offsetsFromUTF8[N] is the constant that must be subtracted from the raw
 * accumulation ((lead << 6) + trail) << 6 ... of an N-byte sequence in order
 * to strip the marker bits of the lead byte (0xC0, 0xE0, 0xF0, ...) and of
 * every trail byte (0x80), computed modulo 2^32.
 * For example, N == 2: (0xC0 << 6) + 0x80 == 0x3080.
 * Index 0 is used for bytes that cannot start a sequence.
 */
static const uint32_t offsetsFromUTF8[7] = {0,
  (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
  (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
};

/* END OF UTF-8 Conversion DATA */
66
/*
 * Total length in bytes of the sequence introduced by each possible byte value:
 *   0x00..0x7F  1   (single byte)
 *   0x80..0xBF  0   (trail byte, cannot start a sequence)
 *   0xC0..0xDF  2
 *   0xE0..0xEF  3
 *   0xF0..0xF7  4
 *   0xF8..0xFB  5   (never legal; rejected later through utf8_minChar32[])
 *   0xFC..0xFD  6   (never legal; rejected later through utf8_minChar32[])
 *   0xFE..0xFF  0   (cannot start a sequence)
 */
static const int8_t bytesFromUTF8[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
77
/*
 * Starting with Unicode 3.0.1:
 * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
 * byte sequences with more than 4 bytes are illegal in UTF-8,
 * which is tested with impossible values for them
 * (0xffffffff can never be reached by a value that is also <= 0x10ffff).
 */
static const uint32_t
utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
86
87 static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
88 UErrorCode * err)
89 {
90 const unsigned char *mySource = (unsigned char *) args->source;
91 UChar *myTarget = args->target;
92 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
93 const UChar *targetLimit = args->targetLimit;
94 unsigned char *toUBytes = args->converter->toUBytes;
95 UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
96 uint32_t ch, ch2 = 0;
97 int32_t i, inBytes;
98
99 /* Restore size of current sequence */
100 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
101 {
102 inBytes = args->converter->mode; /* restore # of bytes to consume */
103 i = args->converter->toULength; /* restore # of bytes consumed */
104 args->converter->toULength = 0;
105
106 ch = args->converter->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
107 args->converter->toUnicodeStatus = 0;
108 goto morebytes;
109 }
110
111
112 while (mySource < sourceLimit && myTarget < targetLimit)
113 {
114 ch = *(mySource++);
115 if (ch < 0x80) /* Simple case */
116 {
117 *(myTarget++) = (UChar) ch;
118 }
119 else
120 {
121 /* store the first char */
122 toUBytes[0] = (char)ch;
123 inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
124 i = 1;
125
126 morebytes:
127 while (i < inBytes)
128 {
129 if (mySource < sourceLimit)
130 {
131 toUBytes[i] = (char) (ch2 = *mySource);
132 if (!UTF8_IS_TRAIL(ch2))
133 {
134 break; /* i < inBytes */
135 }
136 ch = (ch << 6) + ch2;
137 ++mySource;
138 i++;
139 }
140 else
141 {
142 /* stores a partially calculated target*/
143 args->converter->toUnicodeStatus = ch;
144 args->converter->mode = inBytes;
145 args->converter->toULength = (int8_t) i;
146 goto donefornow;
147 }
148 }
149
150 /* Remove the accumulated high bits */
151 ch -= offsetsFromUTF8[inBytes];
152
153 /*
154 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
155 * - use only trail bytes after a lead byte (checked above)
156 * - use the right number of trail bytes for a given lead byte
157 * - encode a code point <= U+10ffff
158 * - use the fewest possible number of bytes for their code points
159 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
160 *
161 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
162 * There are no irregular sequences any more.
163 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
164 */
165 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
166 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
167 {
168 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
169 if (ch <= MAXIMUM_UCS2)
170 {
171 /* fits in 16 bits */
172 *(myTarget++) = (UChar) ch;
173 }
174 else
175 {
176 /* write out the surrogates */
177 ch -= HALF_BASE;
178 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
179 ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
180 if (myTarget < targetLimit)
181 {
182 *(myTarget++) = (UChar)ch;
183 }
184 else
185 {
186 /* Put in overflow buffer (not handled here) */
187 args->converter->UCharErrorBuffer[0] = (UChar) ch;
188 args->converter->UCharErrorBufferLength = 1;
189 *err = U_BUFFER_OVERFLOW_ERROR;
190 break;
191 }
192 }
193 }
194 else
195 {
196 args->converter->toULength = (int8_t)i;
197 *err = U_ILLEGAL_CHAR_FOUND;
198 break;
199 }
200 }
201 }
202
203 donefornow:
204 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
205 {
206 /* End of target buffer */
207 *err = U_BUFFER_OVERFLOW_ERROR;
208 }
209
210 args->target = myTarget;
211 args->source = (const char *) mySource;
212 }
213
214 static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
215 UErrorCode * err)
216 {
217 const unsigned char *mySource = (unsigned char *) args->source;
218 UChar *myTarget = args->target;
219 int32_t *myOffsets = args->offsets;
220 int32_t offsetNum = 0;
221 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
222 const UChar *targetLimit = args->targetLimit;
223 unsigned char *toUBytes = args->converter->toUBytes;
224 UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
225 uint32_t ch, ch2 = 0;
226 int32_t i, inBytes;
227
228 /* Restore size of current sequence */
229 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
230 {
231 inBytes = args->converter->mode; /* restore # of bytes to consume */
232 i = args->converter->toULength; /* restore # of bytes consumed */
233 args->converter->toULength = 0;
234
235 ch = args->converter->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
236 args->converter->toUnicodeStatus = 0;
237 goto morebytes;
238 }
239
240 while (mySource < sourceLimit && myTarget < targetLimit)
241 {
242 ch = *(mySource++);
243 if (ch < 0x80) /* Simple case */
244 {
245 *(myTarget++) = (UChar) ch;
246 *(myOffsets++) = offsetNum++;
247 }
248 else
249 {
250 toUBytes[0] = (char)ch;
251 inBytes = bytesFromUTF8[ch];
252 i = 1;
253
254 morebytes:
255 while (i < inBytes)
256 {
257 if (mySource < sourceLimit)
258 {
259 toUBytes[i] = (char) (ch2 = *mySource);
260 if (!UTF8_IS_TRAIL(ch2))
261 {
262 break; /* i < inBytes */
263 }
264 ch = (ch << 6) + ch2;
265 ++mySource;
266 i++;
267 }
268 else
269 {
270 args->converter->toUnicodeStatus = ch;
271 args->converter->mode = inBytes;
272 args->converter->toULength = (int8_t)i;
273 goto donefornow;
274 }
275 }
276
277 /* Remove the accumulated high bits */
278 ch -= offsetsFromUTF8[inBytes];
279
280 /*
281 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
282 * - use only trail bytes after a lead byte (checked above)
283 * - use the right number of trail bytes for a given lead byte
284 * - encode a code point <= U+10ffff
285 * - use the fewest possible number of bytes for their code points
286 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
287 *
288 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
289 * There are no irregular sequences any more.
290 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
291 */
292 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
293 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
294 {
295 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
296 if (ch <= MAXIMUM_UCS2)
297 {
298 /* fits in 16 bits */
299 *(myTarget++) = (UChar) ch;
300 *(myOffsets++) = offsetNum;
301 }
302 else
303 {
304 /* write out the surrogates */
305 ch -= HALF_BASE;
306 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
307 *(myOffsets++) = offsetNum;
308 ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
309 if (myTarget < targetLimit)
310 {
311 *(myTarget++) = (UChar)ch;
312 *(myOffsets++) = offsetNum;
313 }
314 else
315 {
316 args->converter->UCharErrorBuffer[0] = (UChar) ch;
317 args->converter->UCharErrorBufferLength = 1;
318 *err = U_BUFFER_OVERFLOW_ERROR;
319 }
320 }
321 offsetNum += i;
322 }
323 else
324 {
325 args->converter->toULength = (int8_t)i;
326 *err = U_ILLEGAL_CHAR_FOUND;
327 break;
328 }
329 }
330 }
331
332 donefornow:
333 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
334 { /* End of target buffer */
335 *err = U_BUFFER_OVERFLOW_ERROR;
336 }
337
338 args->target = myTarget;
339 args->source = (const char *) mySource;
340 args->offsets = myOffsets;
341 }
342
343 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
344 UErrorCode * err)
345 {
346 UConverter *cnv = args->converter;
347 const UChar *mySource = args->source;
348 unsigned char *myTarget = (unsigned char *) args->target;
349 const UChar *sourceLimit = args->sourceLimit;
350 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
351 UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
352 UChar32 ch;
353 int16_t indexToWrite;
354 char temp[4];
355
356 if (cnv->fromUChar32 && myTarget < targetLimit)
357 {
358 ch = cnv->fromUChar32;
359 cnv->fromUChar32 = 0;
360 goto lowsurrogate;
361 }
362
363 while (mySource < sourceLimit && myTarget < targetLimit)
364 {
365 ch = *(mySource++);
366
367 if (ch < 0x80) /* Single byte */
368 {
369 *(myTarget++) = (char) ch;
370 }
371 else if (ch < 0x800) /* Double byte */
372 {
373 *(myTarget++) = (char) ((ch >> 6) | 0xc0);
374 if (myTarget < targetLimit)
375 {
376 *(myTarget++) = (char) ((ch & 0x3f) | 0x80);
377 }
378 else
379 {
380 cnv->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80);
381 cnv->charErrorBufferLength = 1;
382 *err = U_BUFFER_OVERFLOW_ERROR;
383 }
384 }
385 else
386 /* Check for surrogates */
387 {
388 if(UTF_IS_SURROGATE(ch) && !isCESU8) {
389 if(UTF_IS_SURROGATE_FIRST(ch)) {
390 lowsurrogate:
391 if (mySource < sourceLimit) {
392 /* test the following code unit */
393 UChar trail=*mySource;
394 if(UTF_IS_SECOND_SURROGATE(trail)) {
395 ++mySource;
396 ch=UTF16_GET_PAIR_VALUE(ch, trail);
397 /* convert this supplementary code point */
398 /* exit this condition tree */
399 } else {
400 /* this is an unmatched lead code unit (1st surrogate) */
401 /* callback(illegal) */
402 cnv->fromUChar32 = ch;
403 *err = U_ILLEGAL_CHAR_FOUND;
404 break;
405 }
406 } else {
407 /* no more input */
408 cnv->fromUChar32 = ch;
409 break;
410 }
411 } else {
412 /* this is an unmatched trail code unit (2nd surrogate) */
413 /* callback(illegal) */
414 cnv->fromUChar32 = ch;
415 *err = U_ILLEGAL_CHAR_FOUND;
416 break;
417 }
418 }
419
420 if (ch < 0x10000)
421 {
422 indexToWrite = 2;
423 temp[2] = (char) ((ch >> 12) | 0xe0);
424 }
425 else
426 {
427 indexToWrite = 3;
428 temp[3] = (char) ((ch >> 18) | 0xf0);
429 temp[2] = (char) (((ch >> 12) & 0x3f) | 0x80);
430 }
431 temp[1] = (char) (((ch >> 6) & 0x3f) | 0x80);
432 temp[0] = (char) ((ch & 0x3f) | 0x80);
433
434 for (; indexToWrite >= 0; indexToWrite--)
435 {
436 if (myTarget < targetLimit)
437 {
438 *(myTarget++) = temp[indexToWrite];
439 }
440 else
441 {
442 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = temp[indexToWrite];
443 *err = U_BUFFER_OVERFLOW_ERROR;
444 }
445 }
446 }
447 }
448
449 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
450 {
451 *err = U_BUFFER_OVERFLOW_ERROR;
452 }
453
454 args->target = (char *) myTarget;
455 args->source = mySource;
456 }
457
458 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
459 UErrorCode * err)
460 {
461 UConverter *cnv = args->converter;
462 const UChar *mySource = args->source;
463 unsigned char *myTarget = (unsigned char *) args->target;
464 int32_t *myOffsets = args->offsets;
465 const UChar *sourceLimit = args->sourceLimit;
466 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
467 UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
468 UChar32 ch;
469 int32_t offsetNum, nextSourceIndex;
470 int16_t indexToWrite;
471 char temp[4];
472
473 if (cnv->fromUChar32 && myTarget < targetLimit)
474 {
475 ch = cnv->fromUChar32;
476 cnv->fromUChar32 = 0;
477 offsetNum = -1;
478 nextSourceIndex = 0;
479 goto lowsurrogate;
480 } else {
481 offsetNum = 0;
482 }
483
484 while (mySource < sourceLimit && myTarget < targetLimit)
485 {
486 ch = *(mySource++);
487
488 if (ch < 0x80) /* Single byte */
489 {
490 *(myOffsets++) = offsetNum++;
491 *(myTarget++) = (char) ch;
492 }
493 else if (ch < 0x800) /* Double byte */
494 {
495 *(myOffsets++) = offsetNum;
496 *(myTarget++) = (char) ((ch >> 6) | 0xc0);
497 if (myTarget < targetLimit)
498 {
499 *(myOffsets++) = offsetNum++;
500 *(myTarget++) = (char) ((ch & 0x3f) | 0x80);
501 }
502 else
503 {
504 cnv->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80);
505 cnv->charErrorBufferLength = 1;
506 *err = U_BUFFER_OVERFLOW_ERROR;
507 }
508 }
509 else
510 /* Check for surrogates */
511 {
512 nextSourceIndex = offsetNum + 1;
513
514 if(UTF_IS_SURROGATE(ch) && !isCESU8) {
515 if(UTF_IS_SURROGATE_FIRST(ch)) {
516 lowsurrogate:
517 if (mySource < sourceLimit) {
518 /* test the following code unit */
519 UChar trail=*mySource;
520 if(UTF_IS_SECOND_SURROGATE(trail)) {
521 ++mySource;
522 ++nextSourceIndex;
523 ch=UTF16_GET_PAIR_VALUE(ch, trail);
524 /* convert this supplementary code point */
525 /* exit this condition tree */
526 } else {
527 /* this is an unmatched lead code unit (1st surrogate) */
528 /* callback(illegal) */
529 cnv->fromUChar32 = ch;
530 *err = U_ILLEGAL_CHAR_FOUND;
531 break;
532 }
533 } else {
534 /* no more input */
535 cnv->fromUChar32 = ch;
536 break;
537 }
538 } else {
539 /* this is an unmatched trail code unit (2nd surrogate) */
540 /* callback(illegal) */
541 cnv->fromUChar32 = ch;
542 *err = U_ILLEGAL_CHAR_FOUND;
543 break;
544 }
545 }
546
547 if (ch < 0x10000)
548 {
549 indexToWrite = 2;
550 temp[2] = (char) ((ch >> 12) | 0xe0);
551 }
552 else
553 {
554 indexToWrite = 3;
555 temp[3] = (char) ((ch >> 18) | 0xf0);
556 temp[2] = (char) (((ch >> 12) & 0x3f) | 0x80);
557 }
558 temp[1] = (char) (((ch >> 6) & 0x3f) | 0x80);
559 temp[0] = (char) ((ch & 0x3f) | 0x80);
560
561 for (; indexToWrite >= 0; indexToWrite--)
562 {
563 if (myTarget < targetLimit)
564 {
565 *(myOffsets++) = offsetNum;
566 *(myTarget++) = temp[indexToWrite];
567 }
568 else
569 {
570 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = temp[indexToWrite];
571 *err = U_BUFFER_OVERFLOW_ERROR;
572 }
573 }
574 offsetNum = nextSourceIndex;
575 }
576 }
577
578 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
579 {
580 *err = U_BUFFER_OVERFLOW_ERROR;
581 }
582
583 args->target = (char *) myTarget;
584 args->source = mySource;
585 args->offsets = myOffsets;
586 }
587
588 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
589 UErrorCode *err) {
590 UConverter *cnv;
591 const uint8_t *sourceInitial;
592 const uint8_t *source;
593 uint16_t extraBytesToWrite;
594 uint8_t myByte;
595 UChar32 ch;
596 int8_t i, isLegalSequence;
597
598 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
599
600 cnv = args->converter;
601 sourceInitial = source = (const uint8_t *)args->source;
602 if (source >= (const uint8_t *)args->sourceLimit)
603 {
604 /* no input */
605 *err = U_INDEX_OUTOFBOUNDS_ERROR;
606 return 0xffff;
607 }
608
609 myByte = (uint8_t)*(source++);
610 if (myByte < 0x80)
611 {
612 args->source = (const char *)source;
613 return (UChar32)myByte;
614 }
615
616 extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
617 if (extraBytesToWrite == 0) {
618 cnv->toUBytes[0] = myByte;
619 cnv->toULength = 1;
620 *err = U_ILLEGAL_CHAR_FOUND;
621 args->source = (const char *)source;
622 return 0xffff;
623 }
624
625 /*The byte sequence is longer than the buffer area passed*/
626 if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
627 {
628 /* check if all of the remaining bytes are trail bytes */
629 cnv->toUBytes[0] = myByte;
630 i = 1;
631 *err = U_TRUNCATED_CHAR_FOUND;
632 while(source < (const uint8_t *)args->sourceLimit) {
633 if(U8_IS_TRAIL(myByte = *source)) {
634 cnv->toUBytes[i++] = myByte;
635 ++source;
636 } else {
637 /* error even before we run out of input */
638 *err = U_ILLEGAL_CHAR_FOUND;
639 break;
640 }
641 }
642 cnv->toULength = i;
643 args->source = (const char *)source;
644 return 0xffff;
645 }
646
647 isLegalSequence = 1;
648 ch = myByte << 6;
649 switch(extraBytesToWrite)
650 {
651 /* note: code falls through cases! (sic)*/
652 case 6:
653 ch += (myByte = *source);
654 ch <<= 6;
655 if (!UTF8_IS_TRAIL(myByte))
656 {
657 isLegalSequence = 0;
658 break;
659 }
660 ++source;
661 case 5:
662 ch += (myByte = *source);
663 ch <<= 6;
664 if (!UTF8_IS_TRAIL(myByte))
665 {
666 isLegalSequence = 0;
667 break;
668 }
669 ++source;
670 case 4:
671 ch += (myByte = *source);
672 ch <<= 6;
673 if (!UTF8_IS_TRAIL(myByte))
674 {
675 isLegalSequence = 0;
676 break;
677 }
678 ++source;
679 case 3:
680 ch += (myByte = *source);
681 ch <<= 6;
682 if (!UTF8_IS_TRAIL(myByte))
683 {
684 isLegalSequence = 0;
685 break;
686 }
687 ++source;
688 case 2:
689 ch += (myByte = *source);
690 if (!UTF8_IS_TRAIL(myByte))
691 {
692 isLegalSequence = 0;
693 break;
694 }
695 ++source;
696 };
697 ch -= offsetsFromUTF8[extraBytesToWrite];
698 args->source = (const char *)source;
699
700 /*
701 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
702 * - use only trail bytes after a lead byte (checked above)
703 * - use the right number of trail bytes for a given lead byte
704 * - encode a code point <= U+10ffff
705 * - use the fewest possible number of bytes for their code points
706 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
707 *
708 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
709 * There are no irregular sequences any more.
710 */
711 if (isLegalSequence &&
712 (uint32_t)ch <= MAXIMUM_UTF &&
713 (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
714 !U_IS_SURROGATE(ch)
715 ) {
716 return ch; /* return the code point */
717 }
718
719 for(i = 0; sourceInitial < source; ++i) {
720 cnv->toUBytes[i] = *sourceInitial++;
721 }
722 cnv->toULength = i;
723 *err = U_ILLEGAL_CHAR_FOUND;
724 return 0xffff;
725 }
726
727 /* UTF-8 converter data ----------------------------------------------------- */
728
729 static const UConverterImpl _UTF8Impl={
730 UCNV_UTF8,
731
732 NULL,
733 NULL,
734
735 NULL,
736 NULL,
737 NULL,
738
739 ucnv_toUnicode_UTF8,
740 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
741 ucnv_fromUnicode_UTF8,
742 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
743 ucnv_getNextUChar_UTF8,
744
745 NULL,
746 NULL,
747 NULL,
748 NULL,
749 ucnv_getNonSurrogateUnicodeSet
750 };
751
752 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */
753 static const UConverterStaticData _UTF8StaticData={
754 sizeof(UConverterStaticData),
755 "UTF-8",
756 1208, UCNV_IBM, UCNV_UTF8,
757 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
758 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
759 0,
760 0,
761 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
762 };
763
764
765 const UConverterSharedData _UTF8Data={
766 sizeof(UConverterSharedData), ~((uint32_t) 0),
767 NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl,
768 0
769 };
770
771 /* CESU-8 converter data ---------------------------------------------------- */
772
773 static const UConverterImpl _CESU8Impl={
774 UCNV_CESU8,
775
776 NULL,
777 NULL,
778
779 NULL,
780 NULL,
781 NULL,
782
783 ucnv_toUnicode_UTF8,
784 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
785 ucnv_fromUnicode_UTF8,
786 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
787 NULL,
788
789 NULL,
790 NULL,
791 NULL,
792 NULL,
793 ucnv_getCompleteUnicodeSet
794 };
795
796 static const UConverterStaticData _CESU8StaticData={
797 sizeof(UConverterStaticData),
798 "CESU-8",
799 9400, /* CCSID for CESU-8 */
800 UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
801 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
802 0,
803 0,
804 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
805 };
806
807
808 const UConverterSharedData _CESU8Data={
809 sizeof(UConverterSharedData), ~((uint32_t) 0),
810 NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,
811 0
812 };
813
814 #endif