]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnv_u8.c
ICU-8.11.tar.gz
[apple/icu.git] / icuSources / common / ucnv_u8.c
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
73c04bcf 3* Copyright (C) 2002-2006, International Business Machines
b75a7d8f
A
4* Corporation and others. All Rights Reserved.
5**********************************************************************
6* file name: ucnv_u8.c
7* encoding: US-ASCII
8* tab size: 8 (not used)
9* indentation:4
10*
11* created on: 2002jul01
12* created by: Markus W. Scherer
13*
14* UTF-8 converter implementation. Used to be in ucnv_utf.c.
15*
16* Also, CESU-8 implementation, see UTR 26.
17* The CESU-8 converter uses all the same functions as the
18* UTF-8 converter, with a branch for converting supplementary code points.
19*/
20
21#include "unicode/utypes.h"
374ca955
A
22
23#if !UCONFIG_NO_CONVERSION
24
b75a7d8f 25#include "unicode/ucnv.h"
b75a7d8f
A
26#include "ucnv_bld.h"
27#include "ucnv_cnv.h"
28#include "cmemory.h"
29
30/* Prototypes --------------------------------------------------------------- */
31
32/* Keep these here to make finicky compilers happy */
33
374ca955 34U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
b75a7d8f 35 UErrorCode *err);
374ca955 36U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
b75a7d8f 37 UErrorCode *err);
b75a7d8f
A
38
39
40/* UTF-8 -------------------------------------------------------------------- */
41
/* UTF-8 Conversion DATA
 *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
 */
45/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
/* Code point range limits */
#define MAXIMUM_UCS2            0x0000FFFF  /* largest value that fits one 16-bit unit */
#define MAXIMUM_UTF             0x0010FFFF  /* largest code point accepted from UTF-8 */
#define MAXIMUM_UCS4            0x7FFFFFFF

/* Surrogate pair arithmetic */
#define HALF_SHIFT              10
#define HALF_BASE               0x0010000
#define HALF_MASK               0x3FF
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* -SURROGATE_LOW_START + HALF_BASE */
#define SURROGATE_LOW_BASE      9216
59
/*
 * Indexed by sequence length (0..6): the value accumulated by the lead-byte
 * and trail-byte marker bits when the raw bytes are combined with
 * "ch = (ch << 6) + byte" in 32-bit unsigned arithmetic.  Subtracting the
 * entry from the accumulated value leaves the encoded code point.
 */
static const uint32_t offsetsFromUTF8[7] = {
    0,
    0x00000000U,
    0x00003080U,
    0x000E2080U,
    0x03C82080U,
    0xFA082080U,
    0x82082080U
};
64
65/* END OF UTF-8 Conversion DATA */
66
/*
 * Sequence length announced by each possible first byte.
 * 0 marks bytes that cannot start a sequence (0x80..0xBF, 0xFE, 0xFF).
 */
static const int8_t bytesFromUTF8[256] = {
    /* 0x00..0x7F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xFF */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
77
78/*
79 * Starting with Unicode 3.0.1:
80 * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
81 * byte sequences with more than 4 bytes are illegal in UTF-8,
82 * which is tested with impossible values for them
83 */
/* Smallest code point a sequence of the indexed length may encode. */
static const uint32_t utf8_minChar32[7] = {
    0,
    0,
    0x80,
    0x800,
    0x10000,
    0xffffffff,     /* 5 bytes: unreachable minimum, so always rejected */
    0xffffffff      /* 6 bytes: unreachable minimum, so always rejected */
};
86
374ca955 87static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
b75a7d8f
A
88 UErrorCode * err)
89{
90 const unsigned char *mySource = (unsigned char *) args->source;
91 UChar *myTarget = args->target;
92 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
93 const UChar *targetLimit = args->targetLimit;
94 unsigned char *toUBytes = args->converter->toUBytes;
95 UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
96 uint32_t ch, ch2 = 0;
97 int32_t i, inBytes;
98
99 /* Restore size of current sequence */
b75a7d8f
A
100 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
101 {
102 inBytes = args->converter->mode; /* restore # of bytes to consume */
103 i = args->converter->toULength; /* restore # of bytes consumed */
73c04bcf 104 args->converter->toULength = 0;
b75a7d8f
A
105
106 ch = args->converter->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
107 args->converter->toUnicodeStatus = 0;
108 goto morebytes;
109 }
110
111
112 while (mySource < sourceLimit && myTarget < targetLimit)
113 {
114 ch = *(mySource++);
115 if (ch < 0x80) /* Simple case */
116 {
117 *(myTarget++) = (UChar) ch;
118 }
119 else
120 {
121 /* store the first char */
122 toUBytes[0] = (char)ch;
123 inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
124 i = 1;
125
126morebytes:
127 while (i < inBytes)
128 {
129 if (mySource < sourceLimit)
130 {
131 toUBytes[i] = (char) (ch2 = *mySource);
132 if (!UTF8_IS_TRAIL(ch2))
133 {
134 break; /* i < inBytes */
135 }
136 ch = (ch << 6) + ch2;
137 ++mySource;
138 i++;
139 }
140 else
141 {
374ca955
A
142 /* stores a partially calculated target*/
143 args->converter->toUnicodeStatus = ch;
144 args->converter->mode = inBytes;
145 args->converter->toULength = (int8_t) i;
b75a7d8f
A
146 goto donefornow;
147 }
148 }
149
150 /* Remove the accumulated high bits */
151 ch -= offsetsFromUTF8[inBytes];
152
153 /*
154 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
155 * - use only trail bytes after a lead byte (checked above)
156 * - use the right number of trail bytes for a given lead byte
157 * - encode a code point <= U+10ffff
158 * - use the fewest possible number of bytes for their code points
159 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
160 *
161 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
162 * There are no irregular sequences any more.
163 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
164 */
165 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
166 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
167 {
168 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
169 if (ch <= MAXIMUM_UCS2)
170 {
171 /* fits in 16 bits */
172 *(myTarget++) = (UChar) ch;
173 }
174 else
175 {
176 /* write out the surrogates */
177 ch -= HALF_BASE;
178 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
179 ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
180 if (myTarget < targetLimit)
181 {
182 *(myTarget++) = (UChar)ch;
183 }
184 else
185 {
186 /* Put in overflow buffer (not handled here) */
187 args->converter->UCharErrorBuffer[0] = (UChar) ch;
188 args->converter->UCharErrorBufferLength = 1;
189 *err = U_BUFFER_OVERFLOW_ERROR;
190 break;
191 }
192 }
193 }
194 else
195 {
b75a7d8f 196 args->converter->toULength = (int8_t)i;
374ca955
A
197 *err = U_ILLEGAL_CHAR_FOUND;
198 break;
b75a7d8f
A
199 }
200 }
201 }
202
203donefornow:
204 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
205 {
206 /* End of target buffer */
207 *err = U_BUFFER_OVERFLOW_ERROR;
208 }
209
210 args->target = myTarget;
211 args->source = (const char *) mySource;
212}
213
374ca955 214static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
b75a7d8f
A
215 UErrorCode * err)
216{
217 const unsigned char *mySource = (unsigned char *) args->source;
218 UChar *myTarget = args->target;
219 int32_t *myOffsets = args->offsets;
220 int32_t offsetNum = 0;
221 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
222 const UChar *targetLimit = args->targetLimit;
223 unsigned char *toUBytes = args->converter->toUBytes;
224 UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
225 uint32_t ch, ch2 = 0;
226 int32_t i, inBytes;
227
228 /* Restore size of current sequence */
b75a7d8f
A
229 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
230 {
231 inBytes = args->converter->mode; /* restore # of bytes to consume */
232 i = args->converter->toULength; /* restore # of bytes consumed */
73c04bcf 233 args->converter->toULength = 0;
b75a7d8f
A
234
235 ch = args->converter->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
236 args->converter->toUnicodeStatus = 0;
237 goto morebytes;
238 }
239
240 while (mySource < sourceLimit && myTarget < targetLimit)
241 {
242 ch = *(mySource++);
243 if (ch < 0x80) /* Simple case */
244 {
245 *(myTarget++) = (UChar) ch;
246 *(myOffsets++) = offsetNum++;
247 }
248 else
249 {
250 toUBytes[0] = (char)ch;
251 inBytes = bytesFromUTF8[ch];
252 i = 1;
253
254morebytes:
255 while (i < inBytes)
256 {
257 if (mySource < sourceLimit)
258 {
259 toUBytes[i] = (char) (ch2 = *mySource);
260 if (!UTF8_IS_TRAIL(ch2))
261 {
262 break; /* i < inBytes */
263 }
264 ch = (ch << 6) + ch2;
265 ++mySource;
266 i++;
267 }
268 else
269 {
374ca955
A
270 args->converter->toUnicodeStatus = ch;
271 args->converter->mode = inBytes;
272 args->converter->toULength = (int8_t)i;
b75a7d8f
A
273 goto donefornow;
274 }
275 }
276
277 /* Remove the accumulated high bits */
278 ch -= offsetsFromUTF8[inBytes];
279
280 /*
281 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
282 * - use only trail bytes after a lead byte (checked above)
283 * - use the right number of trail bytes for a given lead byte
284 * - encode a code point <= U+10ffff
285 * - use the fewest possible number of bytes for their code points
286 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
287 *
288 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
289 * There are no irregular sequences any more.
290 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
291 */
292 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
293 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
294 {
295 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
296 if (ch <= MAXIMUM_UCS2)
297 {
298 /* fits in 16 bits */
299 *(myTarget++) = (UChar) ch;
300 *(myOffsets++) = offsetNum;
301 }
302 else
303 {
304 /* write out the surrogates */
305 ch -= HALF_BASE;
306 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
307 *(myOffsets++) = offsetNum;
308 ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
309 if (myTarget < targetLimit)
310 {
311 *(myTarget++) = (UChar)ch;
312 *(myOffsets++) = offsetNum;
313 }
314 else
315 {
316 args->converter->UCharErrorBuffer[0] = (UChar) ch;
317 args->converter->UCharErrorBufferLength = 1;
318 *err = U_BUFFER_OVERFLOW_ERROR;
319 }
320 }
321 offsetNum += i;
322 }
323 else
324 {
b75a7d8f 325 args->converter->toULength = (int8_t)i;
374ca955
A
326 *err = U_ILLEGAL_CHAR_FOUND;
327 break;
b75a7d8f
A
328 }
329 }
330 }
331
332donefornow:
333 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
334 { /* End of target buffer */
335 *err = U_BUFFER_OVERFLOW_ERROR;
336 }
337
338 args->target = myTarget;
339 args->source = (const char *) mySource;
340 args->offsets = myOffsets;
341}
342
374ca955 343U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
b75a7d8f
A
344 UErrorCode * err)
345{
346 UConverter *cnv = args->converter;
347 const UChar *mySource = args->source;
348 unsigned char *myTarget = (unsigned char *) args->target;
349 const UChar *sourceLimit = args->sourceLimit;
350 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
351 UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
73c04bcf 352 UChar32 ch;
b75a7d8f
A
353 int16_t indexToWrite;
354 char temp[4];
355
374ca955 356 if (cnv->fromUChar32 && myTarget < targetLimit)
b75a7d8f 357 {
374ca955
A
358 ch = cnv->fromUChar32;
359 cnv->fromUChar32 = 0;
b75a7d8f
A
360 goto lowsurrogate;
361 }
362
363 while (mySource < sourceLimit && myTarget < targetLimit)
364 {
365 ch = *(mySource++);
366
367 if (ch < 0x80) /* Single byte */
368 {
369 *(myTarget++) = (char) ch;
370 }
371 else if (ch < 0x800) /* Double byte */
372 {
373 *(myTarget++) = (char) ((ch >> 6) | 0xc0);
374 if (myTarget < targetLimit)
375 {
376 *(myTarget++) = (char) ((ch & 0x3f) | 0x80);
377 }
378 else
379 {
380 cnv->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80);
381 cnv->charErrorBufferLength = 1;
382 *err = U_BUFFER_OVERFLOW_ERROR;
383 }
384 }
385 else
386 /* Check for surrogates */
387 {
388 if(UTF_IS_SURROGATE(ch) && !isCESU8) {
389 if(UTF_IS_SURROGATE_FIRST(ch)) {
390lowsurrogate:
391 if (mySource < sourceLimit) {
392 /* test the following code unit */
393 UChar trail=*mySource;
394 if(UTF_IS_SECOND_SURROGATE(trail)) {
395 ++mySource;
396 ch=UTF16_GET_PAIR_VALUE(ch, trail);
b75a7d8f
A
397 /* convert this supplementary code point */
398 /* exit this condition tree */
399 } else {
400 /* this is an unmatched lead code unit (1st surrogate) */
401 /* callback(illegal) */
374ca955
A
402 cnv->fromUChar32 = ch;
403 *err = U_ILLEGAL_CHAR_FOUND;
404 break;
b75a7d8f
A
405 }
406 } else {
407 /* no more input */
374ca955 408 cnv->fromUChar32 = ch;
b75a7d8f
A
409 break;
410 }
411 } else {
412 /* this is an unmatched trail code unit (2nd surrogate) */
413 /* callback(illegal) */
374ca955 414 cnv->fromUChar32 = ch;
b75a7d8f 415 *err = U_ILLEGAL_CHAR_FOUND;
374ca955 416 break;
b75a7d8f
A
417 }
418 }
419
420 if (ch < 0x10000)
421 {
422 indexToWrite = 2;
423 temp[2] = (char) ((ch >> 12) | 0xe0);
424 }
425 else
426 {
427 indexToWrite = 3;
428 temp[3] = (char) ((ch >> 18) | 0xf0);
429 temp[2] = (char) (((ch >> 12) & 0x3f) | 0x80);
430 }
431 temp[1] = (char) (((ch >> 6) & 0x3f) | 0x80);
432 temp[0] = (char) ((ch & 0x3f) | 0x80);
433
434 for (; indexToWrite >= 0; indexToWrite--)
435 {
436 if (myTarget < targetLimit)
437 {
438 *(myTarget++) = temp[indexToWrite];
439 }
440 else
441 {
442 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = temp[indexToWrite];
443 *err = U_BUFFER_OVERFLOW_ERROR;
444 }
445 }
446 }
447 }
448
449 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
450 {
451 *err = U_BUFFER_OVERFLOW_ERROR;
452 }
b75a7d8f
A
453
454 args->target = (char *) myTarget;
455 args->source = mySource;
456}
457
374ca955 458U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
b75a7d8f
A
459 UErrorCode * err)
460{
461 UConverter *cnv = args->converter;
462 const UChar *mySource = args->source;
463 unsigned char *myTarget = (unsigned char *) args->target;
464 int32_t *myOffsets = args->offsets;
465 const UChar *sourceLimit = args->sourceLimit;
466 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
467 UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
73c04bcf 468 UChar32 ch;
b75a7d8f
A
469 int32_t offsetNum, nextSourceIndex;
470 int16_t indexToWrite;
471 char temp[4];
472
374ca955 473 if (cnv->fromUChar32 && myTarget < targetLimit)
b75a7d8f 474 {
374ca955
A
475 ch = cnv->fromUChar32;
476 cnv->fromUChar32 = 0;
b75a7d8f
A
477 offsetNum = -1;
478 nextSourceIndex = 0;
479 goto lowsurrogate;
480 } else {
481 offsetNum = 0;
482 }
483
484 while (mySource < sourceLimit && myTarget < targetLimit)
485 {
486 ch = *(mySource++);
487
488 if (ch < 0x80) /* Single byte */
489 {
490 *(myOffsets++) = offsetNum++;
491 *(myTarget++) = (char) ch;
492 }
493 else if (ch < 0x800) /* Double byte */
494 {
495 *(myOffsets++) = offsetNum;
496 *(myTarget++) = (char) ((ch >> 6) | 0xc0);
497 if (myTarget < targetLimit)
498 {
499 *(myOffsets++) = offsetNum++;
500 *(myTarget++) = (char) ((ch & 0x3f) | 0x80);
501 }
502 else
503 {
504 cnv->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80);
505 cnv->charErrorBufferLength = 1;
506 *err = U_BUFFER_OVERFLOW_ERROR;
507 }
508 }
509 else
510 /* Check for surrogates */
511 {
512 nextSourceIndex = offsetNum + 1;
513
514 if(UTF_IS_SURROGATE(ch) && !isCESU8) {
515 if(UTF_IS_SURROGATE_FIRST(ch)) {
516lowsurrogate:
517 if (mySource < sourceLimit) {
518 /* test the following code unit */
519 UChar trail=*mySource;
520 if(UTF_IS_SECOND_SURROGATE(trail)) {
521 ++mySource;
522 ++nextSourceIndex;
523 ch=UTF16_GET_PAIR_VALUE(ch, trail);
b75a7d8f
A
524 /* convert this supplementary code point */
525 /* exit this condition tree */
526 } else {
527 /* this is an unmatched lead code unit (1st surrogate) */
528 /* callback(illegal) */
374ca955
A
529 cnv->fromUChar32 = ch;
530 *err = U_ILLEGAL_CHAR_FOUND;
531 break;
b75a7d8f
A
532 }
533 } else {
534 /* no more input */
374ca955 535 cnv->fromUChar32 = ch;
b75a7d8f
A
536 break;
537 }
538 } else {
539 /* this is an unmatched trail code unit (2nd surrogate) */
540 /* callback(illegal) */
374ca955 541 cnv->fromUChar32 = ch;
b75a7d8f 542 *err = U_ILLEGAL_CHAR_FOUND;
374ca955 543 break;
b75a7d8f
A
544 }
545 }
546
547 if (ch < 0x10000)
548 {
549 indexToWrite = 2;
550 temp[2] = (char) ((ch >> 12) | 0xe0);
551 }
552 else
553 {
554 indexToWrite = 3;
555 temp[3] = (char) ((ch >> 18) | 0xf0);
556 temp[2] = (char) (((ch >> 12) & 0x3f) | 0x80);
557 }
558 temp[1] = (char) (((ch >> 6) & 0x3f) | 0x80);
559 temp[0] = (char) ((ch & 0x3f) | 0x80);
560
561 for (; indexToWrite >= 0; indexToWrite--)
562 {
563 if (myTarget < targetLimit)
564 {
565 *(myOffsets++) = offsetNum;
566 *(myTarget++) = temp[indexToWrite];
567 }
568 else
569 {
570 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = temp[indexToWrite];
571 *err = U_BUFFER_OVERFLOW_ERROR;
572 }
573 }
574 offsetNum = nextSourceIndex;
575 }
576 }
577
578 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
579 {
580 *err = U_BUFFER_OVERFLOW_ERROR;
581 }
b75a7d8f
A
582
583 args->target = (char *) myTarget;
584 args->source = mySource;
585 args->offsets = myOffsets;
586}
587
374ca955 588static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
b75a7d8f 589 UErrorCode *err) {
374ca955
A
590 UConverter *cnv;
591 const uint8_t *sourceInitial;
b75a7d8f 592 const uint8_t *source;
b75a7d8f
A
593 uint16_t extraBytesToWrite;
594 uint8_t myByte;
595 UChar32 ch;
374ca955 596 int8_t i, isLegalSequence;
b75a7d8f 597
374ca955
A
598 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
599
600 cnv = args->converter;
601 sourceInitial = source = (const uint8_t *)args->source;
602 if (source >= (const uint8_t *)args->sourceLimit)
b75a7d8f 603 {
374ca955
A
604 /* no input */
605 *err = U_INDEX_OUTOFBOUNDS_ERROR;
606 return 0xffff;
607 }
b75a7d8f 608
374ca955
A
609 myByte = (uint8_t)*(source++);
610 if (myByte < 0x80)
611 {
612 args->source = (const char *)source;
613 return (UChar32)myByte;
614 }
b75a7d8f 615
374ca955
A
616 extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
617 if (extraBytesToWrite == 0) {
618 cnv->toUBytes[0] = myByte;
619 cnv->toULength = 1;
620 *err = U_ILLEGAL_CHAR_FOUND;
b75a7d8f 621 args->source = (const char *)source;
374ca955
A
622 return 0xffff;
623 }
b75a7d8f 624
374ca955
A
625 /*The byte sequence is longer than the buffer area passed*/
626 if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
627 {
628 /* check if all of the remaining bytes are trail bytes */
629 cnv->toUBytes[0] = myByte;
630 i = 1;
631 *err = U_TRUNCATED_CHAR_FOUND;
632 while(source < (const uint8_t *)args->sourceLimit) {
633 if(U8_IS_TRAIL(myByte = *source)) {
634 cnv->toUBytes[i++] = myByte;
635 ++source;
b75a7d8f 636 } else {
374ca955
A
637 /* error even before we run out of input */
638 *err = U_ILLEGAL_CHAR_FOUND;
639 break;
b75a7d8f
A
640 }
641 }
374ca955
A
642 cnv->toULength = i;
643 args->source = (const char *)source;
644 return 0xffff;
645 }
b75a7d8f 646
374ca955
A
647 isLegalSequence = 1;
648 ch = myByte << 6;
649 switch(extraBytesToWrite)
650 {
651 /* note: code falls through cases! (sic)*/
652 case 6:
653 ch += (myByte = *source);
654 ch <<= 6;
655 if (!UTF8_IS_TRAIL(myByte))
656 {
657 isLegalSequence = 0;
658 break;
b75a7d8f 659 }
374ca955
A
660 ++source;
661 case 5:
662 ch += (myByte = *source);
663 ch <<= 6;
664 if (!UTF8_IS_TRAIL(myByte))
665 {
666 isLegalSequence = 0;
667 break;
668 }
669 ++source;
670 case 4:
671 ch += (myByte = *source);
672 ch <<= 6;
673 if (!UTF8_IS_TRAIL(myByte))
674 {
675 isLegalSequence = 0;
676 break;
677 }
678 ++source;
679 case 3:
680 ch += (myByte = *source);
681 ch <<= 6;
682 if (!UTF8_IS_TRAIL(myByte))
683 {
684 isLegalSequence = 0;
685 break;
686 }
687 ++source;
688 case 2:
689 ch += (myByte = *source);
690 if (!UTF8_IS_TRAIL(myByte))
691 {
692 isLegalSequence = 0;
693 break;
694 }
695 ++source;
696 };
697 ch -= offsetsFromUTF8[extraBytesToWrite];
698 args->source = (const char *)source;
699
700 /*
701 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
702 * - use only trail bytes after a lead byte (checked above)
703 * - use the right number of trail bytes for a given lead byte
704 * - encode a code point <= U+10ffff
705 * - use the fewest possible number of bytes for their code points
706 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
707 *
708 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
709 * There are no irregular sequences any more.
710 */
711 if (isLegalSequence &&
712 (uint32_t)ch <= MAXIMUM_UTF &&
713 (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
714 !U_IS_SURROGATE(ch)
715 ) {
716 return ch; /* return the code point */
b75a7d8f
A
717 }
718
374ca955
A
719 for(i = 0; sourceInitial < source; ++i) {
720 cnv->toUBytes[i] = *sourceInitial++;
721 }
722 cnv->toULength = i;
723 *err = U_ILLEGAL_CHAR_FOUND;
b75a7d8f
A
724 return 0xffff;
725}
726
727/* UTF-8 converter data ----------------------------------------------------- */
728
729static const UConverterImpl _UTF8Impl={
730 UCNV_UTF8,
731
732 NULL,
733 NULL,
734
735 NULL,
736 NULL,
737 NULL,
738
374ca955
A
739 ucnv_toUnicode_UTF8,
740 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
741 ucnv_fromUnicode_UTF8,
742 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
743 ucnv_getNextUChar_UTF8,
b75a7d8f
A
744
745 NULL,
746 NULL,
747 NULL,
748 NULL,
749 ucnv_getNonSurrogateUnicodeSet
750};
751
752/* The 1208 CCSID refers to any version of Unicode of UTF-8 */
753static const UConverterStaticData _UTF8StaticData={
754 sizeof(UConverterStaticData),
755 "UTF-8",
374ca955
A
756 1208, UCNV_IBM, UCNV_UTF8,
757 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
b75a7d8f
A
758 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
759 0,
760 0,
761 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
762};
763
764
765const UConverterSharedData _UTF8Data={
766 sizeof(UConverterSharedData), ~((uint32_t) 0),
767 NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl,
768 0
769};
770
771/* CESU-8 converter data ---------------------------------------------------- */
772
374ca955
A
773static const UConverterImpl _CESU8Impl={
774 UCNV_CESU8,
775
776 NULL,
777 NULL,
778
779 NULL,
780 NULL,
781 NULL,
782
783 ucnv_toUnicode_UTF8,
784 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
785 ucnv_fromUnicode_UTF8,
786 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
787 NULL,
788
789 NULL,
790 NULL,
791 NULL,
792 NULL,
793 ucnv_getCompleteUnicodeSet
794};
795
b75a7d8f
A
796static const UConverterStaticData _CESU8StaticData={
797 sizeof(UConverterStaticData),
798 "CESU-8",
73c04bcf
A
799 9400, /* CCSID for CESU-8 */
800 UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
b75a7d8f
A
801 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
802 0,
803 0,
804 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
805};
806
807
808const UConverterSharedData _CESU8Data={
809 sizeof(UConverterSharedData), ~((uint32_t) 0),
374ca955 810 NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,
b75a7d8f
A
811 0
812};
374ca955
A
813
814#endif