]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnv_u8.c
ICU-6.2.22.tar.gz
[apple/icu.git] / icuSources / common / ucnv_u8.c
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
374ca955 3* Copyright (C) 2002-2004, International Business Machines
b75a7d8f
A
4* Corporation and others. All Rights Reserved.
5**********************************************************************
6* file name: ucnv_u8.c
7* encoding: US-ASCII
8* tab size: 8 (not used)
9* indentation:4
10*
11* created on: 2002jul01
12* created by: Markus W. Scherer
13*
14* UTF-8 converter implementation. Used to be in ucnv_utf.c.
15*
16* Also, CESU-8 implementation, see UTR 26.
17* The CESU-8 converter uses all the same functions as the
18* UTF-8 converter, with a branch for converting supplementary code points.
19*/
20
21#include "unicode/utypes.h"
374ca955
A
22
23#if !UCONFIG_NO_CONVERSION
24
b75a7d8f 25#include "unicode/ucnv.h"
b75a7d8f
A
26#include "ucnv_bld.h"
27#include "ucnv_cnv.h"
28#include "cmemory.h"
29
30/* Prototypes --------------------------------------------------------------- */
31
32/* Keep these here to make finicky compilers happy */
33
374ca955 34U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
b75a7d8f 35 UErrorCode *err);
374ca955 36U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
b75a7d8f 37 UErrorCode *err);
b75a7d8f
A
38
39
40/* UTF-8 -------------------------------------------------------------------- */
41
/*
 * UTF-8 conversion constants.
 * For more information see the Unicode Standard 2.0, Transformation Formats,
 * Appendix A-9.
 */
/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/

/* Upper bounds of the 16-bit, Unicode and 31-bit code point ranges. */
#define MAXIMUM_UCS2            0x0000FFFF
#define MAXIMUM_UTF             0x0010FFFF
#define MAXIMUM_UCS4            0x7FFFFFFF

/* Surrogate pair arithmetic: 10 payload bits per surrogate, pairs start at U+10000. */
#define HALF_SHIFT              10
#define HALF_BASE               0x0010000
#define HALF_MASK               0x3FF

/* UTF-16 surrogate code unit ranges. */
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* -SURROGATE_LOW_START + HALF_BASE */
#define SURROGATE_LOW_BASE      9216
59
/*
 * offsetsFromUTF8[n] is the value that has to be subtracted after n bytes of
 * a sequence were folded together with ch = (ch << 6) + byte: it is the sum
 * of the lead-byte marker and the 0x80 trail-byte markers at their shifted
 * positions (for example 2 bytes: (0xC0 << 6) + 0x80 == 0x3080).
 * Index 0 is unused padding so the table can be indexed by sequence length.
 */
static const uint32_t offsetsFromUTF8[7] = {
    0,           /* 0: not a sequence length */
    0x00000000,  /* 1 byte  */
    0x00003080,  /* 2 bytes */
    0x000E2080,  /* 3 bytes */
    0x03C82080,  /* 4 bytes */
    0xFA082080,  /* 5 bytes */
    0x82082080   /* 6 bytes */
};
64
65/* END OF UTF-8 Conversion DATA */
66
/*
 * Total sequence length announced by each possible first byte.
 * 0 marks bytes that cannot start a sequence: the trail bytes 0x80..0xBF
 * and 0xFE/0xFF.
 */
static const int8_t bytesFromUTF8[256] = {
    /* 0x00..0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: trail bytes, never a lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two-byte lead bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three-byte lead bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xF7: four, 0xF8..0xFB: five, 0xFC..0xFD: six, 0xFE..0xFF: invalid */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
77
/*
 * Starting with Unicode 3.0.1:
 * a UTF-8 byte sequence of length N _must_ encode a code point at or above
 * utf8_minChar32[N] (shortest-form rule).
 * Byte sequences with more than 4 bytes are illegal in UTF-8; their entries
 * hold an impossible minimum so that the range test always rejects them.
 */
static const uint32_t utf8_minChar32[7] = {
    0,           /* 0: not a sequence length */
    0,           /* 1 byte  */
    0x80,        /* 2 bytes */
    0x800,       /* 3 bytes */
    0x10000,     /* 4 bytes */
    0xffffffff,  /* 5 bytes: always illegal */
    0xffffffff   /* 6 bytes: always illegal */
};
86
374ca955 87static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
b75a7d8f
A
88 UErrorCode * err)
89{
90 const unsigned char *mySource = (unsigned char *) args->source;
91 UChar *myTarget = args->target;
92 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
93 const UChar *targetLimit = args->targetLimit;
94 unsigned char *toUBytes = args->converter->toUBytes;
95 UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
96 uint32_t ch, ch2 = 0;
97 int32_t i, inBytes;
98
99 /* Restore size of current sequence */
b75a7d8f
A
100 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
101 {
102 inBytes = args->converter->mode; /* restore # of bytes to consume */
103 i = args->converter->toULength; /* restore # of bytes consumed */
104
105 ch = args->converter->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
106 args->converter->toUnicodeStatus = 0;
107 goto morebytes;
108 }
109
110
111 while (mySource < sourceLimit && myTarget < targetLimit)
112 {
113 ch = *(mySource++);
114 if (ch < 0x80) /* Simple case */
115 {
116 *(myTarget++) = (UChar) ch;
117 }
118 else
119 {
120 /* store the first char */
121 toUBytes[0] = (char)ch;
122 inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
123 i = 1;
124
125morebytes:
126 while (i < inBytes)
127 {
128 if (mySource < sourceLimit)
129 {
130 toUBytes[i] = (char) (ch2 = *mySource);
131 if (!UTF8_IS_TRAIL(ch2))
132 {
133 break; /* i < inBytes */
134 }
135 ch = (ch << 6) + ch2;
136 ++mySource;
137 i++;
138 }
139 else
140 {
374ca955
A
141 /* stores a partially calculated target*/
142 args->converter->toUnicodeStatus = ch;
143 args->converter->mode = inBytes;
144 args->converter->toULength = (int8_t) i;
b75a7d8f
A
145 goto donefornow;
146 }
147 }
148
149 /* Remove the accumulated high bits */
150 ch -= offsetsFromUTF8[inBytes];
151
152 /*
153 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
154 * - use only trail bytes after a lead byte (checked above)
155 * - use the right number of trail bytes for a given lead byte
156 * - encode a code point <= U+10ffff
157 * - use the fewest possible number of bytes for their code points
158 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
159 *
160 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
161 * There are no irregular sequences any more.
162 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
163 */
164 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
165 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
166 {
167 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
374ca955 168 args->converter->toULength = 0;
b75a7d8f
A
169 if (ch <= MAXIMUM_UCS2)
170 {
171 /* fits in 16 bits */
172 *(myTarget++) = (UChar) ch;
173 }
174 else
175 {
176 /* write out the surrogates */
177 ch -= HALF_BASE;
178 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
179 ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
180 if (myTarget < targetLimit)
181 {
182 *(myTarget++) = (UChar)ch;
183 }
184 else
185 {
186 /* Put in overflow buffer (not handled here) */
187 args->converter->UCharErrorBuffer[0] = (UChar) ch;
188 args->converter->UCharErrorBufferLength = 1;
189 *err = U_BUFFER_OVERFLOW_ERROR;
190 break;
191 }
192 }
193 }
194 else
195 {
b75a7d8f 196 args->converter->toULength = (int8_t)i;
374ca955
A
197 *err = U_ILLEGAL_CHAR_FOUND;
198 break;
b75a7d8f
A
199 }
200 }
201 }
202
203donefornow:
204 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
205 {
206 /* End of target buffer */
207 *err = U_BUFFER_OVERFLOW_ERROR;
208 }
209
210 args->target = myTarget;
211 args->source = (const char *) mySource;
212}
213
374ca955 214static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
b75a7d8f
A
215 UErrorCode * err)
216{
217 const unsigned char *mySource = (unsigned char *) args->source;
218 UChar *myTarget = args->target;
219 int32_t *myOffsets = args->offsets;
220 int32_t offsetNum = 0;
221 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
222 const UChar *targetLimit = args->targetLimit;
223 unsigned char *toUBytes = args->converter->toUBytes;
224 UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
225 uint32_t ch, ch2 = 0;
226 int32_t i, inBytes;
227
228 /* Restore size of current sequence */
b75a7d8f
A
229 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
230 {
231 inBytes = args->converter->mode; /* restore # of bytes to consume */
232 i = args->converter->toULength; /* restore # of bytes consumed */
233
234 ch = args->converter->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
235 args->converter->toUnicodeStatus = 0;
236 goto morebytes;
237 }
238
239 while (mySource < sourceLimit && myTarget < targetLimit)
240 {
241 ch = *(mySource++);
242 if (ch < 0x80) /* Simple case */
243 {
244 *(myTarget++) = (UChar) ch;
245 *(myOffsets++) = offsetNum++;
246 }
247 else
248 {
249 toUBytes[0] = (char)ch;
250 inBytes = bytesFromUTF8[ch];
251 i = 1;
252
253morebytes:
254 while (i < inBytes)
255 {
256 if (mySource < sourceLimit)
257 {
258 toUBytes[i] = (char) (ch2 = *mySource);
259 if (!UTF8_IS_TRAIL(ch2))
260 {
261 break; /* i < inBytes */
262 }
263 ch = (ch << 6) + ch2;
264 ++mySource;
265 i++;
266 }
267 else
268 {
374ca955
A
269 args->converter->toUnicodeStatus = ch;
270 args->converter->mode = inBytes;
271 args->converter->toULength = (int8_t)i;
b75a7d8f
A
272 goto donefornow;
273 }
274 }
275
276 /* Remove the accumulated high bits */
277 ch -= offsetsFromUTF8[inBytes];
278
279 /*
280 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
281 * - use only trail bytes after a lead byte (checked above)
282 * - use the right number of trail bytes for a given lead byte
283 * - encode a code point <= U+10ffff
284 * - use the fewest possible number of bytes for their code points
285 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
286 *
287 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
288 * There are no irregular sequences any more.
289 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
290 */
291 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
292 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
293 {
294 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
374ca955 295 args->converter->toULength = 0;
b75a7d8f
A
296 if (ch <= MAXIMUM_UCS2)
297 {
298 /* fits in 16 bits */
299 *(myTarget++) = (UChar) ch;
300 *(myOffsets++) = offsetNum;
301 }
302 else
303 {
304 /* write out the surrogates */
305 ch -= HALF_BASE;
306 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
307 *(myOffsets++) = offsetNum;
308 ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
309 if (myTarget < targetLimit)
310 {
311 *(myTarget++) = (UChar)ch;
312 *(myOffsets++) = offsetNum;
313 }
314 else
315 {
316 args->converter->UCharErrorBuffer[0] = (UChar) ch;
317 args->converter->UCharErrorBufferLength = 1;
318 *err = U_BUFFER_OVERFLOW_ERROR;
319 }
320 }
321 offsetNum += i;
322 }
323 else
324 {
b75a7d8f 325 args->converter->toULength = (int8_t)i;
374ca955
A
326 *err = U_ILLEGAL_CHAR_FOUND;
327 break;
b75a7d8f
A
328 }
329 }
330 }
331
332donefornow:
333 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
334 { /* End of target buffer */
335 *err = U_BUFFER_OVERFLOW_ERROR;
336 }
337
338 args->target = myTarget;
339 args->source = (const char *) mySource;
340 args->offsets = myOffsets;
341}
342
374ca955 343U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
b75a7d8f
A
344 UErrorCode * err)
345{
346 UConverter *cnv = args->converter;
347 const UChar *mySource = args->source;
348 unsigned char *myTarget = (unsigned char *) args->target;
349 const UChar *sourceLimit = args->sourceLimit;
350 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
351 UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
374ca955 352 UChar32 ch, ch2;
b75a7d8f
A
353 int16_t indexToWrite;
354 char temp[4];
355
374ca955 356 if (cnv->fromUChar32 && myTarget < targetLimit)
b75a7d8f 357 {
374ca955
A
358 ch = cnv->fromUChar32;
359 cnv->fromUChar32 = 0;
b75a7d8f
A
360 goto lowsurrogate;
361 }
362
363 while (mySource < sourceLimit && myTarget < targetLimit)
364 {
365 ch = *(mySource++);
366
367 if (ch < 0x80) /* Single byte */
368 {
369 *(myTarget++) = (char) ch;
370 }
371 else if (ch < 0x800) /* Double byte */
372 {
373 *(myTarget++) = (char) ((ch >> 6) | 0xc0);
374 if (myTarget < targetLimit)
375 {
376 *(myTarget++) = (char) ((ch & 0x3f) | 0x80);
377 }
378 else
379 {
380 cnv->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80);
381 cnv->charErrorBufferLength = 1;
382 *err = U_BUFFER_OVERFLOW_ERROR;
383 }
384 }
385 else
386 /* Check for surrogates */
387 {
388 if(UTF_IS_SURROGATE(ch) && !isCESU8) {
389 if(UTF_IS_SURROGATE_FIRST(ch)) {
390lowsurrogate:
391 if (mySource < sourceLimit) {
392 /* test the following code unit */
393 UChar trail=*mySource;
394 if(UTF_IS_SECOND_SURROGATE(trail)) {
395 ++mySource;
396 ch=UTF16_GET_PAIR_VALUE(ch, trail);
397 ch2 = 0;
398 /* convert this supplementary code point */
399 /* exit this condition tree */
400 } else {
401 /* this is an unmatched lead code unit (1st surrogate) */
402 /* callback(illegal) */
374ca955
A
403 cnv->fromUChar32 = ch;
404 *err = U_ILLEGAL_CHAR_FOUND;
405 break;
b75a7d8f
A
406 }
407 } else {
408 /* no more input */
374ca955 409 cnv->fromUChar32 = ch;
b75a7d8f
A
410 break;
411 }
412 } else {
413 /* this is an unmatched trail code unit (2nd surrogate) */
414 /* callback(illegal) */
374ca955 415 cnv->fromUChar32 = ch;
b75a7d8f 416 *err = U_ILLEGAL_CHAR_FOUND;
374ca955 417 break;
b75a7d8f
A
418 }
419 }
420
421 if (ch < 0x10000)
422 {
423 indexToWrite = 2;
424 temp[2] = (char) ((ch >> 12) | 0xe0);
425 }
426 else
427 {
428 indexToWrite = 3;
429 temp[3] = (char) ((ch >> 18) | 0xf0);
430 temp[2] = (char) (((ch >> 12) & 0x3f) | 0x80);
431 }
432 temp[1] = (char) (((ch >> 6) & 0x3f) | 0x80);
433 temp[0] = (char) ((ch & 0x3f) | 0x80);
434
435 for (; indexToWrite >= 0; indexToWrite--)
436 {
437 if (myTarget < targetLimit)
438 {
439 *(myTarget++) = temp[indexToWrite];
440 }
441 else
442 {
443 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = temp[indexToWrite];
444 *err = U_BUFFER_OVERFLOW_ERROR;
445 }
446 }
447 }
448 }
449
450 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
451 {
452 *err = U_BUFFER_OVERFLOW_ERROR;
453 }
b75a7d8f
A
454
455 args->target = (char *) myTarget;
456 args->source = mySource;
457}
458
374ca955 459U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
b75a7d8f
A
460 UErrorCode * err)
461{
462 UConverter *cnv = args->converter;
463 const UChar *mySource = args->source;
464 unsigned char *myTarget = (unsigned char *) args->target;
465 int32_t *myOffsets = args->offsets;
466 const UChar *sourceLimit = args->sourceLimit;
467 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
468 UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
374ca955 469 UChar32 ch, ch2;
b75a7d8f
A
470 int32_t offsetNum, nextSourceIndex;
471 int16_t indexToWrite;
472 char temp[4];
473
374ca955 474 if (cnv->fromUChar32 && myTarget < targetLimit)
b75a7d8f 475 {
374ca955
A
476 ch = cnv->fromUChar32;
477 cnv->fromUChar32 = 0;
b75a7d8f
A
478 offsetNum = -1;
479 nextSourceIndex = 0;
480 goto lowsurrogate;
481 } else {
482 offsetNum = 0;
483 }
484
485 while (mySource < sourceLimit && myTarget < targetLimit)
486 {
487 ch = *(mySource++);
488
489 if (ch < 0x80) /* Single byte */
490 {
491 *(myOffsets++) = offsetNum++;
492 *(myTarget++) = (char) ch;
493 }
494 else if (ch < 0x800) /* Double byte */
495 {
496 *(myOffsets++) = offsetNum;
497 *(myTarget++) = (char) ((ch >> 6) | 0xc0);
498 if (myTarget < targetLimit)
499 {
500 *(myOffsets++) = offsetNum++;
501 *(myTarget++) = (char) ((ch & 0x3f) | 0x80);
502 }
503 else
504 {
505 cnv->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80);
506 cnv->charErrorBufferLength = 1;
507 *err = U_BUFFER_OVERFLOW_ERROR;
508 }
509 }
510 else
511 /* Check for surrogates */
512 {
513 nextSourceIndex = offsetNum + 1;
514
515 if(UTF_IS_SURROGATE(ch) && !isCESU8) {
516 if(UTF_IS_SURROGATE_FIRST(ch)) {
517lowsurrogate:
518 if (mySource < sourceLimit) {
519 /* test the following code unit */
520 UChar trail=*mySource;
521 if(UTF_IS_SECOND_SURROGATE(trail)) {
522 ++mySource;
523 ++nextSourceIndex;
524 ch=UTF16_GET_PAIR_VALUE(ch, trail);
525 ch2 = 0;
526 /* convert this supplementary code point */
527 /* exit this condition tree */
528 } else {
529 /* this is an unmatched lead code unit (1st surrogate) */
530 /* callback(illegal) */
374ca955
A
531 cnv->fromUChar32 = ch;
532 *err = U_ILLEGAL_CHAR_FOUND;
533 break;
b75a7d8f
A
534 }
535 } else {
536 /* no more input */
374ca955 537 cnv->fromUChar32 = ch;
b75a7d8f
A
538 break;
539 }
540 } else {
541 /* this is an unmatched trail code unit (2nd surrogate) */
542 /* callback(illegal) */
374ca955 543 cnv->fromUChar32 = ch;
b75a7d8f 544 *err = U_ILLEGAL_CHAR_FOUND;
374ca955 545 break;
b75a7d8f
A
546 }
547 }
548
549 if (ch < 0x10000)
550 {
551 indexToWrite = 2;
552 temp[2] = (char) ((ch >> 12) | 0xe0);
553 }
554 else
555 {
556 indexToWrite = 3;
557 temp[3] = (char) ((ch >> 18) | 0xf0);
558 temp[2] = (char) (((ch >> 12) & 0x3f) | 0x80);
559 }
560 temp[1] = (char) (((ch >> 6) & 0x3f) | 0x80);
561 temp[0] = (char) ((ch & 0x3f) | 0x80);
562
563 for (; indexToWrite >= 0; indexToWrite--)
564 {
565 if (myTarget < targetLimit)
566 {
567 *(myOffsets++) = offsetNum;
568 *(myTarget++) = temp[indexToWrite];
569 }
570 else
571 {
572 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = temp[indexToWrite];
573 *err = U_BUFFER_OVERFLOW_ERROR;
574 }
575 }
576 offsetNum = nextSourceIndex;
577 }
578 }
579
580 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
581 {
582 *err = U_BUFFER_OVERFLOW_ERROR;
583 }
b75a7d8f
A
584
585 args->target = (char *) myTarget;
586 args->source = mySource;
587 args->offsets = myOffsets;
588}
589
374ca955 590static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
b75a7d8f 591 UErrorCode *err) {
374ca955
A
592 UConverter *cnv;
593 const uint8_t *sourceInitial;
b75a7d8f 594 const uint8_t *source;
b75a7d8f
A
595 uint16_t extraBytesToWrite;
596 uint8_t myByte;
597 UChar32 ch;
374ca955 598 int8_t i, isLegalSequence;
b75a7d8f 599
374ca955
A
600 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
601
602 cnv = args->converter;
603 sourceInitial = source = (const uint8_t *)args->source;
604 if (source >= (const uint8_t *)args->sourceLimit)
b75a7d8f 605 {
374ca955
A
606 /* no input */
607 *err = U_INDEX_OUTOFBOUNDS_ERROR;
608 return 0xffff;
609 }
b75a7d8f 610
374ca955
A
611 myByte = (uint8_t)*(source++);
612 if (myByte < 0x80)
613 {
614 args->source = (const char *)source;
615 return (UChar32)myByte;
616 }
b75a7d8f 617
374ca955
A
618 extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
619 if (extraBytesToWrite == 0) {
620 cnv->toUBytes[0] = myByte;
621 cnv->toULength = 1;
622 *err = U_ILLEGAL_CHAR_FOUND;
b75a7d8f 623 args->source = (const char *)source;
374ca955
A
624 return 0xffff;
625 }
b75a7d8f 626
374ca955
A
627 /*The byte sequence is longer than the buffer area passed*/
628 if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
629 {
630 /* check if all of the remaining bytes are trail bytes */
631 cnv->toUBytes[0] = myByte;
632 i = 1;
633 *err = U_TRUNCATED_CHAR_FOUND;
634 while(source < (const uint8_t *)args->sourceLimit) {
635 if(U8_IS_TRAIL(myByte = *source)) {
636 cnv->toUBytes[i++] = myByte;
637 ++source;
b75a7d8f 638 } else {
374ca955
A
639 /* error even before we run out of input */
640 *err = U_ILLEGAL_CHAR_FOUND;
641 break;
b75a7d8f
A
642 }
643 }
374ca955
A
644 cnv->toULength = i;
645 args->source = (const char *)source;
646 return 0xffff;
647 }
b75a7d8f 648
374ca955
A
649 isLegalSequence = 1;
650 ch = myByte << 6;
651 switch(extraBytesToWrite)
652 {
653 /* note: code falls through cases! (sic)*/
654 case 6:
655 ch += (myByte = *source);
656 ch <<= 6;
657 if (!UTF8_IS_TRAIL(myByte))
658 {
659 isLegalSequence = 0;
660 break;
b75a7d8f 661 }
374ca955
A
662 ++source;
663 case 5:
664 ch += (myByte = *source);
665 ch <<= 6;
666 if (!UTF8_IS_TRAIL(myByte))
667 {
668 isLegalSequence = 0;
669 break;
670 }
671 ++source;
672 case 4:
673 ch += (myByte = *source);
674 ch <<= 6;
675 if (!UTF8_IS_TRAIL(myByte))
676 {
677 isLegalSequence = 0;
678 break;
679 }
680 ++source;
681 case 3:
682 ch += (myByte = *source);
683 ch <<= 6;
684 if (!UTF8_IS_TRAIL(myByte))
685 {
686 isLegalSequence = 0;
687 break;
688 }
689 ++source;
690 case 2:
691 ch += (myByte = *source);
692 if (!UTF8_IS_TRAIL(myByte))
693 {
694 isLegalSequence = 0;
695 break;
696 }
697 ++source;
698 };
699 ch -= offsetsFromUTF8[extraBytesToWrite];
700 args->source = (const char *)source;
701
702 /*
703 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
704 * - use only trail bytes after a lead byte (checked above)
705 * - use the right number of trail bytes for a given lead byte
706 * - encode a code point <= U+10ffff
707 * - use the fewest possible number of bytes for their code points
708 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
709 *
710 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
711 * There are no irregular sequences any more.
712 */
713 if (isLegalSequence &&
714 (uint32_t)ch <= MAXIMUM_UTF &&
715 (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
716 !U_IS_SURROGATE(ch)
717 ) {
718 return ch; /* return the code point */
b75a7d8f
A
719 }
720
374ca955
A
721 for(i = 0; sourceInitial < source; ++i) {
722 cnv->toUBytes[i] = *sourceInitial++;
723 }
724 cnv->toULength = i;
725 *err = U_ILLEGAL_CHAR_FOUND;
b75a7d8f
A
726 return 0xffff;
727}
728
729/* UTF-8 converter data ----------------------------------------------------- */
730
731static const UConverterImpl _UTF8Impl={
732 UCNV_UTF8,
733
734 NULL,
735 NULL,
736
737 NULL,
738 NULL,
739 NULL,
740
374ca955
A
741 ucnv_toUnicode_UTF8,
742 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
743 ucnv_fromUnicode_UTF8,
744 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
745 ucnv_getNextUChar_UTF8,
b75a7d8f
A
746
747 NULL,
748 NULL,
749 NULL,
750 NULL,
751 ucnv_getNonSurrogateUnicodeSet
752};
753
754/* The 1208 CCSID refers to any version of Unicode of UTF-8 */
755static const UConverterStaticData _UTF8StaticData={
756 sizeof(UConverterStaticData),
757 "UTF-8",
374ca955
A
758 1208, UCNV_IBM, UCNV_UTF8,
759 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
b75a7d8f
A
760 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
761 0,
762 0,
763 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
764};
765
766
767const UConverterSharedData _UTF8Data={
768 sizeof(UConverterSharedData), ~((uint32_t) 0),
769 NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl,
770 0
771};
772
773/* CESU-8 converter data ---------------------------------------------------- */
774
374ca955
A
775static const UConverterImpl _CESU8Impl={
776 UCNV_CESU8,
777
778 NULL,
779 NULL,
780
781 NULL,
782 NULL,
783 NULL,
784
785 ucnv_toUnicode_UTF8,
786 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
787 ucnv_fromUnicode_UTF8,
788 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
789 NULL,
790
791 NULL,
792 NULL,
793 NULL,
794 NULL,
795 ucnv_getCompleteUnicodeSet
796};
797
b75a7d8f
A
798static const UConverterStaticData _CESU8StaticData={
799 sizeof(UConverterStaticData),
800 "CESU-8",
801 0, UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
802 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
803 0,
804 0,
805 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
806};
807
808
809const UConverterSharedData _CESU8Data={
810 sizeof(UConverterSharedData), ~((uint32_t) 0),
374ca955 811 NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,
b75a7d8f
A
812 0
813};
374ca955
A
814
815#endif