]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnv_u8.c
ICU-551.51.4.tar.gz
[apple/icu.git] / icuSources / common / ucnv_u8.c
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
b331163b 3* Copyright (C) 2002-2015, International Business Machines
b75a7d8f
A
4* Corporation and others. All Rights Reserved.
5**********************************************************************
6* file name: ucnv_u8.c
7* encoding: US-ASCII
8* tab size: 8 (not used)
9* indentation:4
10*
11* created on: 2002jul01
12* created by: Markus W. Scherer
13*
14* UTF-8 converter implementation. Used to be in ucnv_utf.c.
15*
16* Also, CESU-8 implementation, see UTR 26.
17* The CESU-8 converter uses all the same functions as the
18* UTF-8 converter, with a branch for converting supplementary code points.
19*/
20
21#include "unicode/utypes.h"
374ca955
A
22
23#if !UCONFIG_NO_CONVERSION
24
b75a7d8f 25#include "unicode/ucnv.h"
4388f060
A
26#include "unicode/utf.h"
27#include "unicode/utf8.h"
28#include "unicode/utf16.h"
b75a7d8f
A
29#include "ucnv_bld.h"
30#include "ucnv_cnv.h"
31#include "cmemory.h"
32
33/* Prototypes --------------------------------------------------------------- */
34
35/* Keep these here to make finicky compilers happy */
36
374ca955 37U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
b75a7d8f 38 UErrorCode *err);
374ca955 39U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
b75a7d8f 40 UErrorCode *err);
b75a7d8f
A
41
42
43/* UTF-8 -------------------------------------------------------------------- */
44
45/* UTF-8 Conversion DATA
46f4442e 46 * for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
b75a7d8f
A
47 */
48/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
/* Upper bounds of the code point ranges used by the range checks below. */
#define MAXIMUM_UCS2            0xFFFF
#define MAXIMUM_UTF             0x10FFFF
#define MAXIMUM_UCS4            0x7FFFFFFF

/* Constants used to split a supplementary code point into two surrogates. */
#define HALF_SHIFT              10
#define HALF_BASE               0x10000
#define HALF_MASK               0x3FF

/* First and last code units of the lead ("high") and trail ("low") surrogate ranges. */
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* HALF_BASE - SURROGATE_LOW_START, i.e. decimal 9216 */
#define SURROGATE_LOW_BASE      0x2400
62
/*
 * Indexed by total sequence length (0..6): the value that the decoders below
 * subtract from the raw accumulation ((lead<<6)+trail)<<6+... to strip the
 * marker bits of the lead and trail bytes.
 */
static const uint32_t offsetsFromUTF8[7] = {
    0x00000000U,    /* [0] unused */
    0x00000000U,    /* [1] */
    0x00003080U,    /* [2] */
    0x000E2080U,    /* [3] */
    0x03C82080U,    /* [4] */
    0xFA082080U,    /* [5] */
    0x82082080U     /* [6] */
};
67
68/* END OF UTF-8 Conversion DATA */
69
/*
 * Total sequence length announced by each possible first byte.
 * 0 marks bytes that cannot start a sequence (0x80..0xBF, 0xFE, 0xFF).
 */
static const int8_t bytesFromUTF8[256] = {
    /* 0x00..0x7F: single-byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: not a lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xFF: four, five, six bytes, then two non-lead values */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
80
81/*
82 * Starting with Unicode 3.0.1:
83 * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
84 * byte sequences with more than 4 bytes are illegal in UTF-8,
85 * which is tested with impossible values for them
86 */
static const uint32_t utf8_minChar32[7] = {
    0,              /* [0] */
    0,              /* [1] */
    0x80,           /* [2] */
    0x800,          /* [3] */
    0x10000,        /* [4] */
    0xffffffff,     /* [5] impossible minimum: always rejected */
    0xffffffff      /* [6] impossible minimum: always rejected */
};
89
b331163b
A
90static UBool hasCESU8Data(const UConverter *cnv)
91{
92#if UCONFIG_ONLY_HTML_CONVERSION
93 return FALSE;
94#else
95 return (UBool)(cnv->sharedData == &_CESU8Data);
96#endif
97}
98
374ca955 99static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
b75a7d8f
A
100 UErrorCode * err)
101{
46f4442e 102 UConverter *cnv = args->converter;
b75a7d8f
A
103 const unsigned char *mySource = (unsigned char *) args->source;
104 UChar *myTarget = args->target;
105 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
106 const UChar *targetLimit = args->targetLimit;
46f4442e 107 unsigned char *toUBytes = cnv->toUBytes;
b331163b 108 UBool isCESU8 = hasCESU8Data(cnv);
b75a7d8f
A
109 uint32_t ch, ch2 = 0;
110 int32_t i, inBytes;
b331163b 111
b75a7d8f 112 /* Restore size of current sequence */
46f4442e 113 if (cnv->toUnicodeStatus && myTarget < targetLimit)
b75a7d8f 114 {
46f4442e
A
115 inBytes = cnv->mode; /* restore # of bytes to consume */
116 i = cnv->toULength; /* restore # of bytes consumed */
117 cnv->toULength = 0;
b75a7d8f 118
46f4442e
A
119 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
120 cnv->toUnicodeStatus = 0;
b75a7d8f
A
121 goto morebytes;
122 }
123
124
125 while (mySource < sourceLimit && myTarget < targetLimit)
126 {
127 ch = *(mySource++);
128 if (ch < 0x80) /* Simple case */
129 {
130 *(myTarget++) = (UChar) ch;
131 }
132 else
133 {
134 /* store the first char */
135 toUBytes[0] = (char)ch;
136 inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
137 i = 1;
138
139morebytes:
140 while (i < inBytes)
141 {
142 if (mySource < sourceLimit)
143 {
144 toUBytes[i] = (char) (ch2 = *mySource);
4388f060 145 if (!U8_IS_TRAIL(ch2))
b75a7d8f
A
146 {
147 break; /* i < inBytes */
148 }
149 ch = (ch << 6) + ch2;
150 ++mySource;
151 i++;
152 }
153 else
154 {
374ca955 155 /* stores a partially calculated target*/
46f4442e
A
156 cnv->toUnicodeStatus = ch;
157 cnv->mode = inBytes;
158 cnv->toULength = (int8_t) i;
b75a7d8f
A
159 goto donefornow;
160 }
161 }
162
163 /* Remove the accumulated high bits */
164 ch -= offsetsFromUTF8[inBytes];
165
166 /*
167 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
168 * - use only trail bytes after a lead byte (checked above)
169 * - use the right number of trail bytes for a given lead byte
170 * - encode a code point <= U+10ffff
171 * - use the fewest possible number of bytes for their code points
172 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
173 *
174 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
175 * There are no irregular sequences any more.
176 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
177 */
178 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
4388f060 179 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
b75a7d8f
A
180 {
181 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
182 if (ch <= MAXIMUM_UCS2)
183 {
184 /* fits in 16 bits */
185 *(myTarget++) = (UChar) ch;
186 }
187 else
188 {
189 /* write out the surrogates */
190 ch -= HALF_BASE;
191 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
192 ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
193 if (myTarget < targetLimit)
194 {
195 *(myTarget++) = (UChar)ch;
196 }
197 else
198 {
199 /* Put in overflow buffer (not handled here) */
46f4442e
A
200 cnv->UCharErrorBuffer[0] = (UChar) ch;
201 cnv->UCharErrorBufferLength = 1;
b75a7d8f
A
202 *err = U_BUFFER_OVERFLOW_ERROR;
203 break;
204 }
205 }
206 }
207 else
208 {
46f4442e 209 cnv->toULength = (int8_t)i;
374ca955
A
210 *err = U_ILLEGAL_CHAR_FOUND;
211 break;
b75a7d8f
A
212 }
213 }
214 }
215
216donefornow:
217 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
218 {
219 /* End of target buffer */
220 *err = U_BUFFER_OVERFLOW_ERROR;
221 }
222
223 args->target = myTarget;
224 args->source = (const char *) mySource;
225}
226
374ca955 227static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
b75a7d8f
A
228 UErrorCode * err)
229{
46f4442e 230 UConverter *cnv = args->converter;
b75a7d8f
A
231 const unsigned char *mySource = (unsigned char *) args->source;
232 UChar *myTarget = args->target;
233 int32_t *myOffsets = args->offsets;
234 int32_t offsetNum = 0;
235 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
236 const UChar *targetLimit = args->targetLimit;
46f4442e 237 unsigned char *toUBytes = cnv->toUBytes;
b331163b 238 UBool isCESU8 = hasCESU8Data(cnv);
b75a7d8f
A
239 uint32_t ch, ch2 = 0;
240 int32_t i, inBytes;
241
242 /* Restore size of current sequence */
46f4442e 243 if (cnv->toUnicodeStatus && myTarget < targetLimit)
b75a7d8f 244 {
46f4442e
A
245 inBytes = cnv->mode; /* restore # of bytes to consume */
246 i = cnv->toULength; /* restore # of bytes consumed */
247 cnv->toULength = 0;
b75a7d8f 248
46f4442e
A
249 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
250 cnv->toUnicodeStatus = 0;
b75a7d8f
A
251 goto morebytes;
252 }
253
254 while (mySource < sourceLimit && myTarget < targetLimit)
255 {
256 ch = *(mySource++);
257 if (ch < 0x80) /* Simple case */
258 {
259 *(myTarget++) = (UChar) ch;
260 *(myOffsets++) = offsetNum++;
261 }
262 else
263 {
264 toUBytes[0] = (char)ch;
265 inBytes = bytesFromUTF8[ch];
266 i = 1;
267
268morebytes:
269 while (i < inBytes)
270 {
271 if (mySource < sourceLimit)
272 {
273 toUBytes[i] = (char) (ch2 = *mySource);
4388f060 274 if (!U8_IS_TRAIL(ch2))
b75a7d8f
A
275 {
276 break; /* i < inBytes */
277 }
278 ch = (ch << 6) + ch2;
279 ++mySource;
280 i++;
281 }
282 else
283 {
46f4442e
A
284 cnv->toUnicodeStatus = ch;
285 cnv->mode = inBytes;
286 cnv->toULength = (int8_t)i;
b75a7d8f
A
287 goto donefornow;
288 }
289 }
290
291 /* Remove the accumulated high bits */
292 ch -= offsetsFromUTF8[inBytes];
293
294 /*
295 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
296 * - use only trail bytes after a lead byte (checked above)
297 * - use the right number of trail bytes for a given lead byte
298 * - encode a code point <= U+10ffff
299 * - use the fewest possible number of bytes for their code points
300 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
301 *
302 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
303 * There are no irregular sequences any more.
304 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
305 */
306 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
4388f060 307 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
b75a7d8f
A
308 {
309 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
310 if (ch <= MAXIMUM_UCS2)
311 {
312 /* fits in 16 bits */
313 *(myTarget++) = (UChar) ch;
314 *(myOffsets++) = offsetNum;
315 }
316 else
317 {
318 /* write out the surrogates */
319 ch -= HALF_BASE;
320 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
321 *(myOffsets++) = offsetNum;
322 ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
323 if (myTarget < targetLimit)
324 {
325 *(myTarget++) = (UChar)ch;
326 *(myOffsets++) = offsetNum;
327 }
328 else
329 {
46f4442e
A
330 cnv->UCharErrorBuffer[0] = (UChar) ch;
331 cnv->UCharErrorBufferLength = 1;
b75a7d8f
A
332 *err = U_BUFFER_OVERFLOW_ERROR;
333 }
334 }
335 offsetNum += i;
336 }
337 else
338 {
46f4442e 339 cnv->toULength = (int8_t)i;
374ca955
A
340 *err = U_ILLEGAL_CHAR_FOUND;
341 break;
b75a7d8f
A
342 }
343 }
344 }
345
346donefornow:
347 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
348 { /* End of target buffer */
349 *err = U_BUFFER_OVERFLOW_ERROR;
350 }
351
352 args->target = myTarget;
353 args->source = (const char *) mySource;
354 args->offsets = myOffsets;
355}
356
374ca955 357U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
b75a7d8f
A
358 UErrorCode * err)
359{
360 UConverter *cnv = args->converter;
361 const UChar *mySource = args->source;
b75a7d8f 362 const UChar *sourceLimit = args->sourceLimit;
46f4442e
A
363 uint8_t *myTarget = (uint8_t *) args->target;
364 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
365 uint8_t *tempPtr;
73c04bcf 366 UChar32 ch;
46f4442e
A
367 uint8_t tempBuf[4];
368 int32_t indexToWrite;
b331163b 369 UBool isNotCESU8 = !hasCESU8Data(cnv);
b75a7d8f 370
374ca955 371 if (cnv->fromUChar32 && myTarget < targetLimit)
b75a7d8f 372 {
374ca955
A
373 ch = cnv->fromUChar32;
374 cnv->fromUChar32 = 0;
b75a7d8f
A
375 goto lowsurrogate;
376 }
377
378 while (mySource < sourceLimit && myTarget < targetLimit)
379 {
380 ch = *(mySource++);
381
382 if (ch < 0x80) /* Single byte */
383 {
46f4442e 384 *(myTarget++) = (uint8_t) ch;
b75a7d8f
A
385 }
386 else if (ch < 0x800) /* Double byte */
387 {
46f4442e 388 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
b75a7d8f
A
389 if (myTarget < targetLimit)
390 {
46f4442e 391 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
b75a7d8f
A
392 }
393 else
394 {
46f4442e 395 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
b75a7d8f
A
396 cnv->charErrorBufferLength = 1;
397 *err = U_BUFFER_OVERFLOW_ERROR;
398 }
399 }
46f4442e
A
400 else {
401 /* Check for surrogates */
4388f060 402 if(U16_IS_SURROGATE(ch) && isNotCESU8) {
b75a7d8f 403lowsurrogate:
46f4442e
A
404 if (mySource < sourceLimit) {
405 /* test both code units */
4388f060 406 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
46f4442e 407 /* convert and consume this supplementary code point */
4388f060 408 ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
46f4442e
A
409 ++mySource;
410 /* exit this condition tree */
411 }
412 else {
413 /* this is an unpaired trail or lead code unit */
414 /* callback(illegal) */
374ca955 415 cnv->fromUChar32 = ch;
46f4442e 416 *err = U_ILLEGAL_CHAR_FOUND;
b75a7d8f
A
417 break;
418 }
46f4442e
A
419 }
420 else {
421 /* no more input */
374ca955 422 cnv->fromUChar32 = ch;
374ca955 423 break;
b75a7d8f
A
424 }
425 }
426
46f4442e
A
427 /* Do we write the buffer directly for speed,
428 or do we have to be careful about target buffer space? */
429 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
430
431 if (ch <= MAXIMUM_UCS2) {
b75a7d8f 432 indexToWrite = 2;
46f4442e 433 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
b75a7d8f 434 }
46f4442e 435 else {
b75a7d8f 436 indexToWrite = 3;
46f4442e
A
437 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
438 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
b75a7d8f 439 }
46f4442e
A
440 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
441 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
b75a7d8f 442
46f4442e
A
443 if (tempPtr == myTarget) {
444 /* There was enough space to write the codepoint directly. */
445 myTarget += (indexToWrite + 1);
446 }
447 else {
448 /* We might run out of room soon. Write it slowly. */
449 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
450 if (myTarget < targetLimit) {
451 *(myTarget++) = *tempPtr;
452 }
453 else {
454 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
455 *err = U_BUFFER_OVERFLOW_ERROR;
456 }
b75a7d8f
A
457 }
458 }
459 }
460 }
461
462 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
463 {
464 *err = U_BUFFER_OVERFLOW_ERROR;
465 }
b75a7d8f
A
466
467 args->target = (char *) myTarget;
468 args->source = mySource;
469}
470
374ca955 471U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
b75a7d8f
A
472 UErrorCode * err)
473{
474 UConverter *cnv = args->converter;
475 const UChar *mySource = args->source;
b75a7d8f
A
476 int32_t *myOffsets = args->offsets;
477 const UChar *sourceLimit = args->sourceLimit;
46f4442e
A
478 uint8_t *myTarget = (uint8_t *) args->target;
479 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
480 uint8_t *tempPtr;
73c04bcf 481 UChar32 ch;
b75a7d8f 482 int32_t offsetNum, nextSourceIndex;
46f4442e
A
483 int32_t indexToWrite;
484 uint8_t tempBuf[4];
b331163b 485 UBool isNotCESU8 = !hasCESU8Data(cnv);
b75a7d8f 486
374ca955 487 if (cnv->fromUChar32 && myTarget < targetLimit)
b75a7d8f 488 {
374ca955
A
489 ch = cnv->fromUChar32;
490 cnv->fromUChar32 = 0;
b75a7d8f
A
491 offsetNum = -1;
492 nextSourceIndex = 0;
493 goto lowsurrogate;
494 } else {
495 offsetNum = 0;
496 }
497
498 while (mySource < sourceLimit && myTarget < targetLimit)
499 {
500 ch = *(mySource++);
501
502 if (ch < 0x80) /* Single byte */
503 {
504 *(myOffsets++) = offsetNum++;
505 *(myTarget++) = (char) ch;
506 }
507 else if (ch < 0x800) /* Double byte */
508 {
509 *(myOffsets++) = offsetNum;
46f4442e 510 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
b75a7d8f
A
511 if (myTarget < targetLimit)
512 {
513 *(myOffsets++) = offsetNum++;
46f4442e 514 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
b75a7d8f
A
515 }
516 else
517 {
46f4442e 518 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
b75a7d8f
A
519 cnv->charErrorBufferLength = 1;
520 *err = U_BUFFER_OVERFLOW_ERROR;
521 }
522 }
523 else
524 /* Check for surrogates */
525 {
526 nextSourceIndex = offsetNum + 1;
527
4388f060 528 if(U16_IS_SURROGATE(ch) && isNotCESU8) {
b75a7d8f 529lowsurrogate:
46f4442e
A
530 if (mySource < sourceLimit) {
531 /* test both code units */
4388f060 532 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
46f4442e 533 /* convert and consume this supplementary code point */
4388f060 534 ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
46f4442e
A
535 ++mySource;
536 ++nextSourceIndex;
537 /* exit this condition tree */
538 }
539 else {
540 /* this is an unpaired trail or lead code unit */
541 /* callback(illegal) */
374ca955 542 cnv->fromUChar32 = ch;
46f4442e 543 *err = U_ILLEGAL_CHAR_FOUND;
b75a7d8f
A
544 break;
545 }
46f4442e
A
546 }
547 else {
548 /* no more input */
374ca955 549 cnv->fromUChar32 = ch;
374ca955 550 break;
b75a7d8f
A
551 }
552 }
553
46f4442e
A
554 /* Do we write the buffer directly for speed,
555 or do we have to be careful about target buffer space? */
556 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
557
558 if (ch <= MAXIMUM_UCS2) {
b75a7d8f 559 indexToWrite = 2;
46f4442e 560 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
b75a7d8f 561 }
46f4442e 562 else {
b75a7d8f 563 indexToWrite = 3;
46f4442e
A
564 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
565 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
b75a7d8f 566 }
46f4442e
A
567 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
568 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
569
570 if (tempPtr == myTarget) {
571 /* There was enough space to write the codepoint directly. */
572 myTarget += (indexToWrite + 1);
573 myOffsets[0] = offsetNum;
574 myOffsets[1] = offsetNum;
575 myOffsets[2] = offsetNum;
576 if (indexToWrite >= 3) {
577 myOffsets[3] = offsetNum;
b75a7d8f 578 }
46f4442e
A
579 myOffsets += (indexToWrite + 1);
580 }
581 else {
582 /* We might run out of room soon. Write it slowly. */
583 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
584 if (myTarget < targetLimit)
585 {
586 *(myOffsets++) = offsetNum;
587 *(myTarget++) = *tempPtr;
588 }
589 else
590 {
591 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
592 *err = U_BUFFER_OVERFLOW_ERROR;
593 }
b75a7d8f
A
594 }
595 }
596 offsetNum = nextSourceIndex;
597 }
598 }
599
600 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
601 {
602 *err = U_BUFFER_OVERFLOW_ERROR;
603 }
b75a7d8f
A
604
605 args->target = (char *) myTarget;
606 args->source = mySource;
607 args->offsets = myOffsets;
608}
609
374ca955 610static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
b75a7d8f 611 UErrorCode *err) {
374ca955
A
612 UConverter *cnv;
613 const uint8_t *sourceInitial;
b75a7d8f 614 const uint8_t *source;
b75a7d8f
A
615 uint16_t extraBytesToWrite;
616 uint8_t myByte;
617 UChar32 ch;
374ca955 618 int8_t i, isLegalSequence;
b75a7d8f 619
374ca955
A
620 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
621
622 cnv = args->converter;
623 sourceInitial = source = (const uint8_t *)args->source;
624 if (source >= (const uint8_t *)args->sourceLimit)
b75a7d8f 625 {
374ca955
A
626 /* no input */
627 *err = U_INDEX_OUTOFBOUNDS_ERROR;
628 return 0xffff;
629 }
b75a7d8f 630
374ca955
A
631 myByte = (uint8_t)*(source++);
632 if (myByte < 0x80)
633 {
634 args->source = (const char *)source;
635 return (UChar32)myByte;
636 }
b75a7d8f 637
374ca955
A
638 extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
639 if (extraBytesToWrite == 0) {
640 cnv->toUBytes[0] = myByte;
641 cnv->toULength = 1;
642 *err = U_ILLEGAL_CHAR_FOUND;
b75a7d8f 643 args->source = (const char *)source;
374ca955
A
644 return 0xffff;
645 }
b75a7d8f 646
374ca955
A
647 /*The byte sequence is longer than the buffer area passed*/
648 if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
649 {
650 /* check if all of the remaining bytes are trail bytes */
651 cnv->toUBytes[0] = myByte;
652 i = 1;
653 *err = U_TRUNCATED_CHAR_FOUND;
654 while(source < (const uint8_t *)args->sourceLimit) {
655 if(U8_IS_TRAIL(myByte = *source)) {
656 cnv->toUBytes[i++] = myByte;
657 ++source;
b75a7d8f 658 } else {
374ca955
A
659 /* error even before we run out of input */
660 *err = U_ILLEGAL_CHAR_FOUND;
661 break;
b75a7d8f
A
662 }
663 }
374ca955
A
664 cnv->toULength = i;
665 args->source = (const char *)source;
666 return 0xffff;
667 }
b75a7d8f 668
374ca955
A
669 isLegalSequence = 1;
670 ch = myByte << 6;
671 switch(extraBytesToWrite)
672 {
673 /* note: code falls through cases! (sic)*/
674 case 6:
675 ch += (myByte = *source);
676 ch <<= 6;
4388f060 677 if (!U8_IS_TRAIL(myByte))
374ca955
A
678 {
679 isLegalSequence = 0;
680 break;
b75a7d8f 681 }
374ca955 682 ++source;
4388f060 683 case 5: /*fall through*/
374ca955
A
684 ch += (myByte = *source);
685 ch <<= 6;
4388f060 686 if (!U8_IS_TRAIL(myByte))
374ca955
A
687 {
688 isLegalSequence = 0;
689 break;
690 }
691 ++source;
4388f060 692 case 4: /*fall through*/
374ca955
A
693 ch += (myByte = *source);
694 ch <<= 6;
4388f060 695 if (!U8_IS_TRAIL(myByte))
374ca955
A
696 {
697 isLegalSequence = 0;
698 break;
699 }
700 ++source;
4388f060 701 case 3: /*fall through*/
374ca955
A
702 ch += (myByte = *source);
703 ch <<= 6;
4388f060 704 if (!U8_IS_TRAIL(myByte))
374ca955
A
705 {
706 isLegalSequence = 0;
707 break;
708 }
709 ++source;
4388f060 710 case 2: /*fall through*/
374ca955 711 ch += (myByte = *source);
4388f060 712 if (!U8_IS_TRAIL(myByte))
374ca955
A
713 {
714 isLegalSequence = 0;
715 break;
716 }
717 ++source;
718 };
719 ch -= offsetsFromUTF8[extraBytesToWrite];
720 args->source = (const char *)source;
721
722 /*
723 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
724 * - use only trail bytes after a lead byte (checked above)
725 * - use the right number of trail bytes for a given lead byte
726 * - encode a code point <= U+10ffff
727 * - use the fewest possible number of bytes for their code points
728 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
729 *
730 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
731 * There are no irregular sequences any more.
732 */
733 if (isLegalSequence &&
734 (uint32_t)ch <= MAXIMUM_UTF &&
735 (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
736 !U_IS_SURROGATE(ch)
737 ) {
738 return ch; /* return the code point */
b75a7d8f
A
739 }
740
374ca955
A
741 for(i = 0; sourceInitial < source; ++i) {
742 cnv->toUBytes[i] = *sourceInitial++;
743 }
744 cnv->toULength = i;
745 *err = U_ILLEGAL_CHAR_FOUND;
b75a7d8f
A
746 return 0xffff;
747}
748
46f4442e
A
749/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
750
751/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
752static const UChar32
753utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
754
755/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
756static const UChar32
757utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
758
759/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
760static void
761ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
762 UConverterToUnicodeArgs *pToUArgs,
763 UErrorCode *pErrorCode) {
4388f060 764 UConverter *utf8;
46f4442e
A
765 const uint8_t *source, *sourceLimit;
766 uint8_t *target;
767 int32_t targetCapacity;
768 int32_t count;
769
770 int8_t oldToULength, toULength, toULimit;
771
772 UChar32 c;
773 uint8_t b, t1, t2;
774
775 /* set up the local pointers */
776 utf8=pToUArgs->converter;
46f4442e
A
777 source=(uint8_t *)pToUArgs->source;
778 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
779 target=(uint8_t *)pFromUArgs->target;
780 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
781
782 /* get the converter state from the UTF-8 UConverter */
783 c=(UChar32)utf8->toUnicodeStatus;
784 if(c!=0) {
785 toULength=oldToULength=utf8->toULength;
786 toULimit=(int8_t)utf8->mode;
787 } else {
788 toULength=oldToULength=toULimit=0;
789 }
790
791 count=(int32_t)(sourceLimit-source)+oldToULength;
792 if(count<toULimit) {
793 /*
794 * Not enough input to complete the partial character.
795 * Jump to moreBytes below - it will not output to target.
796 */
797 } else if(targetCapacity<toULimit) {
798 /*
799 * Not enough target capacity to output the partial character.
800 * Let the standard converter handle this.
801 */
802 *pErrorCode=U_USING_DEFAULT_WARNING;
803 return;
804 } else {
805 /*
806 * Use a single counter for source and target, counting the minimum of
807 * the source length and the target capacity.
808 * As a result, the source length is checked only once per multi-byte
809 * character instead of twice.
810 *
811 * Make sure that the last byte sequence is complete, or else
812 * stop just before it.
813 * (The longest legal byte sequence has 3 trail bytes.)
814 * Count oldToULength (number of source bytes from a previous buffer)
815 * into the source length but reduce the source index by toULimit
816 * while going back over trail bytes in order to not go back into
817 * the bytes that will be read for finishing a partial
818 * sequence from the previous buffer.
819 * Let the standard converter handle edge cases.
820 */
821 int32_t i;
822
823 if(count>targetCapacity) {
824 count=targetCapacity;
825 }
826
827 i=0;
828 while(i<3 && i<(count-toULimit)) {
829 b=source[count-oldToULength-i-1];
830 if(U8_IS_TRAIL(b)) {
831 ++i;
832 } else {
51004dcb 833 if(i<U8_COUNT_TRAIL_BYTES(b)) {
46f4442e
A
834 /* stop converting before the lead byte if there are not enough trail bytes for it */
835 count-=i+1;
836 }
837 break;
838 }
839 }
840 }
841
842 if(c!=0) {
843 utf8->toUnicodeStatus=0;
844 utf8->toULength=0;
845 goto moreBytes;
846 /* See note in ucnv_SBCSFromUTF8() about this goto. */
847 }
848
849 /* conversion loop */
850 while(count>0) {
851 b=*source++;
852 if((int8_t)b>=0) {
853 /* convert ASCII */
854 *target++=b;
855 --count;
856 continue;
857 } else {
858 if(b>0xe0) {
859 if( /* handle U+1000..U+D7FF inline */
860 (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
861 (b==0xed && (t1 <= 0x9f))) &&
862 (t2=source[1]) >= 0x80 && t2 <= 0xbf
863 ) {
864 source+=2;
865 *target++=b;
866 *target++=t1;
867 *target++=t2;
868 count-=3;
869 continue;
870 }
871 } else if(b<0xe0) {
872 if( /* handle U+0080..U+07FF inline */
873 b>=0xc2 &&
874 (t1=*source) >= 0x80 && t1 <= 0xbf
875 ) {
876 ++source;
877 *target++=b;
878 *target++=t1;
879 count-=2;
880 continue;
881 }
882 } else if(b==0xe0) {
883 if( /* handle U+0800..U+0FFF inline */
884 (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
885 (t2=source[1]) >= 0x80 && t2 <= 0xbf
886 ) {
887 source+=2;
888 *target++=b;
889 *target++=t1;
890 *target++=t2;
891 count-=3;
892 continue;
893 }
894 }
895
896 /* handle "complicated" and error cases, and continuing partial characters */
897 oldToULength=0;
898 toULength=1;
51004dcb 899 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
46f4442e
A
900 c=b;
901moreBytes:
902 while(toULength<toULimit) {
903 if(source<sourceLimit) {
904 b=*source;
905 if(U8_IS_TRAIL(b)) {
906 ++source;
907 ++toULength;
908 c=(c<<6)+b;
909 } else {
910 break; /* sequence too short, stop with toULength<toULimit */
911 }
912 } else {
913 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
914 source-=(toULength-oldToULength);
915 while(oldToULength<toULength) {
916 utf8->toUBytes[oldToULength++]=*source++;
917 }
918 utf8->toUnicodeStatus=c;
919 utf8->toULength=toULength;
920 utf8->mode=toULimit;
921 pToUArgs->source=(char *)source;
922 pFromUArgs->target=(char *)target;
923 return;
924 }
925 }
926
927 if( toULength==toULimit && /* consumed all trail bytes */
928 (toULength==3 || toULength==2) && /* BMP */
929 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
930 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */
931 ) {
932 /* legal byte sequence for BMP code point */
933 } else if(
934 toULength==toULimit && toULength==4 &&
935 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
936 ) {
937 /* legal byte sequence for supplementary code point */
938 } else {
939 /* error handling: illegal UTF-8 byte sequence */
940 source-=(toULength-oldToULength);
941 while(oldToULength<toULength) {
942 utf8->toUBytes[oldToULength++]=*source++;
943 }
944 utf8->toULength=toULength;
945 pToUArgs->source=(char *)source;
946 pFromUArgs->target=(char *)target;
947 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
948 return;
949 }
950
951 /* copy the legal byte sequence to the target */
952 {
953 int8_t i;
954
955 for(i=0; i<oldToULength; ++i) {
956 *target++=utf8->toUBytes[i];
957 }
958 source-=(toULength-oldToULength);
959 for(; i<toULength; ++i) {
960 *target++=*source++;
961 }
962 count-=toULength;
963 }
964 }
965 }
966
967 if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
968 if(target==(const uint8_t *)pFromUArgs->targetLimit) {
969 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
970 } else {
971 b=*source;
51004dcb 972 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
46f4442e
A
973 if(toULimit>(sourceLimit-source)) {
974 /* collect a truncated byte sequence */
975 toULength=0;
976 c=b;
977 for(;;) {
978 utf8->toUBytes[toULength++]=b;
979 if(++source==sourceLimit) {
980 /* partial byte sequence at end of source */
981 utf8->toUnicodeStatus=c;
982 utf8->toULength=toULength;
983 utf8->mode=toULimit;
984 break;
985 } else if(!U8_IS_TRAIL(b=*source)) {
986 /* lead byte in trail byte position */
987 utf8->toULength=toULength;
988 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
989 break;
990 }
991 c=(c<<6)+b;
992 }
993 } else {
994 /* partial-sequence target overflow: fall back to the pivoting implementation */
995 *pErrorCode=U_USING_DEFAULT_WARNING;
996 }
997 }
998 }
999
1000 /* write back the updated pointers */
1001 pToUArgs->source=(char *)source;
1002 pFromUArgs->target=(char *)target;
1003}
1004
b75a7d8f
A
1005/* UTF-8 converter data ----------------------------------------------------- */
1006
1007static const UConverterImpl _UTF8Impl={
1008 UCNV_UTF8,
1009
1010 NULL,
1011 NULL,
1012
1013 NULL,
1014 NULL,
1015 NULL,
1016
374ca955
A
1017 ucnv_toUnicode_UTF8,
1018 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1019 ucnv_fromUnicode_UTF8,
1020 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1021 ucnv_getNextUChar_UTF8,
b75a7d8f
A
1022
1023 NULL,
1024 NULL,
1025 NULL,
1026 NULL,
46f4442e
A
1027 ucnv_getNonSurrogateUnicodeSet,
1028
1029 ucnv_UTF8FromUTF8,
1030 ucnv_UTF8FromUTF8
b75a7d8f
A
1031};
1032
1033/* CCSID 1208 refers to UTF-8 for any version of Unicode */
1034static const UConverterStaticData _UTF8StaticData={
1035 sizeof(UConverterStaticData),
1036 "UTF-8",
374ca955
A
1037 1208, UCNV_IBM, UCNV_UTF8,
1038 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
b75a7d8f
A
1039 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1040 0,
1041 0,
1042 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1043};
1044
1045
1046const UConverterSharedData _UTF8Data={
1047 sizeof(UConverterSharedData), ~((uint32_t) 0),
1048 NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl,
1049 0
1050};
1051
1052/* CESU-8 converter data ---------------------------------------------------- */
1053
374ca955
A
1054static const UConverterImpl _CESU8Impl={
1055 UCNV_CESU8,
1056
1057 NULL,
1058 NULL,
1059
1060 NULL,
1061 NULL,
1062 NULL,
1063
1064 ucnv_toUnicode_UTF8,
1065 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1066 ucnv_fromUnicode_UTF8,
1067 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1068 NULL,
1069
1070 NULL,
1071 NULL,
1072 NULL,
1073 NULL,
1074 ucnv_getCompleteUnicodeSet
1075};
1076
b75a7d8f
A
1077static const UConverterStaticData _CESU8StaticData={
1078 sizeof(UConverterStaticData),
1079 "CESU-8",
73c04bcf
A
1080 9400, /* CCSID for CESU-8 */
1081 UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
b75a7d8f
A
1082 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1083 0,
1084 0,
1085 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1086};
1087
1088
1089const UConverterSharedData _CESU8Data={
1090 sizeof(UConverterSharedData), ~((uint32_t) 0),
374ca955 1091 NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,
b75a7d8f
A
1092 0
1093};
374ca955
A
1094
1095#endif