]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnv_u8.cpp
ICU-59173.0.1.tar.gz
[apple/icu.git] / icuSources / common / ucnv_u8.cpp
CommitLineData
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2002-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnv_u8.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jul01
*   created by: Markus W. Scherer
*
*   UTF-8 converter implementation. Used to be in ucnv_utf.c.
*
*   Also, CESU-8 implementation, see UTR 26.
*   The CESU-8 converter uses all the same functions as the
*   UTF-8 converter, with a branch for converting supplementary code points.
*/
22
23#include "unicode/utypes.h"
374ca955
A
24
25#if !UCONFIG_NO_CONVERSION
26
b75a7d8f 27#include "unicode/ucnv.h"
4388f060
A
28#include "unicode/utf.h"
29#include "unicode/utf8.h"
30#include "unicode/utf16.h"
b75a7d8f
A
31#include "ucnv_bld.h"
32#include "ucnv_cnv.h"
33#include "cmemory.h"
34
35/* Prototypes --------------------------------------------------------------- */
36
37/* Keep these here to make finicky compilers happy */
38
374ca955 39U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
b75a7d8f 40 UErrorCode *err);
374ca955 41U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
b75a7d8f 42 UErrorCode *err);
b75a7d8f
A
43
44
45/* UTF-8 -------------------------------------------------------------------- */
46
47/* UTF-8 Conversion DATA
46f4442e 48 * for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
b75a7d8f
A
49 */
50/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
/* Largest code point that fits in a single 16-bit code unit. */
#define MAXIMUM_UCS2            0x0000FFFF
/* Largest code point accepted by the converters in this file. */
#define MAXIMUM_UTF             0x0010FFFF
#define MAXIMUM_UCS4            0x7FFFFFFF
/* Number of payload bits carried by each UTF-16 surrogate. */
#define HALF_SHIFT              10
/* First supplementary code point; subtracted before splitting into surrogates. */
#define HALF_BASE               0x0010000
/* Mask selecting the low HALF_SHIFT bits. */
#define HALF_MASK               0x3FF
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* -SURROGATE_LOW_START + HALF_BASE (== 9216) */
#define SURROGATE_LOW_BASE      (HALF_BASE - SURROGATE_LOW_START)
64
/*
 * offsetsFromUTF8[n] is the value that has to be subtracted from the
 * accumulator of an n-byte sequence that was built as
 * (((lead << 6) + trail) << 6) + trail ..., in order to remove the lead-byte
 * and trail-byte marker bits. Index 0 is a placeholder (no offset).
 * The arithmetic is done modulo 2^32 (uint32_t).
 */
static const uint32_t offsetsFromUTF8[7] = {0,
  (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
  (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
};
69
70/* END OF UTF-8 Conversion DATA */
71
/*
 * Sequence length looked up by the first byte of a sequence:
 *   0x00..0x7F -> 1, 0x80..0xBF -> 0 (not a lead byte),
 *   0xC0..0xDF -> 2, 0xE0..0xEF -> 3, 0xF0..0xF7 -> 4,
 *   0xF8..0xFB -> 5, 0xFC..0xFD -> 6, 0xFE..0xFF -> 0.
 * The 5- and 6-byte lengths are only used to consume the bytes; such
 * sequences are rejected afterwards via utf8_minChar32[].
 */
static const int8_t bytesFromUTF8[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
82
/*
 * Starting with Unicode 3.0.1:
 * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
 * byte sequences with more than 4 bytes are illegal in UTF-8,
 * which is tested with impossible values for them
 * (no value can be both <= 0x10ffff and >= 0xffffffff).
 */
static const uint32_t
utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
91
b331163b
A
92static UBool hasCESU8Data(const UConverter *cnv)
93{
94#if UCONFIG_ONLY_HTML_CONVERSION
95 return FALSE;
96#else
97 return (UBool)(cnv->sharedData == &_CESU8Data);
98#endif
99}
f3c0d7a5
A
100U_CDECL_BEGIN
101static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
b75a7d8f
A
102 UErrorCode * err)
103{
46f4442e 104 UConverter *cnv = args->converter;
b75a7d8f
A
105 const unsigned char *mySource = (unsigned char *) args->source;
106 UChar *myTarget = args->target;
107 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
108 const UChar *targetLimit = args->targetLimit;
46f4442e 109 unsigned char *toUBytes = cnv->toUBytes;
b331163b 110 UBool isCESU8 = hasCESU8Data(cnv);
b75a7d8f
A
111 uint32_t ch, ch2 = 0;
112 int32_t i, inBytes;
b331163b 113
b75a7d8f 114 /* Restore size of current sequence */
46f4442e 115 if (cnv->toUnicodeStatus && myTarget < targetLimit)
b75a7d8f 116 {
46f4442e
A
117 inBytes = cnv->mode; /* restore # of bytes to consume */
118 i = cnv->toULength; /* restore # of bytes consumed */
119 cnv->toULength = 0;
b75a7d8f 120
46f4442e
A
121 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
122 cnv->toUnicodeStatus = 0;
b75a7d8f
A
123 goto morebytes;
124 }
125
126
127 while (mySource < sourceLimit && myTarget < targetLimit)
128 {
129 ch = *(mySource++);
130 if (ch < 0x80) /* Simple case */
131 {
132 *(myTarget++) = (UChar) ch;
133 }
134 else
135 {
136 /* store the first char */
137 toUBytes[0] = (char)ch;
138 inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
139 i = 1;
140
141morebytes:
142 while (i < inBytes)
143 {
144 if (mySource < sourceLimit)
145 {
146 toUBytes[i] = (char) (ch2 = *mySource);
4388f060 147 if (!U8_IS_TRAIL(ch2))
b75a7d8f
A
148 {
149 break; /* i < inBytes */
150 }
151 ch = (ch << 6) + ch2;
152 ++mySource;
153 i++;
154 }
155 else
156 {
374ca955 157 /* stores a partially calculated target*/
46f4442e
A
158 cnv->toUnicodeStatus = ch;
159 cnv->mode = inBytes;
160 cnv->toULength = (int8_t) i;
b75a7d8f
A
161 goto donefornow;
162 }
163 }
164
165 /* Remove the accumulated high bits */
166 ch -= offsetsFromUTF8[inBytes];
167
168 /*
169 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
170 * - use only trail bytes after a lead byte (checked above)
171 * - use the right number of trail bytes for a given lead byte
172 * - encode a code point <= U+10ffff
173 * - use the fewest possible number of bytes for their code points
174 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
175 *
176 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
177 * There are no irregular sequences any more.
178 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
179 */
180 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
4388f060 181 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
b75a7d8f
A
182 {
183 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
184 if (ch <= MAXIMUM_UCS2)
185 {
186 /* fits in 16 bits */
187 *(myTarget++) = (UChar) ch;
188 }
189 else
190 {
191 /* write out the surrogates */
192 ch -= HALF_BASE;
193 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
194 ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
195 if (myTarget < targetLimit)
196 {
197 *(myTarget++) = (UChar)ch;
198 }
199 else
200 {
201 /* Put in overflow buffer (not handled here) */
46f4442e
A
202 cnv->UCharErrorBuffer[0] = (UChar) ch;
203 cnv->UCharErrorBufferLength = 1;
b75a7d8f
A
204 *err = U_BUFFER_OVERFLOW_ERROR;
205 break;
206 }
207 }
208 }
209 else
210 {
46f4442e 211 cnv->toULength = (int8_t)i;
374ca955
A
212 *err = U_ILLEGAL_CHAR_FOUND;
213 break;
b75a7d8f
A
214 }
215 }
216 }
217
218donefornow:
219 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
220 {
221 /* End of target buffer */
222 *err = U_BUFFER_OVERFLOW_ERROR;
223 }
224
225 args->target = myTarget;
226 args->source = (const char *) mySource;
227}
228
f3c0d7a5 229static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
b75a7d8f
A
230 UErrorCode * err)
231{
46f4442e 232 UConverter *cnv = args->converter;
b75a7d8f
A
233 const unsigned char *mySource = (unsigned char *) args->source;
234 UChar *myTarget = args->target;
235 int32_t *myOffsets = args->offsets;
236 int32_t offsetNum = 0;
237 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
238 const UChar *targetLimit = args->targetLimit;
46f4442e 239 unsigned char *toUBytes = cnv->toUBytes;
b331163b 240 UBool isCESU8 = hasCESU8Data(cnv);
b75a7d8f
A
241 uint32_t ch, ch2 = 0;
242 int32_t i, inBytes;
243
244 /* Restore size of current sequence */
46f4442e 245 if (cnv->toUnicodeStatus && myTarget < targetLimit)
b75a7d8f 246 {
46f4442e
A
247 inBytes = cnv->mode; /* restore # of bytes to consume */
248 i = cnv->toULength; /* restore # of bytes consumed */
249 cnv->toULength = 0;
b75a7d8f 250
46f4442e
A
251 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
252 cnv->toUnicodeStatus = 0;
b75a7d8f
A
253 goto morebytes;
254 }
255
256 while (mySource < sourceLimit && myTarget < targetLimit)
257 {
258 ch = *(mySource++);
259 if (ch < 0x80) /* Simple case */
260 {
261 *(myTarget++) = (UChar) ch;
262 *(myOffsets++) = offsetNum++;
263 }
264 else
265 {
266 toUBytes[0] = (char)ch;
267 inBytes = bytesFromUTF8[ch];
268 i = 1;
269
270morebytes:
271 while (i < inBytes)
272 {
273 if (mySource < sourceLimit)
274 {
275 toUBytes[i] = (char) (ch2 = *mySource);
4388f060 276 if (!U8_IS_TRAIL(ch2))
b75a7d8f
A
277 {
278 break; /* i < inBytes */
279 }
280 ch = (ch << 6) + ch2;
281 ++mySource;
282 i++;
283 }
284 else
285 {
46f4442e
A
286 cnv->toUnicodeStatus = ch;
287 cnv->mode = inBytes;
288 cnv->toULength = (int8_t)i;
b75a7d8f
A
289 goto donefornow;
290 }
291 }
292
293 /* Remove the accumulated high bits */
294 ch -= offsetsFromUTF8[inBytes];
295
296 /*
297 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
298 * - use only trail bytes after a lead byte (checked above)
299 * - use the right number of trail bytes for a given lead byte
300 * - encode a code point <= U+10ffff
301 * - use the fewest possible number of bytes for their code points
302 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
303 *
304 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
305 * There are no irregular sequences any more.
306 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
307 */
308 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
4388f060 309 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
b75a7d8f
A
310 {
311 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
312 if (ch <= MAXIMUM_UCS2)
313 {
314 /* fits in 16 bits */
315 *(myTarget++) = (UChar) ch;
316 *(myOffsets++) = offsetNum;
317 }
318 else
319 {
320 /* write out the surrogates */
321 ch -= HALF_BASE;
322 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
323 *(myOffsets++) = offsetNum;
324 ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
325 if (myTarget < targetLimit)
326 {
327 *(myTarget++) = (UChar)ch;
328 *(myOffsets++) = offsetNum;
329 }
330 else
331 {
46f4442e
A
332 cnv->UCharErrorBuffer[0] = (UChar) ch;
333 cnv->UCharErrorBufferLength = 1;
b75a7d8f
A
334 *err = U_BUFFER_OVERFLOW_ERROR;
335 }
336 }
337 offsetNum += i;
338 }
339 else
340 {
46f4442e 341 cnv->toULength = (int8_t)i;
374ca955
A
342 *err = U_ILLEGAL_CHAR_FOUND;
343 break;
b75a7d8f
A
344 }
345 }
346 }
347
348donefornow:
349 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
350 { /* End of target buffer */
351 *err = U_BUFFER_OVERFLOW_ERROR;
352 }
353
354 args->target = myTarget;
355 args->source = (const char *) mySource;
356 args->offsets = myOffsets;
357}
f3c0d7a5 358U_CDECL_END
b75a7d8f 359
f3c0d7a5 360U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
b75a7d8f
A
361 UErrorCode * err)
362{
363 UConverter *cnv = args->converter;
364 const UChar *mySource = args->source;
b75a7d8f 365 const UChar *sourceLimit = args->sourceLimit;
46f4442e
A
366 uint8_t *myTarget = (uint8_t *) args->target;
367 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
368 uint8_t *tempPtr;
73c04bcf 369 UChar32 ch;
46f4442e
A
370 uint8_t tempBuf[4];
371 int32_t indexToWrite;
b331163b 372 UBool isNotCESU8 = !hasCESU8Data(cnv);
b75a7d8f 373
374ca955 374 if (cnv->fromUChar32 && myTarget < targetLimit)
b75a7d8f 375 {
374ca955
A
376 ch = cnv->fromUChar32;
377 cnv->fromUChar32 = 0;
b75a7d8f
A
378 goto lowsurrogate;
379 }
380
381 while (mySource < sourceLimit && myTarget < targetLimit)
382 {
383 ch = *(mySource++);
384
385 if (ch < 0x80) /* Single byte */
386 {
46f4442e 387 *(myTarget++) = (uint8_t) ch;
b75a7d8f
A
388 }
389 else if (ch < 0x800) /* Double byte */
390 {
46f4442e 391 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
b75a7d8f
A
392 if (myTarget < targetLimit)
393 {
46f4442e 394 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
b75a7d8f
A
395 }
396 else
397 {
46f4442e 398 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
b75a7d8f
A
399 cnv->charErrorBufferLength = 1;
400 *err = U_BUFFER_OVERFLOW_ERROR;
401 }
402 }
46f4442e
A
403 else {
404 /* Check for surrogates */
4388f060 405 if(U16_IS_SURROGATE(ch) && isNotCESU8) {
b75a7d8f 406lowsurrogate:
46f4442e
A
407 if (mySource < sourceLimit) {
408 /* test both code units */
4388f060 409 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
46f4442e 410 /* convert and consume this supplementary code point */
4388f060 411 ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
46f4442e
A
412 ++mySource;
413 /* exit this condition tree */
414 }
415 else {
416 /* this is an unpaired trail or lead code unit */
417 /* callback(illegal) */
374ca955 418 cnv->fromUChar32 = ch;
46f4442e 419 *err = U_ILLEGAL_CHAR_FOUND;
b75a7d8f
A
420 break;
421 }
46f4442e
A
422 }
423 else {
424 /* no more input */
374ca955 425 cnv->fromUChar32 = ch;
374ca955 426 break;
b75a7d8f
A
427 }
428 }
429
46f4442e
A
430 /* Do we write the buffer directly for speed,
431 or do we have to be careful about target buffer space? */
432 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
433
434 if (ch <= MAXIMUM_UCS2) {
b75a7d8f 435 indexToWrite = 2;
46f4442e 436 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
b75a7d8f 437 }
46f4442e 438 else {
b75a7d8f 439 indexToWrite = 3;
46f4442e
A
440 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
441 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
b75a7d8f 442 }
46f4442e
A
443 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
444 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
b75a7d8f 445
46f4442e
A
446 if (tempPtr == myTarget) {
447 /* There was enough space to write the codepoint directly. */
448 myTarget += (indexToWrite + 1);
449 }
450 else {
451 /* We might run out of room soon. Write it slowly. */
452 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
453 if (myTarget < targetLimit) {
454 *(myTarget++) = *tempPtr;
455 }
456 else {
457 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
458 *err = U_BUFFER_OVERFLOW_ERROR;
459 }
b75a7d8f
A
460 }
461 }
462 }
463 }
464
465 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
466 {
467 *err = U_BUFFER_OVERFLOW_ERROR;
468 }
b75a7d8f
A
469
470 args->target = (char *) myTarget;
471 args->source = mySource;
472}
473
f3c0d7a5 474U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
b75a7d8f
A
475 UErrorCode * err)
476{
477 UConverter *cnv = args->converter;
478 const UChar *mySource = args->source;
b75a7d8f
A
479 int32_t *myOffsets = args->offsets;
480 const UChar *sourceLimit = args->sourceLimit;
46f4442e
A
481 uint8_t *myTarget = (uint8_t *) args->target;
482 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
483 uint8_t *tempPtr;
73c04bcf 484 UChar32 ch;
b75a7d8f 485 int32_t offsetNum, nextSourceIndex;
46f4442e
A
486 int32_t indexToWrite;
487 uint8_t tempBuf[4];
b331163b 488 UBool isNotCESU8 = !hasCESU8Data(cnv);
b75a7d8f 489
374ca955 490 if (cnv->fromUChar32 && myTarget < targetLimit)
b75a7d8f 491 {
374ca955
A
492 ch = cnv->fromUChar32;
493 cnv->fromUChar32 = 0;
b75a7d8f
A
494 offsetNum = -1;
495 nextSourceIndex = 0;
496 goto lowsurrogate;
497 } else {
498 offsetNum = 0;
499 }
500
501 while (mySource < sourceLimit && myTarget < targetLimit)
502 {
503 ch = *(mySource++);
504
505 if (ch < 0x80) /* Single byte */
506 {
507 *(myOffsets++) = offsetNum++;
508 *(myTarget++) = (char) ch;
509 }
510 else if (ch < 0x800) /* Double byte */
511 {
512 *(myOffsets++) = offsetNum;
46f4442e 513 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
b75a7d8f
A
514 if (myTarget < targetLimit)
515 {
516 *(myOffsets++) = offsetNum++;
46f4442e 517 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
b75a7d8f
A
518 }
519 else
520 {
46f4442e 521 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
b75a7d8f
A
522 cnv->charErrorBufferLength = 1;
523 *err = U_BUFFER_OVERFLOW_ERROR;
524 }
525 }
526 else
527 /* Check for surrogates */
528 {
529 nextSourceIndex = offsetNum + 1;
530
4388f060 531 if(U16_IS_SURROGATE(ch) && isNotCESU8) {
b75a7d8f 532lowsurrogate:
46f4442e
A
533 if (mySource < sourceLimit) {
534 /* test both code units */
4388f060 535 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
46f4442e 536 /* convert and consume this supplementary code point */
4388f060 537 ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
46f4442e
A
538 ++mySource;
539 ++nextSourceIndex;
540 /* exit this condition tree */
541 }
542 else {
543 /* this is an unpaired trail or lead code unit */
544 /* callback(illegal) */
374ca955 545 cnv->fromUChar32 = ch;
46f4442e 546 *err = U_ILLEGAL_CHAR_FOUND;
b75a7d8f
A
547 break;
548 }
46f4442e
A
549 }
550 else {
551 /* no more input */
374ca955 552 cnv->fromUChar32 = ch;
374ca955 553 break;
b75a7d8f
A
554 }
555 }
556
46f4442e
A
557 /* Do we write the buffer directly for speed,
558 or do we have to be careful about target buffer space? */
559 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
560
561 if (ch <= MAXIMUM_UCS2) {
b75a7d8f 562 indexToWrite = 2;
46f4442e 563 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
b75a7d8f 564 }
46f4442e 565 else {
b75a7d8f 566 indexToWrite = 3;
46f4442e
A
567 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
568 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
b75a7d8f 569 }
46f4442e
A
570 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
571 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
572
573 if (tempPtr == myTarget) {
574 /* There was enough space to write the codepoint directly. */
575 myTarget += (indexToWrite + 1);
576 myOffsets[0] = offsetNum;
577 myOffsets[1] = offsetNum;
578 myOffsets[2] = offsetNum;
579 if (indexToWrite >= 3) {
580 myOffsets[3] = offsetNum;
b75a7d8f 581 }
46f4442e
A
582 myOffsets += (indexToWrite + 1);
583 }
584 else {
585 /* We might run out of room soon. Write it slowly. */
586 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
587 if (myTarget < targetLimit)
588 {
589 *(myOffsets++) = offsetNum;
590 *(myTarget++) = *tempPtr;
591 }
592 else
593 {
594 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
595 *err = U_BUFFER_OVERFLOW_ERROR;
596 }
b75a7d8f
A
597 }
598 }
599 offsetNum = nextSourceIndex;
600 }
601 }
602
603 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
604 {
605 *err = U_BUFFER_OVERFLOW_ERROR;
606 }
b75a7d8f
A
607
608 args->target = (char *) myTarget;
609 args->source = mySource;
610 args->offsets = myOffsets;
611}
612
f3c0d7a5
A
613U_CDECL_BEGIN
614static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
b75a7d8f 615 UErrorCode *err) {
374ca955
A
616 UConverter *cnv;
617 const uint8_t *sourceInitial;
b75a7d8f 618 const uint8_t *source;
b75a7d8f
A
619 uint16_t extraBytesToWrite;
620 uint8_t myByte;
621 UChar32 ch;
374ca955 622 int8_t i, isLegalSequence;
b75a7d8f 623
374ca955
A
624 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
625
626 cnv = args->converter;
627 sourceInitial = source = (const uint8_t *)args->source;
628 if (source >= (const uint8_t *)args->sourceLimit)
b75a7d8f 629 {
374ca955
A
630 /* no input */
631 *err = U_INDEX_OUTOFBOUNDS_ERROR;
632 return 0xffff;
633 }
b75a7d8f 634
374ca955
A
635 myByte = (uint8_t)*(source++);
636 if (myByte < 0x80)
637 {
638 args->source = (const char *)source;
639 return (UChar32)myByte;
640 }
b75a7d8f 641
374ca955
A
642 extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
643 if (extraBytesToWrite == 0) {
644 cnv->toUBytes[0] = myByte;
645 cnv->toULength = 1;
646 *err = U_ILLEGAL_CHAR_FOUND;
b75a7d8f 647 args->source = (const char *)source;
374ca955
A
648 return 0xffff;
649 }
b75a7d8f 650
374ca955
A
651 /*The byte sequence is longer than the buffer area passed*/
652 if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
653 {
654 /* check if all of the remaining bytes are trail bytes */
655 cnv->toUBytes[0] = myByte;
656 i = 1;
657 *err = U_TRUNCATED_CHAR_FOUND;
658 while(source < (const uint8_t *)args->sourceLimit) {
659 if(U8_IS_TRAIL(myByte = *source)) {
660 cnv->toUBytes[i++] = myByte;
661 ++source;
b75a7d8f 662 } else {
374ca955
A
663 /* error even before we run out of input */
664 *err = U_ILLEGAL_CHAR_FOUND;
665 break;
b75a7d8f
A
666 }
667 }
374ca955
A
668 cnv->toULength = i;
669 args->source = (const char *)source;
670 return 0xffff;
671 }
b75a7d8f 672
374ca955
A
673 isLegalSequence = 1;
674 ch = myByte << 6;
675 switch(extraBytesToWrite)
676 {
677 /* note: code falls through cases! (sic)*/
678 case 6:
679 ch += (myByte = *source);
680 ch <<= 6;
4388f060 681 if (!U8_IS_TRAIL(myByte))
374ca955
A
682 {
683 isLegalSequence = 0;
684 break;
b75a7d8f 685 }
374ca955 686 ++source;
2ca993e8
A
687 U_FALLTHROUGH;
688 case 5:
374ca955
A
689 ch += (myByte = *source);
690 ch <<= 6;
4388f060 691 if (!U8_IS_TRAIL(myByte))
374ca955
A
692 {
693 isLegalSequence = 0;
694 break;
695 }
696 ++source;
2ca993e8
A
697 U_FALLTHROUGH;
698 case 4:
374ca955
A
699 ch += (myByte = *source);
700 ch <<= 6;
4388f060 701 if (!U8_IS_TRAIL(myByte))
374ca955
A
702 {
703 isLegalSequence = 0;
704 break;
705 }
706 ++source;
2ca993e8
A
707 U_FALLTHROUGH;
708 case 3:
374ca955
A
709 ch += (myByte = *source);
710 ch <<= 6;
4388f060 711 if (!U8_IS_TRAIL(myByte))
374ca955
A
712 {
713 isLegalSequence = 0;
714 break;
715 }
716 ++source;
2ca993e8
A
717 U_FALLTHROUGH;
718 case 2:
374ca955 719 ch += (myByte = *source);
4388f060 720 if (!U8_IS_TRAIL(myByte))
374ca955
A
721 {
722 isLegalSequence = 0;
723 break;
724 }
725 ++source;
726 };
727 ch -= offsetsFromUTF8[extraBytesToWrite];
728 args->source = (const char *)source;
729
730 /*
731 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
732 * - use only trail bytes after a lead byte (checked above)
733 * - use the right number of trail bytes for a given lead byte
734 * - encode a code point <= U+10ffff
735 * - use the fewest possible number of bytes for their code points
736 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
737 *
738 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
739 * There are no irregular sequences any more.
740 */
741 if (isLegalSequence &&
742 (uint32_t)ch <= MAXIMUM_UTF &&
743 (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
744 !U_IS_SURROGATE(ch)
745 ) {
746 return ch; /* return the code point */
b75a7d8f
A
747 }
748
374ca955
A
749 for(i = 0; sourceInitial < source; ++i) {
750 cnv->toUBytes[i] = *sourceInitial++;
751 }
752 cnv->toULength = i;
753 *err = U_ILLEGAL_CHAR_FOUND;
b75a7d8f
A
754 return 0xffff;
755}
f3c0d7a5 756U_CDECL_END
b75a7d8f 757
46f4442e
A
758/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
759
760/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
761static const UChar32
762utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
763
764/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
765static const UChar32
766utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
767
f3c0d7a5 768U_CDECL_BEGIN
46f4442e 769/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
f3c0d7a5 770static void U_CALLCONV
46f4442e
A
771ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
772 UConverterToUnicodeArgs *pToUArgs,
773 UErrorCode *pErrorCode) {
4388f060 774 UConverter *utf8;
46f4442e
A
775 const uint8_t *source, *sourceLimit;
776 uint8_t *target;
777 int32_t targetCapacity;
778 int32_t count;
779
780 int8_t oldToULength, toULength, toULimit;
781
782 UChar32 c;
783 uint8_t b, t1, t2;
784
785 /* set up the local pointers */
786 utf8=pToUArgs->converter;
46f4442e
A
787 source=(uint8_t *)pToUArgs->source;
788 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
789 target=(uint8_t *)pFromUArgs->target;
790 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
791
792 /* get the converter state from the UTF-8 UConverter */
793 c=(UChar32)utf8->toUnicodeStatus;
794 if(c!=0) {
795 toULength=oldToULength=utf8->toULength;
796 toULimit=(int8_t)utf8->mode;
797 } else {
798 toULength=oldToULength=toULimit=0;
799 }
800
801 count=(int32_t)(sourceLimit-source)+oldToULength;
802 if(count<toULimit) {
803 /*
804 * Not enough input to complete the partial character.
805 * Jump to moreBytes below - it will not output to target.
806 */
807 } else if(targetCapacity<toULimit) {
808 /*
809 * Not enough target capacity to output the partial character.
810 * Let the standard converter handle this.
811 */
812 *pErrorCode=U_USING_DEFAULT_WARNING;
813 return;
814 } else {
815 /*
816 * Use a single counter for source and target, counting the minimum of
817 * the source length and the target capacity.
818 * As a result, the source length is checked only once per multi-byte
819 * character instead of twice.
820 *
821 * Make sure that the last byte sequence is complete, or else
822 * stop just before it.
823 * (The longest legal byte sequence has 3 trail bytes.)
824 * Count oldToULength (number of source bytes from a previous buffer)
825 * into the source length but reduce the source index by toULimit
826 * while going back over trail bytes in order to not go back into
827 * the bytes that will be read for finishing a partial
828 * sequence from the previous buffer.
829 * Let the standard converter handle edge cases.
830 */
831 int32_t i;
832
833 if(count>targetCapacity) {
834 count=targetCapacity;
835 }
836
837 i=0;
838 while(i<3 && i<(count-toULimit)) {
839 b=source[count-oldToULength-i-1];
840 if(U8_IS_TRAIL(b)) {
841 ++i;
842 } else {
51004dcb 843 if(i<U8_COUNT_TRAIL_BYTES(b)) {
46f4442e
A
844 /* stop converting before the lead byte if there are not enough trail bytes for it */
845 count-=i+1;
846 }
847 break;
848 }
849 }
850 }
851
852 if(c!=0) {
853 utf8->toUnicodeStatus=0;
854 utf8->toULength=0;
855 goto moreBytes;
856 /* See note in ucnv_SBCSFromUTF8() about this goto. */
857 }
858
859 /* conversion loop */
860 while(count>0) {
861 b=*source++;
862 if((int8_t)b>=0) {
863 /* convert ASCII */
864 *target++=b;
865 --count;
866 continue;
867 } else {
868 if(b>0xe0) {
869 if( /* handle U+1000..U+D7FF inline */
870 (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
871 (b==0xed && (t1 <= 0x9f))) &&
872 (t2=source[1]) >= 0x80 && t2 <= 0xbf
873 ) {
874 source+=2;
875 *target++=b;
876 *target++=t1;
877 *target++=t2;
878 count-=3;
879 continue;
880 }
881 } else if(b<0xe0) {
882 if( /* handle U+0080..U+07FF inline */
883 b>=0xc2 &&
884 (t1=*source) >= 0x80 && t1 <= 0xbf
885 ) {
886 ++source;
887 *target++=b;
888 *target++=t1;
889 count-=2;
890 continue;
891 }
892 } else if(b==0xe0) {
893 if( /* handle U+0800..U+0FFF inline */
894 (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
895 (t2=source[1]) >= 0x80 && t2 <= 0xbf
896 ) {
897 source+=2;
898 *target++=b;
899 *target++=t1;
900 *target++=t2;
901 count-=3;
902 continue;
903 }
904 }
905
906 /* handle "complicated" and error cases, and continuing partial characters */
907 oldToULength=0;
908 toULength=1;
51004dcb 909 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
46f4442e
A
910 c=b;
911moreBytes:
912 while(toULength<toULimit) {
913 if(source<sourceLimit) {
914 b=*source;
915 if(U8_IS_TRAIL(b)) {
916 ++source;
917 ++toULength;
918 c=(c<<6)+b;
919 } else {
920 break; /* sequence too short, stop with toULength<toULimit */
921 }
922 } else {
923 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
924 source-=(toULength-oldToULength);
925 while(oldToULength<toULength) {
926 utf8->toUBytes[oldToULength++]=*source++;
927 }
928 utf8->toUnicodeStatus=c;
929 utf8->toULength=toULength;
930 utf8->mode=toULimit;
931 pToUArgs->source=(char *)source;
932 pFromUArgs->target=(char *)target;
933 return;
934 }
935 }
936
937 if( toULength==toULimit && /* consumed all trail bytes */
938 (toULength==3 || toULength==2) && /* BMP */
939 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
940 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */
941 ) {
942 /* legal byte sequence for BMP code point */
943 } else if(
944 toULength==toULimit && toULength==4 &&
945 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
946 ) {
947 /* legal byte sequence for supplementary code point */
948 } else {
949 /* error handling: illegal UTF-8 byte sequence */
950 source-=(toULength-oldToULength);
951 while(oldToULength<toULength) {
952 utf8->toUBytes[oldToULength++]=*source++;
953 }
954 utf8->toULength=toULength;
955 pToUArgs->source=(char *)source;
956 pFromUArgs->target=(char *)target;
957 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
958 return;
959 }
960
961 /* copy the legal byte sequence to the target */
962 {
963 int8_t i;
964
965 for(i=0; i<oldToULength; ++i) {
966 *target++=utf8->toUBytes[i];
967 }
968 source-=(toULength-oldToULength);
969 for(; i<toULength; ++i) {
970 *target++=*source++;
971 }
972 count-=toULength;
973 }
974 }
975 }
976
977 if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
978 if(target==(const uint8_t *)pFromUArgs->targetLimit) {
979 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
980 } else {
981 b=*source;
51004dcb 982 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
46f4442e
A
983 if(toULimit>(sourceLimit-source)) {
984 /* collect a truncated byte sequence */
985 toULength=0;
986 c=b;
987 for(;;) {
988 utf8->toUBytes[toULength++]=b;
989 if(++source==sourceLimit) {
990 /* partial byte sequence at end of source */
991 utf8->toUnicodeStatus=c;
992 utf8->toULength=toULength;
993 utf8->mode=toULimit;
994 break;
995 } else if(!U8_IS_TRAIL(b=*source)) {
996 /* lead byte in trail byte position */
997 utf8->toULength=toULength;
998 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
999 break;
1000 }
1001 c=(c<<6)+b;
1002 }
1003 } else {
1004 /* partial-sequence target overflow: fall back to the pivoting implementation */
1005 *pErrorCode=U_USING_DEFAULT_WARNING;
1006 }
1007 }
1008 }
1009
1010 /* write back the updated pointers */
1011 pToUArgs->source=(char *)source;
1012 pFromUArgs->target=(char *)target;
1013}
1014
f3c0d7a5
A
1015U_CDECL_END
1016
b75a7d8f
A
1017/* UTF-8 converter data ----------------------------------------------------- */
1018
1019static const UConverterImpl _UTF8Impl={
1020 UCNV_UTF8,
1021
1022 NULL,
1023 NULL,
1024
1025 NULL,
1026 NULL,
1027 NULL,
1028
374ca955
A
1029 ucnv_toUnicode_UTF8,
1030 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1031 ucnv_fromUnicode_UTF8,
1032 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1033 ucnv_getNextUChar_UTF8,
b75a7d8f
A
1034
1035 NULL,
1036 NULL,
1037 NULL,
1038 NULL,
46f4442e
A
1039 ucnv_getNonSurrogateUnicodeSet,
1040
1041 ucnv_UTF8FromUTF8,
1042 ucnv_UTF8FromUTF8
b75a7d8f
A
1043};
1044
1045/* The 1208 CCSID refers to any version of Unicode of UTF-8 */
1046static const UConverterStaticData _UTF8StaticData={
1047 sizeof(UConverterStaticData),
1048 "UTF-8",
374ca955
A
1049 1208, UCNV_IBM, UCNV_UTF8,
1050 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
b75a7d8f
A
1051 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1052 0,
1053 0,
1054 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1055};
1056
1057
2ca993e8
A
1058const UConverterSharedData _UTF8Data=
1059 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
b75a7d8f
A
1060
1061/* CESU-8 converter data ---------------------------------------------------- */
1062
374ca955
A
1063static const UConverterImpl _CESU8Impl={
1064 UCNV_CESU8,
1065
1066 NULL,
1067 NULL,
1068
1069 NULL,
1070 NULL,
1071 NULL,
1072
1073 ucnv_toUnicode_UTF8,
1074 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1075 ucnv_fromUnicode_UTF8,
1076 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1077 NULL,
1078
1079 NULL,
1080 NULL,
1081 NULL,
1082 NULL,
f3c0d7a5
A
1083 ucnv_getCompleteUnicodeSet,
1084
1085 NULL,
1086 NULL
374ca955
A
1087};
1088
b75a7d8f
A
1089static const UConverterStaticData _CESU8StaticData={
1090 sizeof(UConverterStaticData),
1091 "CESU-8",
73c04bcf
A
1092 9400, /* CCSID for CESU-8 */
1093 UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
b75a7d8f
A
1094 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1095 0,
1096 0,
1097 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1098};
1099
1100
2ca993e8
A
1101const UConverterSharedData _CESU8Data=
1102 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
374ca955
A
1103
1104#endif