]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucnv_u32.c
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / common / ucnv_u32.c
1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u32.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
13 *
14 * UTF-32 converter implementation. Used to be in ucnv_utf.c.
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_CONVERSION
20
21 #include "unicode/ucnv.h"
22 #include "ucnv_bld.h"
23 #include "ucnv_cnv.h"
24 #include "cmemory.h"
25
26 #define MAXIMUM_UCS2 0x0000FFFF
27 #define MAXIMUM_UTF 0x0010FFFF
28 #define HALF_SHIFT 10
29 #define HALF_BASE 0x0010000
30 #define HALF_MASK 0x3FF
31 #define SURROGATE_HIGH_START 0xD800
32 #define SURROGATE_LOW_START 0xDC00
33
34 /* -SURROGATE_LOW_START + HALF_BASE */
35 #define SURROGATE_LOW_BASE 9216
36
37 /* UTF-32BE ----------------------------------------------------------------- */
38
39 static void
40 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
41 UErrorCode * err)
42 {
43 const unsigned char *mySource = (unsigned char *) args->source;
44 UChar *myTarget = args->target;
45 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
46 const UChar *targetLimit = args->targetLimit;
47 unsigned char *toUBytes = args->converter->toUBytes;
48 uint32_t ch, i;
49
50 /* UTF-8 returns here for only non-offset, this needs to change.*/
51 if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
52 i = args->converter->toULength; /* restore # of bytes consumed */
53
54 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
55 args->converter->toUnicodeStatus = 0;
56 goto morebytes;
57 }
58
59 while (mySource < sourceLimit && myTarget < targetLimit) {
60 i = 0;
61 ch = 0;
62 morebytes:
63 while (i < sizeof(uint32_t)) {
64 if (mySource < sourceLimit) {
65 ch = (ch << 8) | (uint8_t)(*mySource);
66 toUBytes[i++] = (char) *(mySource++);
67 }
68 else {
69 /* stores a partially calculated target*/
70 /* + 1 to make 0 a valid character */
71 args->converter->toUnicodeStatus = ch + 1;
72 args->converter->toULength = (int8_t) i;
73 goto donefornow;
74 }
75 }
76
77 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
78 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
79 if (ch <= MAXIMUM_UCS2)
80 {
81 /* fits in 16 bits */
82 *(myTarget++) = (UChar) ch;
83 }
84 else {
85 /* write out the surrogates */
86 *(myTarget++) = U16_LEAD(ch);
87 ch = U16_TRAIL(ch);
88 if (myTarget < targetLimit) {
89 *(myTarget++) = (UChar)ch;
90 }
91 else {
92 /* Put in overflow buffer (not handled here) */
93 args->converter->UCharErrorBuffer[0] = (UChar) ch;
94 args->converter->UCharErrorBufferLength = 1;
95 *err = U_BUFFER_OVERFLOW_ERROR;
96 break;
97 }
98 }
99 }
100 else {
101 args->converter->toULength = (int8_t)i;
102 *err = U_ILLEGAL_CHAR_FOUND;
103 break;
104 }
105 }
106
107 donefornow:
108 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
109 /* End of target buffer */
110 *err = U_BUFFER_OVERFLOW_ERROR;
111 }
112
113 args->target = myTarget;
114 args->source = (const char *) mySource;
115 }
116
117 static void
118 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
119 UErrorCode * err)
120 {
121 const unsigned char *mySource = (unsigned char *) args->source;
122 UChar *myTarget = args->target;
123 int32_t *myOffsets = args->offsets;
124 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
125 const UChar *targetLimit = args->targetLimit;
126 unsigned char *toUBytes = args->converter->toUBytes;
127 uint32_t ch, i;
128 int32_t offsetNum = 0;
129
130 if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
131 i = args->converter->toULength; /* restore # of bytes consumed */
132
133 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
134 args->converter->toUnicodeStatus = 0;
135 goto morebytes;
136 }
137
138 while (mySource < sourceLimit && myTarget < targetLimit) {
139 i = 0;
140 ch = 0;
141 morebytes:
142 while (i < sizeof(uint32_t)) {
143 if (mySource < sourceLimit) {
144 ch = (ch << 8) | (uint8_t)(*mySource);
145 toUBytes[i++] = (char) *(mySource++);
146 }
147 else {
148 /* stores a partially calculated target*/
149 /* + 1 to make 0 a valid character */
150 args->converter->toUnicodeStatus = ch + 1;
151 args->converter->toULength = (int8_t) i;
152 goto donefornow;
153 }
154 }
155
156 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
157 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
158 if (ch <= MAXIMUM_UCS2) {
159 /* fits in 16 bits */
160 *(myTarget++) = (UChar) ch;
161 *(myOffsets++) = offsetNum;
162 }
163 else {
164 /* write out the surrogates */
165 *(myTarget++) = U16_LEAD(ch);
166 *myOffsets++ = offsetNum;
167 ch = U16_TRAIL(ch);
168 if (myTarget < targetLimit)
169 {
170 *(myTarget++) = (UChar)ch;
171 *(myOffsets++) = offsetNum;
172 }
173 else {
174 /* Put in overflow buffer (not handled here) */
175 args->converter->UCharErrorBuffer[0] = (UChar) ch;
176 args->converter->UCharErrorBufferLength = 1;
177 *err = U_BUFFER_OVERFLOW_ERROR;
178 break;
179 }
180 }
181 }
182 else {
183 args->converter->toULength = (int8_t)i;
184 *err = U_ILLEGAL_CHAR_FOUND;
185 break;
186 }
187 offsetNum += i;
188 }
189
190 donefornow:
191 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
192 {
193 /* End of target buffer */
194 *err = U_BUFFER_OVERFLOW_ERROR;
195 }
196
197 args->target = myTarget;
198 args->source = (const char *) mySource;
199 args->offsets = myOffsets;
200 }
201
202 static void
203 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
204 UErrorCode * err)
205 {
206 const UChar *mySource = args->source;
207 unsigned char *myTarget = (unsigned char *) args->target;
208 const UChar *sourceLimit = args->sourceLimit;
209 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
210 UChar32 ch, ch2;
211 unsigned int indexToWrite;
212 unsigned char temp[sizeof(uint32_t)];
213
214 temp[0] = 0;
215
216 if (args->converter->fromUChar32) {
217 ch = args->converter->fromUChar32;
218 args->converter->fromUChar32 = 0;
219 goto lowsurogate;
220 }
221
222 while (mySource < sourceLimit && myTarget < targetLimit) {
223 ch = *(mySource++);
224
225 if (UTF_IS_SURROGATE(ch)) {
226 if (U_IS_LEAD(ch)) {
227 lowsurogate:
228 if (mySource < sourceLimit) {
229 ch2 = *mySource;
230 if (U_IS_TRAIL(ch2)) {
231 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
232 mySource++;
233 }
234 else {
235 /* this is an unmatched trail code unit (2nd surrogate) */
236 /* callback(illegal) */
237 args->converter->fromUChar32 = ch;
238 *err = U_ILLEGAL_CHAR_FOUND;
239 break;
240 }
241 }
242 else {
243 /* ran out of source */
244 args->converter->fromUChar32 = ch;
245 if (args->flush) {
246 /* this is an unmatched trail code unit (2nd surrogate) */
247 /* callback(illegal) */
248 *err = U_ILLEGAL_CHAR_FOUND;
249 }
250 break;
251 }
252 }
253 else {
254 /* this is an unmatched trail code unit (2nd surrogate) */
255 /* callback(illegal) */
256 args->converter->fromUChar32 = ch;
257 *err = U_ILLEGAL_CHAR_FOUND;
258 break;
259 }
260 }
261
262 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
263 temp[1] = (uint8_t) (ch >> 16 & 0x1F);
264 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
265 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
266
267 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
268 if (myTarget < targetLimit) {
269 *(myTarget++) = temp[indexToWrite];
270 }
271 else {
272 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
273 *err = U_BUFFER_OVERFLOW_ERROR;
274 }
275 }
276 }
277
278 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
279 *err = U_BUFFER_OVERFLOW_ERROR;
280 }
281
282 args->target = (char *) myTarget;
283 args->source = mySource;
284 }
285
286 static void
287 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
288 UErrorCode * err)
289 {
290 const UChar *mySource = args->source;
291 unsigned char *myTarget = (unsigned char *) args->target;
292 int32_t *myOffsets = args->offsets;
293 const UChar *sourceLimit = args->sourceLimit;
294 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
295 UChar32 ch, ch2;
296 int32_t offsetNum = 0;
297 unsigned int indexToWrite;
298 unsigned char temp[sizeof(uint32_t)];
299
300 temp[0] = 0;
301
302 if (args->converter->fromUChar32) {
303 ch = args->converter->fromUChar32;
304 args->converter->fromUChar32 = 0;
305 goto lowsurogate;
306 }
307
308 while (mySource < sourceLimit && myTarget < targetLimit) {
309 ch = *(mySource++);
310
311 if (UTF_IS_SURROGATE(ch)) {
312 if (U_IS_LEAD(ch)) {
313 lowsurogate:
314 if (mySource < sourceLimit) {
315 ch2 = *mySource;
316 if (U_IS_TRAIL(ch2)) {
317 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
318 mySource++;
319 }
320 else {
321 /* this is an unmatched trail code unit (2nd surrogate) */
322 /* callback(illegal) */
323 args->converter->fromUChar32 = ch;
324 *err = U_ILLEGAL_CHAR_FOUND;
325 break;
326 }
327 }
328 else {
329 /* ran out of source */
330 args->converter->fromUChar32 = ch;
331 if (args->flush) {
332 /* this is an unmatched trail code unit (2nd surrogate) */
333 /* callback(illegal) */
334 *err = U_ILLEGAL_CHAR_FOUND;
335 }
336 break;
337 }
338 }
339 else {
340 /* this is an unmatched trail code unit (2nd surrogate) */
341 /* callback(illegal) */
342 args->converter->fromUChar32 = ch;
343 *err = U_ILLEGAL_CHAR_FOUND;
344 break;
345 }
346 }
347
348 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
349 temp[1] = (uint8_t) (ch >> 16 & 0x1F);
350 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
351 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
352
353 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
354 if (myTarget < targetLimit) {
355 *(myTarget++) = temp[indexToWrite];
356 *(myOffsets++) = offsetNum;
357 }
358 else {
359 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
360 *err = U_BUFFER_OVERFLOW_ERROR;
361 }
362 }
363 offsetNum++;
364 }
365
366 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
367 *err = U_BUFFER_OVERFLOW_ERROR;
368 }
369
370 args->target = (char *) myTarget;
371 args->source = mySource;
372 args->offsets = myOffsets;
373 }
374
375 static UChar32
376 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
377 UErrorCode* err)
378 {
379 const uint8_t *mySource;
380 UChar32 myUChar;
381 int32_t length;
382
383 mySource = (const uint8_t *)args->source;
384 if (mySource >= (const uint8_t *)args->sourceLimit)
385 {
386 /* no input */
387 *err = U_INDEX_OUTOFBOUNDS_ERROR;
388 return 0xffff;
389 }
390
391 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
392 if (length < 4)
393 {
394 /* got a partial character */
395 uprv_memcpy(args->converter->toUBytes, mySource, length);
396 args->converter->toULength = (int8_t)length;
397 args->source = (const char *)(mySource + length);
398 *err = U_TRUNCATED_CHAR_FOUND;
399 return 0xffff;
400 }
401
402 /* Don't even try to do a direct cast because the value may be on an odd address. */
403 myUChar = ((UChar32)mySource[0] << 24)
404 | ((UChar32)mySource[1] << 16)
405 | ((UChar32)mySource[2] << 8)
406 | ((UChar32)mySource[3]);
407
408 args->source = (const char *)(mySource + 4);
409 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
410 return myUChar;
411 }
412
413 uprv_memcpy(args->converter->toUBytes, mySource, 4);
414 args->converter->toULength = 4;
415
416 *err = U_ILLEGAL_CHAR_FOUND;
417 return 0xffff;
418 }
419
420 static const UConverterImpl _UTF32BEImpl = {
421 UCNV_UTF32_BigEndian,
422
423 NULL,
424 NULL,
425
426 NULL,
427 NULL,
428 NULL,
429
430 T_UConverter_toUnicode_UTF32_BE,
431 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
432 T_UConverter_fromUnicode_UTF32_BE,
433 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
434 T_UConverter_getNextUChar_UTF32_BE,
435
436 NULL,
437 NULL,
438 NULL,
439 NULL,
440 ucnv_getCompleteUnicodeSet
441 };
442
443 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
444 static const UConverterStaticData _UTF32BEStaticData = {
445 sizeof(UConverterStaticData),
446 "UTF-32BE",
447 1232,
448 UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
449 { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
450 0,
451 0,
452 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
453 };
454
455 const UConverterSharedData _UTF32BEData = {
456 sizeof(UConverterSharedData), ~((uint32_t) 0),
457 NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl,
458 0
459 };
460
461 /* UTF-32LE ---------------------------------------------------------- */
462
463 static void
464 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
465 UErrorCode * err)
466 {
467 const unsigned char *mySource = (unsigned char *) args->source;
468 UChar *myTarget = args->target;
469 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
470 const UChar *targetLimit = args->targetLimit;
471 unsigned char *toUBytes = args->converter->toUBytes;
472 uint32_t ch, i;
473
474 /* UTF-8 returns here for only non-offset, this needs to change.*/
475 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
476 {
477 i = args->converter->toULength; /* restore # of bytes consumed */
478
479 /* Stores the previously calculated ch from a previous call*/
480 ch = args->converter->toUnicodeStatus - 1;
481 args->converter->toUnicodeStatus = 0;
482 goto morebytes;
483 }
484
485 while (mySource < sourceLimit && myTarget < targetLimit)
486 {
487 i = 0;
488 ch = 0;
489 morebytes:
490 while (i < sizeof(uint32_t))
491 {
492 if (mySource < sourceLimit)
493 {
494 ch |= ((uint8_t)(*mySource)) << (i * 8);
495 toUBytes[i++] = (char) *(mySource++);
496 }
497 else
498 {
499 /* stores a partially calculated target*/
500 /* + 1 to make 0 a valid character */
501 args->converter->toUnicodeStatus = ch + 1;
502 args->converter->toULength = (int8_t) i;
503 goto donefornow;
504 }
505 }
506
507 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
508 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
509 if (ch <= MAXIMUM_UCS2) {
510 /* fits in 16 bits */
511 *(myTarget++) = (UChar) ch;
512 }
513 else {
514 /* write out the surrogates */
515 *(myTarget++) = U16_LEAD(ch);
516 ch = U16_TRAIL(ch);
517 if (myTarget < targetLimit) {
518 *(myTarget++) = (UChar)ch;
519 }
520 else {
521 /* Put in overflow buffer (not handled here) */
522 args->converter->UCharErrorBuffer[0] = (UChar) ch;
523 args->converter->UCharErrorBufferLength = 1;
524 *err = U_BUFFER_OVERFLOW_ERROR;
525 break;
526 }
527 }
528 }
529 else {
530 args->converter->toULength = (int8_t)i;
531 *err = U_ILLEGAL_CHAR_FOUND;
532 break;
533 }
534 }
535
536 donefornow:
537 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
538 {
539 /* End of target buffer */
540 *err = U_BUFFER_OVERFLOW_ERROR;
541 }
542
543 args->target = myTarget;
544 args->source = (const char *) mySource;
545 }
546
547 static void
548 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
549 UErrorCode * err)
550 {
551 const unsigned char *mySource = (unsigned char *) args->source;
552 UChar *myTarget = args->target;
553 int32_t *myOffsets = args->offsets;
554 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
555 const UChar *targetLimit = args->targetLimit;
556 unsigned char *toUBytes = args->converter->toUBytes;
557 uint32_t ch, i;
558 int32_t offsetNum = 0;
559
560 /* UTF-8 returns here for only non-offset, this needs to change.*/
561 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
562 {
563 i = args->converter->toULength; /* restore # of bytes consumed */
564
565 /* Stores the previously calculated ch from a previous call*/
566 ch = args->converter->toUnicodeStatus - 1;
567 args->converter->toUnicodeStatus = 0;
568 goto morebytes;
569 }
570
571 while (mySource < sourceLimit && myTarget < targetLimit)
572 {
573 i = 0;
574 ch = 0;
575 morebytes:
576 while (i < sizeof(uint32_t))
577 {
578 if (mySource < sourceLimit)
579 {
580 ch |= ((uint8_t)(*mySource)) << (i * 8);
581 toUBytes[i++] = (char) *(mySource++);
582 }
583 else
584 {
585 /* stores a partially calculated target*/
586 /* + 1 to make 0 a valid character */
587 args->converter->toUnicodeStatus = ch + 1;
588 args->converter->toULength = (int8_t) i;
589 goto donefornow;
590 }
591 }
592
593 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
594 {
595 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
596 if (ch <= MAXIMUM_UCS2)
597 {
598 /* fits in 16 bits */
599 *(myTarget++) = (UChar) ch;
600 *(myOffsets++) = offsetNum;
601 }
602 else {
603 /* write out the surrogates */
604 *(myTarget++) = U16_LEAD(ch);
605 *(myOffsets++) = offsetNum;
606 ch = U16_TRAIL(ch);
607 if (myTarget < targetLimit)
608 {
609 *(myTarget++) = (UChar)ch;
610 *(myOffsets++) = offsetNum;
611 }
612 else
613 {
614 /* Put in overflow buffer (not handled here) */
615 args->converter->UCharErrorBuffer[0] = (UChar) ch;
616 args->converter->UCharErrorBufferLength = 1;
617 *err = U_BUFFER_OVERFLOW_ERROR;
618 break;
619 }
620 }
621 }
622 else
623 {
624 args->converter->toULength = (int8_t)i;
625 *err = U_ILLEGAL_CHAR_FOUND;
626 break;
627 }
628 offsetNum += i;
629 }
630
631 donefornow:
632 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
633 {
634 /* End of target buffer */
635 *err = U_BUFFER_OVERFLOW_ERROR;
636 }
637
638 args->target = myTarget;
639 args->source = (const char *) mySource;
640 args->offsets = myOffsets;
641 }
642
643 static void
644 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
645 UErrorCode * err)
646 {
647 const UChar *mySource = args->source;
648 unsigned char *myTarget = (unsigned char *) args->target;
649 const UChar *sourceLimit = args->sourceLimit;
650 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
651 UChar32 ch, ch2;
652 unsigned int indexToWrite;
653 unsigned char temp[sizeof(uint32_t)];
654
655 temp[3] = 0;
656
657 if (args->converter->fromUChar32)
658 {
659 ch = args->converter->fromUChar32;
660 args->converter->fromUChar32 = 0;
661 goto lowsurogate;
662 }
663
664 while (mySource < sourceLimit && myTarget < targetLimit)
665 {
666 ch = *(mySource++);
667
668 if (UTF_IS_SURROGATE(ch)) {
669 if (U_IS_LEAD(ch))
670 {
671 lowsurogate:
672 if (mySource < sourceLimit)
673 {
674 ch2 = *mySource;
675 if (U_IS_TRAIL(ch2)) {
676 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
677 mySource++;
678 }
679 else {
680 /* this is an unmatched trail code unit (2nd surrogate) */
681 /* callback(illegal) */
682 args->converter->fromUChar32 = ch;
683 *err = U_ILLEGAL_CHAR_FOUND;
684 break;
685 }
686 }
687 else {
688 /* ran out of source */
689 args->converter->fromUChar32 = ch;
690 if (args->flush) {
691 /* this is an unmatched trail code unit (2nd surrogate) */
692 /* callback(illegal) */
693 *err = U_ILLEGAL_CHAR_FOUND;
694 }
695 break;
696 }
697 }
698 else {
699 /* this is an unmatched trail code unit (2nd surrogate) */
700 /* callback(illegal) */
701 args->converter->fromUChar32 = ch;
702 *err = U_ILLEGAL_CHAR_FOUND;
703 break;
704 }
705 }
706
707 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
708 temp[2] = (uint8_t) (ch >> 16 & 0x1F);
709 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
710 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
711
712 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
713 {
714 if (myTarget < targetLimit)
715 {
716 *(myTarget++) = temp[indexToWrite];
717 }
718 else
719 {
720 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
721 *err = U_BUFFER_OVERFLOW_ERROR;
722 }
723 }
724 }
725
726 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
727 {
728 *err = U_BUFFER_OVERFLOW_ERROR;
729 }
730
731 args->target = (char *) myTarget;
732 args->source = mySource;
733 }
734
735 static void
736 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
737 UErrorCode * err)
738 {
739 const UChar *mySource = args->source;
740 unsigned char *myTarget = (unsigned char *) args->target;
741 int32_t *myOffsets = args->offsets;
742 const UChar *sourceLimit = args->sourceLimit;
743 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
744 UChar32 ch, ch2;
745 unsigned int indexToWrite;
746 unsigned char temp[sizeof(uint32_t)];
747 int32_t offsetNum = 0;
748
749 temp[3] = 0;
750
751 if (args->converter->fromUChar32)
752 {
753 ch = args->converter->fromUChar32;
754 args->converter->fromUChar32 = 0;
755 goto lowsurogate;
756 }
757
758 while (mySource < sourceLimit && myTarget < targetLimit)
759 {
760 ch = *(mySource++);
761
762 if (UTF_IS_SURROGATE(ch)) {
763 if (U_IS_LEAD(ch))
764 {
765 lowsurogate:
766 if (mySource < sourceLimit)
767 {
768 ch2 = *mySource;
769 if (U_IS_TRAIL(ch2))
770 {
771 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
772 mySource++;
773 }
774 else {
775 /* this is an unmatched trail code unit (2nd surrogate) */
776 /* callback(illegal) */
777 args->converter->fromUChar32 = ch;
778 *err = U_ILLEGAL_CHAR_FOUND;
779 break;
780 }
781 }
782 else {
783 /* ran out of source */
784 args->converter->fromUChar32 = ch;
785 if (args->flush) {
786 /* this is an unmatched trail code unit (2nd surrogate) */
787 /* callback(illegal) */
788 *err = U_ILLEGAL_CHAR_FOUND;
789 }
790 break;
791 }
792 }
793 else {
794 /* this is an unmatched trail code unit (2nd surrogate) */
795 /* callback(illegal) */
796 args->converter->fromUChar32 = ch;
797 *err = U_ILLEGAL_CHAR_FOUND;
798 break;
799 }
800 }
801
802 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
803 temp[2] = (uint8_t) (ch >> 16 & 0x1F);
804 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
805 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
806
807 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
808 {
809 if (myTarget < targetLimit)
810 {
811 *(myTarget++) = temp[indexToWrite];
812 *(myOffsets++) = offsetNum;
813 }
814 else
815 {
816 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
817 *err = U_BUFFER_OVERFLOW_ERROR;
818 }
819 }
820 offsetNum++;
821 }
822
823 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
824 {
825 *err = U_BUFFER_OVERFLOW_ERROR;
826 }
827
828 args->target = (char *) myTarget;
829 args->source = mySource;
830 args->offsets = myOffsets;
831 }
832
833 static UChar32
834 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
835 UErrorCode* err)
836 {
837 const uint8_t *mySource;
838 UChar32 myUChar;
839 int32_t length;
840
841 mySource = (const uint8_t *)args->source;
842 if (mySource >= (const uint8_t *)args->sourceLimit)
843 {
844 /* no input */
845 *err = U_INDEX_OUTOFBOUNDS_ERROR;
846 return 0xffff;
847 }
848
849 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
850 if (length < 4)
851 {
852 /* got a partial character */
853 uprv_memcpy(args->converter->toUBytes, mySource, length);
854 args->converter->toULength = (int8_t)length;
855 args->source = (const char *)(mySource + length);
856 *err = U_TRUNCATED_CHAR_FOUND;
857 return 0xffff;
858 }
859
860 /* Don't even try to do a direct cast because the value may be on an odd address. */
861 myUChar = ((UChar32)mySource[3] << 24)
862 | ((UChar32)mySource[2] << 16)
863 | ((UChar32)mySource[1] << 8)
864 | ((UChar32)mySource[0]);
865
866 args->source = (const char *)(mySource + 4);
867 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
868 return myUChar;
869 }
870
871 uprv_memcpy(args->converter->toUBytes, mySource, 4);
872 args->converter->toULength = 4;
873
874 *err = U_ILLEGAL_CHAR_FOUND;
875 return 0xffff;
876 }
877
878 static const UConverterImpl _UTF32LEImpl = {
879 UCNV_UTF32_LittleEndian,
880
881 NULL,
882 NULL,
883
884 NULL,
885 NULL,
886 NULL,
887
888 T_UConverter_toUnicode_UTF32_LE,
889 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
890 T_UConverter_fromUnicode_UTF32_LE,
891 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
892 T_UConverter_getNextUChar_UTF32_LE,
893
894 NULL,
895 NULL,
896 NULL,
897 NULL,
898 ucnv_getCompleteUnicodeSet
899 };
900
901 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
902 static const UConverterStaticData _UTF32LEStaticData = {
903 sizeof(UConverterStaticData),
904 "UTF-32LE",
905 1234,
906 UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
907 { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
908 0,
909 0,
910 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
911 };
912
913
914 const UConverterSharedData _UTF32LEData = {
915 sizeof(UConverterSharedData), ~((uint32_t) 0),
916 NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl,
917 0
918 };
919
920 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
921
922 /*
923 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
924 * accordingly.
925 *
926 * State values:
927 * 0 initial state
928 * 1 saw 00
929 * 2 saw 00 00
930 * 3 saw 00 00 FE
931 * 4 -
932 * 5 saw FF
933 * 6 saw FF FE
934 * 7 saw FF FE 00
935 * 8 UTF-32BE mode
936 * 9 UTF-32LE mode
937 *
938 * During detection: state&3==number of matching bytes so far.
939 *
940 * On output, emit U+FEFF as the first code point.
941 */
942
943 static void
944 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
945 if(choice<=UCNV_RESET_TO_UNICODE) {
946 /* reset toUnicode: state=0 */
947 cnv->mode=0;
948 }
949 if(choice!=UCNV_RESET_TO_UNICODE) {
950 /* reset fromUnicode: prepare to output the UTF-32PE BOM */
951 cnv->charErrorBufferLength=4;
952 #if U_IS_BIG_ENDIAN
953 cnv->charErrorBuffer[0]=0;
954 cnv->charErrorBuffer[1]=0;
955 cnv->charErrorBuffer[2]=0xfe;
956 cnv->charErrorBuffer[3]=0xff;
957 #else
958 cnv->charErrorBuffer[0]=0xff;
959 cnv->charErrorBuffer[1]=0xfe;
960 cnv->charErrorBuffer[2]=0;
961 cnv->charErrorBuffer[3]=0;
962 #endif
963 }
964 }
965
966 static void
967 _UTF32Open(UConverter *cnv,
968 const char *name,
969 const char *locale,
970 uint32_t options,
971 UErrorCode *pErrorCode) {
972 _UTF32Reset(cnv, UCNV_RESET_BOTH);
973 }
974
/* BOM byte sequences: the big-endian form in [0..3], the little-endian form in [4..7] */
static const char utf32BOM[8]={
    0, 0, (char)0xfe, (char)0xff,   /* 00 00 FE FF */
    (char)0xff, (char)0xfe, 0, 0    /* FF FE 00 00 */
};
976
977 static void
978 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
979 UErrorCode *pErrorCode) {
980 UConverter *cnv=pArgs->converter;
981 const char *source=pArgs->source;
982 const char *sourceLimit=pArgs->sourceLimit;
983 int32_t *offsets=pArgs->offsets;
984
985 int32_t state, offsetDelta;
986 char b;
987
988 state=cnv->mode;
989
990 /*
991 * If we detect a BOM in this buffer, then we must add the BOM size to the
992 * offsets because the actual converter function will not see and count the BOM.
993 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
994 */
995 offsetDelta=0;
996
997 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
998 switch(state) {
999 case 0:
1000 b=*source;
1001 if(b==0) {
1002 state=1; /* could be 00 00 FE FF */
1003 } else if(b==(char)0xff) {
1004 state=5; /* could be FF FE 00 00 */
1005 } else {
1006 state=8; /* default to UTF-32BE */
1007 continue;
1008 }
1009 ++source;
1010 break;
1011 case 1:
1012 case 2:
1013 case 3:
1014 case 5:
1015 case 6:
1016 case 7:
1017 if(*source==utf32BOM[state]) {
1018 ++state;
1019 ++source;
1020 if(state==4) {
1021 state=8; /* detect UTF-32BE */
1022 offsetDelta=source-pArgs->source;
1023 } else if(state==8) {
1024 state=9; /* detect UTF-32LE */
1025 offsetDelta=source-pArgs->source;
1026 }
1027 } else {
1028 /* switch to UTF-32BE and pass the previous bytes */
1029 int32_t count=source-pArgs->source; /* number of bytes from this buffer */
1030
1031 /* reset the source */
1032 source=pArgs->source;
1033
1034 if(count==(state&3)) {
1035 /* simple: all in the same buffer, just reset source */
1036 } else {
1037 UBool oldFlush=pArgs->flush;
1038
1039 /* some of the bytes are from a previous buffer, replay those first */
1040 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1041 pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
1042 pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
1043
1044 /* no offsets: bytes from previous buffer, and not enough for output */
1045 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1046
1047 /* restore real pointers; pArgs->source will be set in case 8/9 */
1048 pArgs->sourceLimit=sourceLimit;
1049 pArgs->flush=oldFlush;
1050 }
1051 state=8;
1052 continue;
1053 }
1054 break;
1055 case 8:
1056 /* call UTF-32BE */
1057 pArgs->source=source;
1058 if(offsets==NULL) {
1059 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1060 } else {
1061 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
1062 }
1063 source=pArgs->source;
1064 break;
1065 case 9:
1066 /* call UTF-32LE */
1067 pArgs->source=source;
1068 if(offsets==NULL) {
1069 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1070 } else {
1071 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
1072 }
1073 source=pArgs->source;
1074 break;
1075 default:
1076 break; /* does not occur */
1077 }
1078 }
1079
1080 /* add BOM size to offsets - see comment at offsetDelta declaration */
1081 if(offsets!=NULL && offsetDelta!=0) {
1082 int32_t *offsetsLimit=pArgs->offsets;
1083 while(offsets<offsetsLimit) {
1084 *offsets++ += offsetDelta;
1085 }
1086 }
1087
1088 pArgs->source=source;
1089
1090 if(source==sourceLimit && pArgs->flush) {
1091 /* handle truncated input */
1092 switch(state) {
1093 case 0:
1094 break; /* no input at all, nothing to do */
1095 case 8:
1096 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1097 break;
1098 case 9:
1099 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1100 break;
1101 default:
1102 /* handle 0<state<8: call UTF-32BE with too-short input */
1103 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1104 pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
1105
1106 /* no offsets: not enough for output */
1107 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1108 pArgs->source=source;
1109 pArgs->sourceLimit=sourceLimit;
1110 state=8;
1111 break;
1112 }
1113 }
1114
1115 cnv->mode=state;
1116 }
1117
1118 static UChar32
1119 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
1120 UErrorCode *pErrorCode) {
1121 switch(pArgs->converter->mode) {
1122 case 8:
1123 return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
1124 case 9:
1125 return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
1126 default:
1127 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1128 }
1129 }
1130
1131 static const UConverterImpl _UTF32Impl = {
1132 UCNV_UTF32,
1133
1134 NULL,
1135 NULL,
1136
1137 _UTF32Open,
1138 NULL,
1139 _UTF32Reset,
1140
1141 _UTF32ToUnicodeWithOffsets,
1142 _UTF32ToUnicodeWithOffsets,
1143 #if U_IS_BIG_ENDIAN
1144 T_UConverter_fromUnicode_UTF32_BE,
1145 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
1146 #else
1147 T_UConverter_fromUnicode_UTF32_LE,
1148 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
1149 #endif
1150 _UTF32GetNextUChar,
1151
1152 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1153 NULL,
1154 NULL,
1155 NULL,
1156 ucnv_getCompleteUnicodeSet
1157 };
1158
1159 static const UConverterStaticData _UTF32StaticData = {
1160 sizeof(UConverterStaticData),
1161 "UTF-32",
1162 0, /* ### TODO review correctness of all Unicode CCSIDs */
1163 UCNV_IBM, UCNV_UTF32, 4, 4,
1164 #if U_IS_BIG_ENDIAN
1165 { 0, 0, 0xff, 0xfd }, 4,
1166 #else
1167 { 0xfd, 0xff, 0, 0 }, 4,
1168 #endif
1169 FALSE, FALSE,
1170 0,
1171 0,
1172 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1173 };
1174
1175 const UConverterSharedData _UTF32Data = {
1176 sizeof(UConverterSharedData), ~((uint32_t) 0),
1177 NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl,
1178 0
1179 };
1180
1181 #endif