]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucnv_u32.c
ICU-531.30.tar.gz
[apple/icu.git] / icuSources / common / ucnv_u32.c
1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u32.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
13 *
14 * UTF-32 converter implementation. Used to be in ucnv_utf.c.
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_CONVERSION
20
21 #include "unicode/ucnv.h"
22 #include "unicode/utf.h"
23 #include "ucnv_bld.h"
24 #include "ucnv_cnv.h"
25 #include "cmemory.h"
26
/* Largest code point that fits in a single 16-bit unit, and largest code point overall. */
#define MAXIMUM_UCS2            0x0000FFFF
#define MAXIMUM_UTF             0x0010FFFF

/* Building blocks for combining a surrogate pair into one code point. */
#define HALF_SHIFT              10
#define HALF_BASE               0x0010000
#define HALF_MASK               0x3FF
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_LOW_START     0xDC00

/*
 * Constant added when combining a pair:
 *   ((lead - SURROGATE_HIGH_START) << HALF_SHIFT) + trail + SURROGATE_LOW_BASE
 * It equals HALF_BASE - SURROGATE_LOW_START, i.e. 9216.
 */
#define SURROGATE_LOW_BASE      (HALF_BASE - SURROGATE_LOW_START)

/* Value of UConverter.fromUnicodeStatus that requests a BOM before the first output. */
enum {
    UCNV_NEED_TO_WRITE_BOM=1
};
41
42 /* UTF-32BE ----------------------------------------------------------------- */
43
44 static void
45 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
46 UErrorCode * err)
47 {
48 const unsigned char *mySource = (unsigned char *) args->source;
49 UChar *myTarget = args->target;
50 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
51 const UChar *targetLimit = args->targetLimit;
52 unsigned char *toUBytes = args->converter->toUBytes;
53 uint32_t ch, i;
54
55 /* Restore state of current sequence */
56 if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
57 i = args->converter->toULength; /* restore # of bytes consumed */
58 args->converter->toULength = 0;
59
60 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
61 args->converter->toUnicodeStatus = 0;
62 goto morebytes;
63 }
64
65 while (mySource < sourceLimit && myTarget < targetLimit) {
66 i = 0;
67 ch = 0;
68 morebytes:
69 while (i < sizeof(uint32_t)) {
70 if (mySource < sourceLimit) {
71 ch = (ch << 8) | (uint8_t)(*mySource);
72 toUBytes[i++] = (char) *(mySource++);
73 }
74 else {
75 /* stores a partially calculated target*/
76 /* + 1 to make 0 a valid character */
77 args->converter->toUnicodeStatus = ch + 1;
78 args->converter->toULength = (int8_t) i;
79 goto donefornow;
80 }
81 }
82
83 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
84 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
85 if (ch <= MAXIMUM_UCS2)
86 {
87 /* fits in 16 bits */
88 *(myTarget++) = (UChar) ch;
89 }
90 else {
91 /* write out the surrogates */
92 *(myTarget++) = U16_LEAD(ch);
93 ch = U16_TRAIL(ch);
94 if (myTarget < targetLimit) {
95 *(myTarget++) = (UChar)ch;
96 }
97 else {
98 /* Put in overflow buffer (not handled here) */
99 args->converter->UCharErrorBuffer[0] = (UChar) ch;
100 args->converter->UCharErrorBufferLength = 1;
101 *err = U_BUFFER_OVERFLOW_ERROR;
102 break;
103 }
104 }
105 }
106 else {
107 args->converter->toULength = (int8_t)i;
108 *err = U_ILLEGAL_CHAR_FOUND;
109 break;
110 }
111 }
112
113 donefornow:
114 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
115 /* End of target buffer */
116 *err = U_BUFFER_OVERFLOW_ERROR;
117 }
118
119 args->target = myTarget;
120 args->source = (const char *) mySource;
121 }
122
123 static void
124 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
125 UErrorCode * err)
126 {
127 const unsigned char *mySource = (unsigned char *) args->source;
128 UChar *myTarget = args->target;
129 int32_t *myOffsets = args->offsets;
130 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
131 const UChar *targetLimit = args->targetLimit;
132 unsigned char *toUBytes = args->converter->toUBytes;
133 uint32_t ch, i;
134 int32_t offsetNum = 0;
135
136 /* Restore state of current sequence */
137 if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
138 i = args->converter->toULength; /* restore # of bytes consumed */
139 args->converter->toULength = 0;
140
141 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
142 args->converter->toUnicodeStatus = 0;
143 goto morebytes;
144 }
145
146 while (mySource < sourceLimit && myTarget < targetLimit) {
147 i = 0;
148 ch = 0;
149 morebytes:
150 while (i < sizeof(uint32_t)) {
151 if (mySource < sourceLimit) {
152 ch = (ch << 8) | (uint8_t)(*mySource);
153 toUBytes[i++] = (char) *(mySource++);
154 }
155 else {
156 /* stores a partially calculated target*/
157 /* + 1 to make 0 a valid character */
158 args->converter->toUnicodeStatus = ch + 1;
159 args->converter->toULength = (int8_t) i;
160 goto donefornow;
161 }
162 }
163
164 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
165 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
166 if (ch <= MAXIMUM_UCS2) {
167 /* fits in 16 bits */
168 *(myTarget++) = (UChar) ch;
169 *(myOffsets++) = offsetNum;
170 }
171 else {
172 /* write out the surrogates */
173 *(myTarget++) = U16_LEAD(ch);
174 *myOffsets++ = offsetNum;
175 ch = U16_TRAIL(ch);
176 if (myTarget < targetLimit)
177 {
178 *(myTarget++) = (UChar)ch;
179 *(myOffsets++) = offsetNum;
180 }
181 else {
182 /* Put in overflow buffer (not handled here) */
183 args->converter->UCharErrorBuffer[0] = (UChar) ch;
184 args->converter->UCharErrorBufferLength = 1;
185 *err = U_BUFFER_OVERFLOW_ERROR;
186 break;
187 }
188 }
189 }
190 else {
191 args->converter->toULength = (int8_t)i;
192 *err = U_ILLEGAL_CHAR_FOUND;
193 break;
194 }
195 offsetNum += i;
196 }
197
198 donefornow:
199 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
200 {
201 /* End of target buffer */
202 *err = U_BUFFER_OVERFLOW_ERROR;
203 }
204
205 args->target = myTarget;
206 args->source = (const char *) mySource;
207 args->offsets = myOffsets;
208 }
209
210 static void
211 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
212 UErrorCode * err)
213 {
214 const UChar *mySource = args->source;
215 unsigned char *myTarget;
216 const UChar *sourceLimit = args->sourceLimit;
217 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
218 UChar32 ch, ch2;
219 unsigned int indexToWrite;
220 unsigned char temp[sizeof(uint32_t)];
221
222 if(mySource >= sourceLimit) {
223 /* no input, nothing to do */
224 return;
225 }
226
227 /* write the BOM if necessary */
228 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
229 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
230 ucnv_fromUWriteBytes(args->converter,
231 bom, 4,
232 &args->target, args->targetLimit,
233 &args->offsets, -1,
234 err);
235 args->converter->fromUnicodeStatus=0;
236 }
237
238 myTarget = (unsigned char *) args->target;
239 temp[0] = 0;
240
241 if (args->converter->fromUChar32) {
242 ch = args->converter->fromUChar32;
243 args->converter->fromUChar32 = 0;
244 goto lowsurogate;
245 }
246
247 while (mySource < sourceLimit && myTarget < targetLimit) {
248 ch = *(mySource++);
249
250 if (U_IS_SURROGATE(ch)) {
251 if (U_IS_LEAD(ch)) {
252 lowsurogate:
253 if (mySource < sourceLimit) {
254 ch2 = *mySource;
255 if (U_IS_TRAIL(ch2)) {
256 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
257 mySource++;
258 }
259 else {
260 /* this is an unmatched trail code unit (2nd surrogate) */
261 /* callback(illegal) */
262 args->converter->fromUChar32 = ch;
263 *err = U_ILLEGAL_CHAR_FOUND;
264 break;
265 }
266 }
267 else {
268 /* ran out of source */
269 args->converter->fromUChar32 = ch;
270 if (args->flush) {
271 /* this is an unmatched trail code unit (2nd surrogate) */
272 /* callback(illegal) */
273 *err = U_ILLEGAL_CHAR_FOUND;
274 }
275 break;
276 }
277 }
278 else {
279 /* this is an unmatched trail code unit (2nd surrogate) */
280 /* callback(illegal) */
281 args->converter->fromUChar32 = ch;
282 *err = U_ILLEGAL_CHAR_FOUND;
283 break;
284 }
285 }
286
287 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
288 temp[1] = (uint8_t) (ch >> 16 & 0x1F);
289 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
290 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
291
292 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
293 if (myTarget < targetLimit) {
294 *(myTarget++) = temp[indexToWrite];
295 }
296 else {
297 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
298 *err = U_BUFFER_OVERFLOW_ERROR;
299 }
300 }
301 }
302
303 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
304 *err = U_BUFFER_OVERFLOW_ERROR;
305 }
306
307 args->target = (char *) myTarget;
308 args->source = mySource;
309 }
310
311 static void
312 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
313 UErrorCode * err)
314 {
315 const UChar *mySource = args->source;
316 unsigned char *myTarget;
317 int32_t *myOffsets;
318 const UChar *sourceLimit = args->sourceLimit;
319 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
320 UChar32 ch, ch2;
321 int32_t offsetNum = 0;
322 unsigned int indexToWrite;
323 unsigned char temp[sizeof(uint32_t)];
324
325 if(mySource >= sourceLimit) {
326 /* no input, nothing to do */
327 return;
328 }
329
330 /* write the BOM if necessary */
331 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
332 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
333 ucnv_fromUWriteBytes(args->converter,
334 bom, 4,
335 &args->target, args->targetLimit,
336 &args->offsets, -1,
337 err);
338 args->converter->fromUnicodeStatus=0;
339 }
340
341 myTarget = (unsigned char *) args->target;
342 myOffsets = args->offsets;
343 temp[0] = 0;
344
345 if (args->converter->fromUChar32) {
346 ch = args->converter->fromUChar32;
347 args->converter->fromUChar32 = 0;
348 goto lowsurogate;
349 }
350
351 while (mySource < sourceLimit && myTarget < targetLimit) {
352 ch = *(mySource++);
353
354 if (U_IS_SURROGATE(ch)) {
355 if (U_IS_LEAD(ch)) {
356 lowsurogate:
357 if (mySource < sourceLimit) {
358 ch2 = *mySource;
359 if (U_IS_TRAIL(ch2)) {
360 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
361 mySource++;
362 }
363 else {
364 /* this is an unmatched trail code unit (2nd surrogate) */
365 /* callback(illegal) */
366 args->converter->fromUChar32 = ch;
367 *err = U_ILLEGAL_CHAR_FOUND;
368 break;
369 }
370 }
371 else {
372 /* ran out of source */
373 args->converter->fromUChar32 = ch;
374 if (args->flush) {
375 /* this is an unmatched trail code unit (2nd surrogate) */
376 /* callback(illegal) */
377 *err = U_ILLEGAL_CHAR_FOUND;
378 }
379 break;
380 }
381 }
382 else {
383 /* this is an unmatched trail code unit (2nd surrogate) */
384 /* callback(illegal) */
385 args->converter->fromUChar32 = ch;
386 *err = U_ILLEGAL_CHAR_FOUND;
387 break;
388 }
389 }
390
391 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
392 temp[1] = (uint8_t) (ch >> 16 & 0x1F);
393 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
394 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
395
396 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
397 if (myTarget < targetLimit) {
398 *(myTarget++) = temp[indexToWrite];
399 *(myOffsets++) = offsetNum;
400 }
401 else {
402 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
403 *err = U_BUFFER_OVERFLOW_ERROR;
404 }
405 }
406 offsetNum = offsetNum + 1 + (temp[1] != 0);
407 }
408
409 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
410 *err = U_BUFFER_OVERFLOW_ERROR;
411 }
412
413 args->target = (char *) myTarget;
414 args->source = mySource;
415 args->offsets = myOffsets;
416 }
417
418 static UChar32
419 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
420 UErrorCode* err)
421 {
422 const uint8_t *mySource;
423 UChar32 myUChar;
424 int32_t length;
425
426 mySource = (const uint8_t *)args->source;
427 if (mySource >= (const uint8_t *)args->sourceLimit)
428 {
429 /* no input */
430 *err = U_INDEX_OUTOFBOUNDS_ERROR;
431 return 0xffff;
432 }
433
434 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
435 if (length < 4)
436 {
437 /* got a partial character */
438 uprv_memcpy(args->converter->toUBytes, mySource, length);
439 args->converter->toULength = (int8_t)length;
440 args->source = (const char *)(mySource + length);
441 *err = U_TRUNCATED_CHAR_FOUND;
442 return 0xffff;
443 }
444
445 /* Don't even try to do a direct cast because the value may be on an odd address. */
446 myUChar = ((UChar32)mySource[0] << 24)
447 | ((UChar32)mySource[1] << 16)
448 | ((UChar32)mySource[2] << 8)
449 | ((UChar32)mySource[3]);
450
451 args->source = (const char *)(mySource + 4);
452 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
453 return myUChar;
454 }
455
456 uprv_memcpy(args->converter->toUBytes, mySource, 4);
457 args->converter->toULength = 4;
458
459 *err = U_ILLEGAL_CHAR_FOUND;
460 return 0xffff;
461 }
462
463 static const UConverterImpl _UTF32BEImpl = {
464 UCNV_UTF32_BigEndian,
465
466 NULL,
467 NULL,
468
469 NULL,
470 NULL,
471 NULL,
472
473 T_UConverter_toUnicode_UTF32_BE,
474 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
475 T_UConverter_fromUnicode_UTF32_BE,
476 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
477 T_UConverter_getNextUChar_UTF32_BE,
478
479 NULL,
480 NULL,
481 NULL,
482 NULL,
483 ucnv_getNonSurrogateUnicodeSet
484 };
485
486 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
487 static const UConverterStaticData _UTF32BEStaticData = {
488 sizeof(UConverterStaticData),
489 "UTF-32BE",
490 1232,
491 UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
492 { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
493 0,
494 0,
495 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
496 };
497
498 const UConverterSharedData _UTF32BEData = {
499 sizeof(UConverterSharedData), ~((uint32_t) 0),
500 NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl,
501 0
502 };
503
504 /* UTF-32LE ---------------------------------------------------------- */
505
506 static void
507 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
508 UErrorCode * err)
509 {
510 const unsigned char *mySource = (unsigned char *) args->source;
511 UChar *myTarget = args->target;
512 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
513 const UChar *targetLimit = args->targetLimit;
514 unsigned char *toUBytes = args->converter->toUBytes;
515 uint32_t ch, i;
516
517 /* Restore state of current sequence */
518 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
519 {
520 i = args->converter->toULength; /* restore # of bytes consumed */
521 args->converter->toULength = 0;
522
523 /* Stores the previously calculated ch from a previous call*/
524 ch = args->converter->toUnicodeStatus - 1;
525 args->converter->toUnicodeStatus = 0;
526 goto morebytes;
527 }
528
529 while (mySource < sourceLimit && myTarget < targetLimit)
530 {
531 i = 0;
532 ch = 0;
533 morebytes:
534 while (i < sizeof(uint32_t))
535 {
536 if (mySource < sourceLimit)
537 {
538 ch |= ((uint8_t)(*mySource)) << (i * 8);
539 toUBytes[i++] = (char) *(mySource++);
540 }
541 else
542 {
543 /* stores a partially calculated target*/
544 /* + 1 to make 0 a valid character */
545 args->converter->toUnicodeStatus = ch + 1;
546 args->converter->toULength = (int8_t) i;
547 goto donefornow;
548 }
549 }
550
551 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
552 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
553 if (ch <= MAXIMUM_UCS2) {
554 /* fits in 16 bits */
555 *(myTarget++) = (UChar) ch;
556 }
557 else {
558 /* write out the surrogates */
559 *(myTarget++) = U16_LEAD(ch);
560 ch = U16_TRAIL(ch);
561 if (myTarget < targetLimit) {
562 *(myTarget++) = (UChar)ch;
563 }
564 else {
565 /* Put in overflow buffer (not handled here) */
566 args->converter->UCharErrorBuffer[0] = (UChar) ch;
567 args->converter->UCharErrorBufferLength = 1;
568 *err = U_BUFFER_OVERFLOW_ERROR;
569 break;
570 }
571 }
572 }
573 else {
574 args->converter->toULength = (int8_t)i;
575 *err = U_ILLEGAL_CHAR_FOUND;
576 break;
577 }
578 }
579
580 donefornow:
581 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
582 {
583 /* End of target buffer */
584 *err = U_BUFFER_OVERFLOW_ERROR;
585 }
586
587 args->target = myTarget;
588 args->source = (const char *) mySource;
589 }
590
591 static void
592 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
593 UErrorCode * err)
594 {
595 const unsigned char *mySource = (unsigned char *) args->source;
596 UChar *myTarget = args->target;
597 int32_t *myOffsets = args->offsets;
598 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
599 const UChar *targetLimit = args->targetLimit;
600 unsigned char *toUBytes = args->converter->toUBytes;
601 uint32_t ch, i;
602 int32_t offsetNum = 0;
603
604 /* Restore state of current sequence */
605 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
606 {
607 i = args->converter->toULength; /* restore # of bytes consumed */
608 args->converter->toULength = 0;
609
610 /* Stores the previously calculated ch from a previous call*/
611 ch = args->converter->toUnicodeStatus - 1;
612 args->converter->toUnicodeStatus = 0;
613 goto morebytes;
614 }
615
616 while (mySource < sourceLimit && myTarget < targetLimit)
617 {
618 i = 0;
619 ch = 0;
620 morebytes:
621 while (i < sizeof(uint32_t))
622 {
623 if (mySource < sourceLimit)
624 {
625 ch |= ((uint8_t)(*mySource)) << (i * 8);
626 toUBytes[i++] = (char) *(mySource++);
627 }
628 else
629 {
630 /* stores a partially calculated target*/
631 /* + 1 to make 0 a valid character */
632 args->converter->toUnicodeStatus = ch + 1;
633 args->converter->toULength = (int8_t) i;
634 goto donefornow;
635 }
636 }
637
638 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
639 {
640 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
641 if (ch <= MAXIMUM_UCS2)
642 {
643 /* fits in 16 bits */
644 *(myTarget++) = (UChar) ch;
645 *(myOffsets++) = offsetNum;
646 }
647 else {
648 /* write out the surrogates */
649 *(myTarget++) = U16_LEAD(ch);
650 *(myOffsets++) = offsetNum;
651 ch = U16_TRAIL(ch);
652 if (myTarget < targetLimit)
653 {
654 *(myTarget++) = (UChar)ch;
655 *(myOffsets++) = offsetNum;
656 }
657 else
658 {
659 /* Put in overflow buffer (not handled here) */
660 args->converter->UCharErrorBuffer[0] = (UChar) ch;
661 args->converter->UCharErrorBufferLength = 1;
662 *err = U_BUFFER_OVERFLOW_ERROR;
663 break;
664 }
665 }
666 }
667 else
668 {
669 args->converter->toULength = (int8_t)i;
670 *err = U_ILLEGAL_CHAR_FOUND;
671 break;
672 }
673 offsetNum += i;
674 }
675
676 donefornow:
677 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
678 {
679 /* End of target buffer */
680 *err = U_BUFFER_OVERFLOW_ERROR;
681 }
682
683 args->target = myTarget;
684 args->source = (const char *) mySource;
685 args->offsets = myOffsets;
686 }
687
688 static void
689 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
690 UErrorCode * err)
691 {
692 const UChar *mySource = args->source;
693 unsigned char *myTarget;
694 const UChar *sourceLimit = args->sourceLimit;
695 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
696 UChar32 ch, ch2;
697 unsigned int indexToWrite;
698 unsigned char temp[sizeof(uint32_t)];
699
700 if(mySource >= sourceLimit) {
701 /* no input, nothing to do */
702 return;
703 }
704
705 /* write the BOM if necessary */
706 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
707 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
708 ucnv_fromUWriteBytes(args->converter,
709 bom, 4,
710 &args->target, args->targetLimit,
711 &args->offsets, -1,
712 err);
713 args->converter->fromUnicodeStatus=0;
714 }
715
716 myTarget = (unsigned char *) args->target;
717 temp[3] = 0;
718
719 if (args->converter->fromUChar32)
720 {
721 ch = args->converter->fromUChar32;
722 args->converter->fromUChar32 = 0;
723 goto lowsurogate;
724 }
725
726 while (mySource < sourceLimit && myTarget < targetLimit)
727 {
728 ch = *(mySource++);
729
730 if (U16_IS_SURROGATE(ch)) {
731 if (U16_IS_LEAD(ch))
732 {
733 lowsurogate:
734 if (mySource < sourceLimit)
735 {
736 ch2 = *mySource;
737 if (U16_IS_TRAIL(ch2)) {
738 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
739 mySource++;
740 }
741 else {
742 /* this is an unmatched trail code unit (2nd surrogate) */
743 /* callback(illegal) */
744 args->converter->fromUChar32 = ch;
745 *err = U_ILLEGAL_CHAR_FOUND;
746 break;
747 }
748 }
749 else {
750 /* ran out of source */
751 args->converter->fromUChar32 = ch;
752 if (args->flush) {
753 /* this is an unmatched trail code unit (2nd surrogate) */
754 /* callback(illegal) */
755 *err = U_ILLEGAL_CHAR_FOUND;
756 }
757 break;
758 }
759 }
760 else {
761 /* this is an unmatched trail code unit (2nd surrogate) */
762 /* callback(illegal) */
763 args->converter->fromUChar32 = ch;
764 *err = U_ILLEGAL_CHAR_FOUND;
765 break;
766 }
767 }
768
769 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
770 temp[2] = (uint8_t) (ch >> 16 & 0x1F);
771 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
772 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
773
774 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
775 {
776 if (myTarget < targetLimit)
777 {
778 *(myTarget++) = temp[indexToWrite];
779 }
780 else
781 {
782 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
783 *err = U_BUFFER_OVERFLOW_ERROR;
784 }
785 }
786 }
787
788 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
789 {
790 *err = U_BUFFER_OVERFLOW_ERROR;
791 }
792
793 args->target = (char *) myTarget;
794 args->source = mySource;
795 }
796
797 static void
798 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
799 UErrorCode * err)
800 {
801 const UChar *mySource = args->source;
802 unsigned char *myTarget;
803 int32_t *myOffsets;
804 const UChar *sourceLimit = args->sourceLimit;
805 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
806 UChar32 ch, ch2;
807 unsigned int indexToWrite;
808 unsigned char temp[sizeof(uint32_t)];
809 int32_t offsetNum = 0;
810
811 if(mySource >= sourceLimit) {
812 /* no input, nothing to do */
813 return;
814 }
815
816 /* write the BOM if necessary */
817 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
818 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
819 ucnv_fromUWriteBytes(args->converter,
820 bom, 4,
821 &args->target, args->targetLimit,
822 &args->offsets, -1,
823 err);
824 args->converter->fromUnicodeStatus=0;
825 }
826
827 myTarget = (unsigned char *) args->target;
828 myOffsets = args->offsets;
829 temp[3] = 0;
830
831 if (args->converter->fromUChar32)
832 {
833 ch = args->converter->fromUChar32;
834 args->converter->fromUChar32 = 0;
835 goto lowsurogate;
836 }
837
838 while (mySource < sourceLimit && myTarget < targetLimit)
839 {
840 ch = *(mySource++);
841
842 if (U16_IS_SURROGATE(ch)) {
843 if (U16_IS_LEAD(ch))
844 {
845 lowsurogate:
846 if (mySource < sourceLimit)
847 {
848 ch2 = *mySource;
849 if (U16_IS_TRAIL(ch2))
850 {
851 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
852 mySource++;
853 }
854 else {
855 /* this is an unmatched trail code unit (2nd surrogate) */
856 /* callback(illegal) */
857 args->converter->fromUChar32 = ch;
858 *err = U_ILLEGAL_CHAR_FOUND;
859 break;
860 }
861 }
862 else {
863 /* ran out of source */
864 args->converter->fromUChar32 = ch;
865 if (args->flush) {
866 /* this is an unmatched trail code unit (2nd surrogate) */
867 /* callback(illegal) */
868 *err = U_ILLEGAL_CHAR_FOUND;
869 }
870 break;
871 }
872 }
873 else {
874 /* this is an unmatched trail code unit (2nd surrogate) */
875 /* callback(illegal) */
876 args->converter->fromUChar32 = ch;
877 *err = U_ILLEGAL_CHAR_FOUND;
878 break;
879 }
880 }
881
882 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
883 temp[2] = (uint8_t) (ch >> 16 & 0x1F);
884 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
885 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
886
887 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
888 {
889 if (myTarget < targetLimit)
890 {
891 *(myTarget++) = temp[indexToWrite];
892 *(myOffsets++) = offsetNum;
893 }
894 else
895 {
896 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
897 *err = U_BUFFER_OVERFLOW_ERROR;
898 }
899 }
900 offsetNum = offsetNum + 1 + (temp[2] != 0);
901 }
902
903 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
904 {
905 *err = U_BUFFER_OVERFLOW_ERROR;
906 }
907
908 args->target = (char *) myTarget;
909 args->source = mySource;
910 args->offsets = myOffsets;
911 }
912
913 static UChar32
914 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
915 UErrorCode* err)
916 {
917 const uint8_t *mySource;
918 UChar32 myUChar;
919 int32_t length;
920
921 mySource = (const uint8_t *)args->source;
922 if (mySource >= (const uint8_t *)args->sourceLimit)
923 {
924 /* no input */
925 *err = U_INDEX_OUTOFBOUNDS_ERROR;
926 return 0xffff;
927 }
928
929 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
930 if (length < 4)
931 {
932 /* got a partial character */
933 uprv_memcpy(args->converter->toUBytes, mySource, length);
934 args->converter->toULength = (int8_t)length;
935 args->source = (const char *)(mySource + length);
936 *err = U_TRUNCATED_CHAR_FOUND;
937 return 0xffff;
938 }
939
940 /* Don't even try to do a direct cast because the value may be on an odd address. */
941 myUChar = ((UChar32)mySource[3] << 24)
942 | ((UChar32)mySource[2] << 16)
943 | ((UChar32)mySource[1] << 8)
944 | ((UChar32)mySource[0]);
945
946 args->source = (const char *)(mySource + 4);
947 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
948 return myUChar;
949 }
950
951 uprv_memcpy(args->converter->toUBytes, mySource, 4);
952 args->converter->toULength = 4;
953
954 *err = U_ILLEGAL_CHAR_FOUND;
955 return 0xffff;
956 }
957
958 static const UConverterImpl _UTF32LEImpl = {
959 UCNV_UTF32_LittleEndian,
960
961 NULL,
962 NULL,
963
964 NULL,
965 NULL,
966 NULL,
967
968 T_UConverter_toUnicode_UTF32_LE,
969 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
970 T_UConverter_fromUnicode_UTF32_LE,
971 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
972 T_UConverter_getNextUChar_UTF32_LE,
973
974 NULL,
975 NULL,
976 NULL,
977 NULL,
978 ucnv_getNonSurrogateUnicodeSet
979 };
980
981 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
982 static const UConverterStaticData _UTF32LEStaticData = {
983 sizeof(UConverterStaticData),
984 "UTF-32LE",
985 1234,
986 UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
987 { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
988 0,
989 0,
990 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
991 };
992
993
994 const UConverterSharedData _UTF32LEData = {
995 sizeof(UConverterSharedData), ~((uint32_t) 0),
996 NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl,
997 0
998 };
999
1000 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
1001
1002 /*
1003 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
1004 * accordingly.
1005 *
1006 * State values:
1007 * 0 initial state
1008 * 1 saw 00
1009 * 2 saw 00 00
1010 * 3 saw 00 00 FE
1011 * 4 -
1012 * 5 saw FF
1013 * 6 saw FF FE
1014 * 7 saw FF FE 00
1015 * 8 UTF-32BE mode
1016 * 9 UTF-32LE mode
1017 *
1018 * During detection: state&3==number of matching bytes so far.
1019 *
1020 * On output, emit U+FEFF as the first code point.
1021 */
1022
1023 static void
1024 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
1025 if(choice<=UCNV_RESET_TO_UNICODE) {
1026 /* reset toUnicode: state=0 */
1027 cnv->mode=0;
1028 }
1029 if(choice!=UCNV_RESET_TO_UNICODE) {
1030 /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1031 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1032 }
1033 }
1034
1035 static void
1036 _UTF32Open(UConverter *cnv,
1037 UConverterLoadArgs *pArgs,
1038 UErrorCode *pErrorCode) {
1039 _UTF32Reset(cnv, UCNV_RESET_BOTH);
1040 }
1041
1042 static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff, (char)0xff, (char)0xfe, 0, 0 };
1043
1044 static void
1045 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1046 UErrorCode *pErrorCode) {
1047 UConverter *cnv=pArgs->converter;
1048 const char *source=pArgs->source;
1049 const char *sourceLimit=pArgs->sourceLimit;
1050 int32_t *offsets=pArgs->offsets;
1051
1052 int32_t state, offsetDelta;
1053 char b;
1054
1055 state=cnv->mode;
1056
1057 /*
1058 * If we detect a BOM in this buffer, then we must add the BOM size to the
1059 * offsets because the actual converter function will not see and count the BOM.
1060 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1061 */
1062 offsetDelta=0;
1063
1064 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1065 switch(state) {
1066 case 0:
1067 b=*source;
1068 if(b==0) {
1069 state=1; /* could be 00 00 FE FF */
1070 } else if(b==(char)0xff) {
1071 state=5; /* could be FF FE 00 00 */
1072 } else {
1073 state=8; /* default to UTF-32BE */
1074 continue;
1075 }
1076 ++source;
1077 break;
1078 case 1:
1079 case 2:
1080 case 3:
1081 case 5:
1082 case 6:
1083 case 7:
1084 if(*source==utf32BOM[state]) {
1085 ++state;
1086 ++source;
1087 if(state==4) {
1088 state=8; /* detect UTF-32BE */
1089 offsetDelta=(int32_t)(source-pArgs->source);
1090 } else if(state==8) {
1091 state=9; /* detect UTF-32LE */
1092 offsetDelta=(int32_t)(source-pArgs->source);
1093 }
1094 } else {
1095 /* switch to UTF-32BE and pass the previous bytes */
1096 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
1097
1098 /* reset the source */
1099 source=pArgs->source;
1100
1101 if(count==(state&3)) {
1102 /* simple: all in the same buffer, just reset source */
1103 } else {
1104 UBool oldFlush=pArgs->flush;
1105
1106 /* some of the bytes are from a previous buffer, replay those first */
1107 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1108 pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
1109 pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
1110
1111 /* no offsets: bytes from previous buffer, and not enough for output */
1112 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1113
1114 /* restore real pointers; pArgs->source will be set in case 8/9 */
1115 pArgs->sourceLimit=sourceLimit;
1116 pArgs->flush=oldFlush;
1117 }
1118 state=8;
1119 continue;
1120 }
1121 break;
1122 case 8:
1123 /* call UTF-32BE */
1124 pArgs->source=source;
1125 if(offsets==NULL) {
1126 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1127 } else {
1128 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
1129 }
1130 source=pArgs->source;
1131 break;
1132 case 9:
1133 /* call UTF-32LE */
1134 pArgs->source=source;
1135 if(offsets==NULL) {
1136 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1137 } else {
1138 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
1139 }
1140 source=pArgs->source;
1141 break;
1142 default:
1143 break; /* does not occur */
1144 }
1145 }
1146
1147 /* add BOM size to offsets - see comment at offsetDelta declaration */
1148 if(offsets!=NULL && offsetDelta!=0) {
1149 int32_t *offsetsLimit=pArgs->offsets;
1150 while(offsets<offsetsLimit) {
1151 *offsets++ += offsetDelta;
1152 }
1153 }
1154
1155 pArgs->source=source;
1156
1157 if(source==sourceLimit && pArgs->flush) {
1158 /* handle truncated input */
1159 switch(state) {
1160 case 0:
1161 break; /* no input at all, nothing to do */
1162 case 8:
1163 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1164 break;
1165 case 9:
1166 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1167 break;
1168 default:
1169 /* handle 0<state<8: call UTF-32BE with too-short input */
1170 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1171 pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
1172
1173 /* no offsets: not enough for output */
1174 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1175 pArgs->source=source;
1176 pArgs->sourceLimit=sourceLimit;
1177 state=8;
1178 break;
1179 }
1180 }
1181
1182 cnv->mode=state;
1183 }
1184
1185 static UChar32
1186 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
1187 UErrorCode *pErrorCode) {
1188 switch(pArgs->converter->mode) {
1189 case 8:
1190 return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
1191 case 9:
1192 return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
1193 default:
1194 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1195 }
1196 }
1197
1198 static const UConverterImpl _UTF32Impl = {
1199 UCNV_UTF32,
1200
1201 NULL,
1202 NULL,
1203
1204 _UTF32Open,
1205 NULL,
1206 _UTF32Reset,
1207
1208 _UTF32ToUnicodeWithOffsets,
1209 _UTF32ToUnicodeWithOffsets,
1210 #if U_IS_BIG_ENDIAN
1211 T_UConverter_fromUnicode_UTF32_BE,
1212 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
1213 #else
1214 T_UConverter_fromUnicode_UTF32_LE,
1215 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
1216 #endif
1217 _UTF32GetNextUChar,
1218
1219 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1220 NULL,
1221 NULL,
1222 NULL,
1223 ucnv_getNonSurrogateUnicodeSet
1224 };
1225
1226 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 */
1227 static const UConverterStaticData _UTF32StaticData = {
1228 sizeof(UConverterStaticData),
1229 "UTF-32",
1230 1236,
1231 UCNV_IBM, UCNV_UTF32, 4, 4,
1232 #if U_IS_BIG_ENDIAN
1233 { 0, 0, 0xff, 0xfd }, 4,
1234 #else
1235 { 0xfd, 0xff, 0, 0 }, 4,
1236 #endif
1237 FALSE, FALSE,
1238 0,
1239 0,
1240 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1241 };
1242
1243 const UConverterSharedData _UTF32Data = {
1244 sizeof(UConverterSharedData), ~((uint32_t) 0),
1245 NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl,
1246 0
1247 };
1248
1249 #endif