]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucnv_u32.c
ICU-8.11.1.tar.gz
[apple/icu.git] / icuSources / common / ucnv_u32.c
1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u32.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
13 *
14 * UTF-32 converter implementation. Used to be in ucnv_utf.c.
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_CONVERSION
20
21 #include "unicode/ucnv.h"
22 #include "ucnv_bld.h"
23 #include "ucnv_cnv.h"
24 #include "cmemory.h"
25
/* Largest value that is written as a single 16-bit unit by the converters below. */
#define MAXIMUM_UCS2            0x0000FFFF
/* Largest value accepted as a valid code point by the converters below. */
#define MAXIMUM_UTF             0x0010FFFF
/* Number of bits the lead surrogate is shifted by when a pair is combined. */
#define HALF_SHIFT              10
#define HALF_BASE               0x0010000
#define HALF_MASK               0x3FF
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_LOW_START     0xDC00

/*
 * Bias added when combining a surrogate pair:
 *   ((lead - SURROGATE_HIGH_START) << HALF_SHIFT) + trail + SURROGATE_LOW_BASE
 * Derived from the other constants instead of being spelled as a literal;
 * the value is 0x10000 - 0xDC00 == 9216.
 */
#define SURROGATE_LOW_BASE      (HALF_BASE - SURROGATE_LOW_START)

enum {
    /* value of fromUnicodeStatus that requests a BOM before the first output */
    UCNV_NEED_TO_WRITE_BOM=1
};
40
41 /* UTF-32BE ----------------------------------------------------------------- */
42
43 static void
44 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
45 UErrorCode * err)
46 {
47 const unsigned char *mySource = (unsigned char *) args->source;
48 UChar *myTarget = args->target;
49 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
50 const UChar *targetLimit = args->targetLimit;
51 unsigned char *toUBytes = args->converter->toUBytes;
52 uint32_t ch, i;
53
54 /* Restore state of current sequence */
55 if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
56 i = args->converter->toULength; /* restore # of bytes consumed */
57 args->converter->toULength = 0;
58
59 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
60 args->converter->toUnicodeStatus = 0;
61 goto morebytes;
62 }
63
64 while (mySource < sourceLimit && myTarget < targetLimit) {
65 i = 0;
66 ch = 0;
67 morebytes:
68 while (i < sizeof(uint32_t)) {
69 if (mySource < sourceLimit) {
70 ch = (ch << 8) | (uint8_t)(*mySource);
71 toUBytes[i++] = (char) *(mySource++);
72 }
73 else {
74 /* stores a partially calculated target*/
75 /* + 1 to make 0 a valid character */
76 args->converter->toUnicodeStatus = ch + 1;
77 args->converter->toULength = (int8_t) i;
78 goto donefornow;
79 }
80 }
81
82 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
83 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
84 if (ch <= MAXIMUM_UCS2)
85 {
86 /* fits in 16 bits */
87 *(myTarget++) = (UChar) ch;
88 }
89 else {
90 /* write out the surrogates */
91 *(myTarget++) = U16_LEAD(ch);
92 ch = U16_TRAIL(ch);
93 if (myTarget < targetLimit) {
94 *(myTarget++) = (UChar)ch;
95 }
96 else {
97 /* Put in overflow buffer (not handled here) */
98 args->converter->UCharErrorBuffer[0] = (UChar) ch;
99 args->converter->UCharErrorBufferLength = 1;
100 *err = U_BUFFER_OVERFLOW_ERROR;
101 break;
102 }
103 }
104 }
105 else {
106 args->converter->toULength = (int8_t)i;
107 *err = U_ILLEGAL_CHAR_FOUND;
108 break;
109 }
110 }
111
112 donefornow:
113 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
114 /* End of target buffer */
115 *err = U_BUFFER_OVERFLOW_ERROR;
116 }
117
118 args->target = myTarget;
119 args->source = (const char *) mySource;
120 }
121
122 static void
123 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
124 UErrorCode * err)
125 {
126 const unsigned char *mySource = (unsigned char *) args->source;
127 UChar *myTarget = args->target;
128 int32_t *myOffsets = args->offsets;
129 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
130 const UChar *targetLimit = args->targetLimit;
131 unsigned char *toUBytes = args->converter->toUBytes;
132 uint32_t ch, i;
133 int32_t offsetNum = 0;
134
135 /* Restore state of current sequence */
136 if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
137 i = args->converter->toULength; /* restore # of bytes consumed */
138 args->converter->toULength = 0;
139
140 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
141 args->converter->toUnicodeStatus = 0;
142 goto morebytes;
143 }
144
145 while (mySource < sourceLimit && myTarget < targetLimit) {
146 i = 0;
147 ch = 0;
148 morebytes:
149 while (i < sizeof(uint32_t)) {
150 if (mySource < sourceLimit) {
151 ch = (ch << 8) | (uint8_t)(*mySource);
152 toUBytes[i++] = (char) *(mySource++);
153 }
154 else {
155 /* stores a partially calculated target*/
156 /* + 1 to make 0 a valid character */
157 args->converter->toUnicodeStatus = ch + 1;
158 args->converter->toULength = (int8_t) i;
159 goto donefornow;
160 }
161 }
162
163 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
164 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
165 if (ch <= MAXIMUM_UCS2) {
166 /* fits in 16 bits */
167 *(myTarget++) = (UChar) ch;
168 *(myOffsets++) = offsetNum;
169 }
170 else {
171 /* write out the surrogates */
172 *(myTarget++) = U16_LEAD(ch);
173 *myOffsets++ = offsetNum;
174 ch = U16_TRAIL(ch);
175 if (myTarget < targetLimit)
176 {
177 *(myTarget++) = (UChar)ch;
178 *(myOffsets++) = offsetNum;
179 }
180 else {
181 /* Put in overflow buffer (not handled here) */
182 args->converter->UCharErrorBuffer[0] = (UChar) ch;
183 args->converter->UCharErrorBufferLength = 1;
184 *err = U_BUFFER_OVERFLOW_ERROR;
185 break;
186 }
187 }
188 }
189 else {
190 args->converter->toULength = (int8_t)i;
191 *err = U_ILLEGAL_CHAR_FOUND;
192 break;
193 }
194 offsetNum += i;
195 }
196
197 donefornow:
198 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
199 {
200 /* End of target buffer */
201 *err = U_BUFFER_OVERFLOW_ERROR;
202 }
203
204 args->target = myTarget;
205 args->source = (const char *) mySource;
206 args->offsets = myOffsets;
207 }
208
209 static void
210 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
211 UErrorCode * err)
212 {
213 const UChar *mySource = args->source;
214 unsigned char *myTarget;
215 const UChar *sourceLimit = args->sourceLimit;
216 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
217 UChar32 ch, ch2;
218 unsigned int indexToWrite;
219 unsigned char temp[sizeof(uint32_t)];
220
221 if(mySource >= sourceLimit) {
222 /* no input, nothing to do */
223 return;
224 }
225
226 /* write the BOM if necessary */
227 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
228 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
229 ucnv_fromUWriteBytes(args->converter,
230 bom, 4,
231 &args->target, args->targetLimit,
232 &args->offsets, -1,
233 err);
234 args->converter->fromUnicodeStatus=0;
235 }
236
237 myTarget = (unsigned char *) args->target;
238 temp[0] = 0;
239
240 if (args->converter->fromUChar32) {
241 ch = args->converter->fromUChar32;
242 args->converter->fromUChar32 = 0;
243 goto lowsurogate;
244 }
245
246 while (mySource < sourceLimit && myTarget < targetLimit) {
247 ch = *(mySource++);
248
249 if (UTF_IS_SURROGATE(ch)) {
250 if (U_IS_LEAD(ch)) {
251 lowsurogate:
252 if (mySource < sourceLimit) {
253 ch2 = *mySource;
254 if (U_IS_TRAIL(ch2)) {
255 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
256 mySource++;
257 }
258 else {
259 /* this is an unmatched trail code unit (2nd surrogate) */
260 /* callback(illegal) */
261 args->converter->fromUChar32 = ch;
262 *err = U_ILLEGAL_CHAR_FOUND;
263 break;
264 }
265 }
266 else {
267 /* ran out of source */
268 args->converter->fromUChar32 = ch;
269 if (args->flush) {
270 /* this is an unmatched trail code unit (2nd surrogate) */
271 /* callback(illegal) */
272 *err = U_ILLEGAL_CHAR_FOUND;
273 }
274 break;
275 }
276 }
277 else {
278 /* this is an unmatched trail code unit (2nd surrogate) */
279 /* callback(illegal) */
280 args->converter->fromUChar32 = ch;
281 *err = U_ILLEGAL_CHAR_FOUND;
282 break;
283 }
284 }
285
286 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
287 temp[1] = (uint8_t) (ch >> 16 & 0x1F);
288 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
289 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
290
291 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
292 if (myTarget < targetLimit) {
293 *(myTarget++) = temp[indexToWrite];
294 }
295 else {
296 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
297 *err = U_BUFFER_OVERFLOW_ERROR;
298 }
299 }
300 }
301
302 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
303 *err = U_BUFFER_OVERFLOW_ERROR;
304 }
305
306 args->target = (char *) myTarget;
307 args->source = mySource;
308 }
309
310 static void
311 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
312 UErrorCode * err)
313 {
314 const UChar *mySource = args->source;
315 unsigned char *myTarget;
316 int32_t *myOffsets;
317 const UChar *sourceLimit = args->sourceLimit;
318 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
319 UChar32 ch, ch2;
320 int32_t offsetNum = 0;
321 unsigned int indexToWrite;
322 unsigned char temp[sizeof(uint32_t)];
323
324 if(mySource >= sourceLimit) {
325 /* no input, nothing to do */
326 return;
327 }
328
329 /* write the BOM if necessary */
330 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
331 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
332 ucnv_fromUWriteBytes(args->converter,
333 bom, 4,
334 &args->target, args->targetLimit,
335 &args->offsets, -1,
336 err);
337 args->converter->fromUnicodeStatus=0;
338 }
339
340 myTarget = (unsigned char *) args->target;
341 myOffsets = args->offsets;
342 temp[0] = 0;
343
344 if (args->converter->fromUChar32) {
345 ch = args->converter->fromUChar32;
346 args->converter->fromUChar32 = 0;
347 goto lowsurogate;
348 }
349
350 while (mySource < sourceLimit && myTarget < targetLimit) {
351 ch = *(mySource++);
352
353 if (UTF_IS_SURROGATE(ch)) {
354 if (U_IS_LEAD(ch)) {
355 lowsurogate:
356 if (mySource < sourceLimit) {
357 ch2 = *mySource;
358 if (U_IS_TRAIL(ch2)) {
359 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
360 mySource++;
361 }
362 else {
363 /* this is an unmatched trail code unit (2nd surrogate) */
364 /* callback(illegal) */
365 args->converter->fromUChar32 = ch;
366 *err = U_ILLEGAL_CHAR_FOUND;
367 break;
368 }
369 }
370 else {
371 /* ran out of source */
372 args->converter->fromUChar32 = ch;
373 if (args->flush) {
374 /* this is an unmatched trail code unit (2nd surrogate) */
375 /* callback(illegal) */
376 *err = U_ILLEGAL_CHAR_FOUND;
377 }
378 break;
379 }
380 }
381 else {
382 /* this is an unmatched trail code unit (2nd surrogate) */
383 /* callback(illegal) */
384 args->converter->fromUChar32 = ch;
385 *err = U_ILLEGAL_CHAR_FOUND;
386 break;
387 }
388 }
389
390 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
391 temp[1] = (uint8_t) (ch >> 16 & 0x1F);
392 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
393 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
394
395 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
396 if (myTarget < targetLimit) {
397 *(myTarget++) = temp[indexToWrite];
398 *(myOffsets++) = offsetNum;
399 }
400 else {
401 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
402 *err = U_BUFFER_OVERFLOW_ERROR;
403 }
404 }
405 offsetNum = offsetNum + 1 + (temp[1] != 0);
406 }
407
408 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
409 *err = U_BUFFER_OVERFLOW_ERROR;
410 }
411
412 args->target = (char *) myTarget;
413 args->source = mySource;
414 args->offsets = myOffsets;
415 }
416
417 static UChar32
418 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
419 UErrorCode* err)
420 {
421 const uint8_t *mySource;
422 UChar32 myUChar;
423 int32_t length;
424
425 mySource = (const uint8_t *)args->source;
426 if (mySource >= (const uint8_t *)args->sourceLimit)
427 {
428 /* no input */
429 *err = U_INDEX_OUTOFBOUNDS_ERROR;
430 return 0xffff;
431 }
432
433 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
434 if (length < 4)
435 {
436 /* got a partial character */
437 uprv_memcpy(args->converter->toUBytes, mySource, length);
438 args->converter->toULength = (int8_t)length;
439 args->source = (const char *)(mySource + length);
440 *err = U_TRUNCATED_CHAR_FOUND;
441 return 0xffff;
442 }
443
444 /* Don't even try to do a direct cast because the value may be on an odd address. */
445 myUChar = ((UChar32)mySource[0] << 24)
446 | ((UChar32)mySource[1] << 16)
447 | ((UChar32)mySource[2] << 8)
448 | ((UChar32)mySource[3]);
449
450 args->source = (const char *)(mySource + 4);
451 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
452 return myUChar;
453 }
454
455 uprv_memcpy(args->converter->toUBytes, mySource, 4);
456 args->converter->toULength = 4;
457
458 *err = U_ILLEGAL_CHAR_FOUND;
459 return 0xffff;
460 }
461
462 static const UConverterImpl _UTF32BEImpl = {
463 UCNV_UTF32_BigEndian,
464
465 NULL,
466 NULL,
467
468 NULL,
469 NULL,
470 NULL,
471
472 T_UConverter_toUnicode_UTF32_BE,
473 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
474 T_UConverter_fromUnicode_UTF32_BE,
475 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
476 T_UConverter_getNextUChar_UTF32_BE,
477
478 NULL,
479 NULL,
480 NULL,
481 NULL,
482 ucnv_getNonSurrogateUnicodeSet
483 };
484
485 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
486 static const UConverterStaticData _UTF32BEStaticData = {
487 sizeof(UConverterStaticData),
488 "UTF-32BE",
489 1232,
490 UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
491 { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
492 0,
493 0,
494 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
495 };
496
497 const UConverterSharedData _UTF32BEData = {
498 sizeof(UConverterSharedData), ~((uint32_t) 0),
499 NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl,
500 0
501 };
502
503 /* UTF-32LE ---------------------------------------------------------- */
504
505 static void
506 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
507 UErrorCode * err)
508 {
509 const unsigned char *mySource = (unsigned char *) args->source;
510 UChar *myTarget = args->target;
511 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
512 const UChar *targetLimit = args->targetLimit;
513 unsigned char *toUBytes = args->converter->toUBytes;
514 uint32_t ch, i;
515
516 /* Restore state of current sequence */
517 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
518 {
519 i = args->converter->toULength; /* restore # of bytes consumed */
520 args->converter->toULength = 0;
521
522 /* Stores the previously calculated ch from a previous call*/
523 ch = args->converter->toUnicodeStatus - 1;
524 args->converter->toUnicodeStatus = 0;
525 goto morebytes;
526 }
527
528 while (mySource < sourceLimit && myTarget < targetLimit)
529 {
530 i = 0;
531 ch = 0;
532 morebytes:
533 while (i < sizeof(uint32_t))
534 {
535 if (mySource < sourceLimit)
536 {
537 ch |= ((uint8_t)(*mySource)) << (i * 8);
538 toUBytes[i++] = (char) *(mySource++);
539 }
540 else
541 {
542 /* stores a partially calculated target*/
543 /* + 1 to make 0 a valid character */
544 args->converter->toUnicodeStatus = ch + 1;
545 args->converter->toULength = (int8_t) i;
546 goto donefornow;
547 }
548 }
549
550 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
551 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
552 if (ch <= MAXIMUM_UCS2) {
553 /* fits in 16 bits */
554 *(myTarget++) = (UChar) ch;
555 }
556 else {
557 /* write out the surrogates */
558 *(myTarget++) = U16_LEAD(ch);
559 ch = U16_TRAIL(ch);
560 if (myTarget < targetLimit) {
561 *(myTarget++) = (UChar)ch;
562 }
563 else {
564 /* Put in overflow buffer (not handled here) */
565 args->converter->UCharErrorBuffer[0] = (UChar) ch;
566 args->converter->UCharErrorBufferLength = 1;
567 *err = U_BUFFER_OVERFLOW_ERROR;
568 break;
569 }
570 }
571 }
572 else {
573 args->converter->toULength = (int8_t)i;
574 *err = U_ILLEGAL_CHAR_FOUND;
575 break;
576 }
577 }
578
579 donefornow:
580 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
581 {
582 /* End of target buffer */
583 *err = U_BUFFER_OVERFLOW_ERROR;
584 }
585
586 args->target = myTarget;
587 args->source = (const char *) mySource;
588 }
589
590 static void
591 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
592 UErrorCode * err)
593 {
594 const unsigned char *mySource = (unsigned char *) args->source;
595 UChar *myTarget = args->target;
596 int32_t *myOffsets = args->offsets;
597 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
598 const UChar *targetLimit = args->targetLimit;
599 unsigned char *toUBytes = args->converter->toUBytes;
600 uint32_t ch, i;
601 int32_t offsetNum = 0;
602
603 /* Restore state of current sequence */
604 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
605 {
606 i = args->converter->toULength; /* restore # of bytes consumed */
607 args->converter->toULength = 0;
608
609 /* Stores the previously calculated ch from a previous call*/
610 ch = args->converter->toUnicodeStatus - 1;
611 args->converter->toUnicodeStatus = 0;
612 goto morebytes;
613 }
614
615 while (mySource < sourceLimit && myTarget < targetLimit)
616 {
617 i = 0;
618 ch = 0;
619 morebytes:
620 while (i < sizeof(uint32_t))
621 {
622 if (mySource < sourceLimit)
623 {
624 ch |= ((uint8_t)(*mySource)) << (i * 8);
625 toUBytes[i++] = (char) *(mySource++);
626 }
627 else
628 {
629 /* stores a partially calculated target*/
630 /* + 1 to make 0 a valid character */
631 args->converter->toUnicodeStatus = ch + 1;
632 args->converter->toULength = (int8_t) i;
633 goto donefornow;
634 }
635 }
636
637 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
638 {
639 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
640 if (ch <= MAXIMUM_UCS2)
641 {
642 /* fits in 16 bits */
643 *(myTarget++) = (UChar) ch;
644 *(myOffsets++) = offsetNum;
645 }
646 else {
647 /* write out the surrogates */
648 *(myTarget++) = U16_LEAD(ch);
649 *(myOffsets++) = offsetNum;
650 ch = U16_TRAIL(ch);
651 if (myTarget < targetLimit)
652 {
653 *(myTarget++) = (UChar)ch;
654 *(myOffsets++) = offsetNum;
655 }
656 else
657 {
658 /* Put in overflow buffer (not handled here) */
659 args->converter->UCharErrorBuffer[0] = (UChar) ch;
660 args->converter->UCharErrorBufferLength = 1;
661 *err = U_BUFFER_OVERFLOW_ERROR;
662 break;
663 }
664 }
665 }
666 else
667 {
668 args->converter->toULength = (int8_t)i;
669 *err = U_ILLEGAL_CHAR_FOUND;
670 break;
671 }
672 offsetNum += i;
673 }
674
675 donefornow:
676 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
677 {
678 /* End of target buffer */
679 *err = U_BUFFER_OVERFLOW_ERROR;
680 }
681
682 args->target = myTarget;
683 args->source = (const char *) mySource;
684 args->offsets = myOffsets;
685 }
686
687 static void
688 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
689 UErrorCode * err)
690 {
691 const UChar *mySource = args->source;
692 unsigned char *myTarget;
693 const UChar *sourceLimit = args->sourceLimit;
694 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
695 UChar32 ch, ch2;
696 unsigned int indexToWrite;
697 unsigned char temp[sizeof(uint32_t)];
698
699 if(mySource >= sourceLimit) {
700 /* no input, nothing to do */
701 return;
702 }
703
704 /* write the BOM if necessary */
705 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
706 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
707 ucnv_fromUWriteBytes(args->converter,
708 bom, 4,
709 &args->target, args->targetLimit,
710 &args->offsets, -1,
711 err);
712 args->converter->fromUnicodeStatus=0;
713 }
714
715 myTarget = (unsigned char *) args->target;
716 temp[3] = 0;
717
718 if (args->converter->fromUChar32)
719 {
720 ch = args->converter->fromUChar32;
721 args->converter->fromUChar32 = 0;
722 goto lowsurogate;
723 }
724
725 while (mySource < sourceLimit && myTarget < targetLimit)
726 {
727 ch = *(mySource++);
728
729 if (UTF_IS_SURROGATE(ch)) {
730 if (U_IS_LEAD(ch))
731 {
732 lowsurogate:
733 if (mySource < sourceLimit)
734 {
735 ch2 = *mySource;
736 if (U_IS_TRAIL(ch2)) {
737 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
738 mySource++;
739 }
740 else {
741 /* this is an unmatched trail code unit (2nd surrogate) */
742 /* callback(illegal) */
743 args->converter->fromUChar32 = ch;
744 *err = U_ILLEGAL_CHAR_FOUND;
745 break;
746 }
747 }
748 else {
749 /* ran out of source */
750 args->converter->fromUChar32 = ch;
751 if (args->flush) {
752 /* this is an unmatched trail code unit (2nd surrogate) */
753 /* callback(illegal) */
754 *err = U_ILLEGAL_CHAR_FOUND;
755 }
756 break;
757 }
758 }
759 else {
760 /* this is an unmatched trail code unit (2nd surrogate) */
761 /* callback(illegal) */
762 args->converter->fromUChar32 = ch;
763 *err = U_ILLEGAL_CHAR_FOUND;
764 break;
765 }
766 }
767
768 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
769 temp[2] = (uint8_t) (ch >> 16 & 0x1F);
770 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
771 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
772
773 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
774 {
775 if (myTarget < targetLimit)
776 {
777 *(myTarget++) = temp[indexToWrite];
778 }
779 else
780 {
781 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
782 *err = U_BUFFER_OVERFLOW_ERROR;
783 }
784 }
785 }
786
787 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
788 {
789 *err = U_BUFFER_OVERFLOW_ERROR;
790 }
791
792 args->target = (char *) myTarget;
793 args->source = mySource;
794 }
795
796 static void
797 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
798 UErrorCode * err)
799 {
800 const UChar *mySource = args->source;
801 unsigned char *myTarget;
802 int32_t *myOffsets;
803 const UChar *sourceLimit = args->sourceLimit;
804 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
805 UChar32 ch, ch2;
806 unsigned int indexToWrite;
807 unsigned char temp[sizeof(uint32_t)];
808 int32_t offsetNum = 0;
809
810 if(mySource >= sourceLimit) {
811 /* no input, nothing to do */
812 return;
813 }
814
815 /* write the BOM if necessary */
816 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
817 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
818 ucnv_fromUWriteBytes(args->converter,
819 bom, 4,
820 &args->target, args->targetLimit,
821 &args->offsets, -1,
822 err);
823 args->converter->fromUnicodeStatus=0;
824 }
825
826 myTarget = (unsigned char *) args->target;
827 myOffsets = args->offsets;
828 temp[3] = 0;
829
830 if (args->converter->fromUChar32)
831 {
832 ch = args->converter->fromUChar32;
833 args->converter->fromUChar32 = 0;
834 goto lowsurogate;
835 }
836
837 while (mySource < sourceLimit && myTarget < targetLimit)
838 {
839 ch = *(mySource++);
840
841 if (UTF_IS_SURROGATE(ch)) {
842 if (U_IS_LEAD(ch))
843 {
844 lowsurogate:
845 if (mySource < sourceLimit)
846 {
847 ch2 = *mySource;
848 if (U_IS_TRAIL(ch2))
849 {
850 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
851 mySource++;
852 }
853 else {
854 /* this is an unmatched trail code unit (2nd surrogate) */
855 /* callback(illegal) */
856 args->converter->fromUChar32 = ch;
857 *err = U_ILLEGAL_CHAR_FOUND;
858 break;
859 }
860 }
861 else {
862 /* ran out of source */
863 args->converter->fromUChar32 = ch;
864 if (args->flush) {
865 /* this is an unmatched trail code unit (2nd surrogate) */
866 /* callback(illegal) */
867 *err = U_ILLEGAL_CHAR_FOUND;
868 }
869 break;
870 }
871 }
872 else {
873 /* this is an unmatched trail code unit (2nd surrogate) */
874 /* callback(illegal) */
875 args->converter->fromUChar32 = ch;
876 *err = U_ILLEGAL_CHAR_FOUND;
877 break;
878 }
879 }
880
881 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
882 temp[2] = (uint8_t) (ch >> 16 & 0x1F);
883 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
884 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
885
886 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
887 {
888 if (myTarget < targetLimit)
889 {
890 *(myTarget++) = temp[indexToWrite];
891 *(myOffsets++) = offsetNum;
892 }
893 else
894 {
895 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
896 *err = U_BUFFER_OVERFLOW_ERROR;
897 }
898 }
899 offsetNum = offsetNum + 1 + (temp[2] != 0);
900 }
901
902 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
903 {
904 *err = U_BUFFER_OVERFLOW_ERROR;
905 }
906
907 args->target = (char *) myTarget;
908 args->source = mySource;
909 args->offsets = myOffsets;
910 }
911
912 static UChar32
913 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
914 UErrorCode* err)
915 {
916 const uint8_t *mySource;
917 UChar32 myUChar;
918 int32_t length;
919
920 mySource = (const uint8_t *)args->source;
921 if (mySource >= (const uint8_t *)args->sourceLimit)
922 {
923 /* no input */
924 *err = U_INDEX_OUTOFBOUNDS_ERROR;
925 return 0xffff;
926 }
927
928 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
929 if (length < 4)
930 {
931 /* got a partial character */
932 uprv_memcpy(args->converter->toUBytes, mySource, length);
933 args->converter->toULength = (int8_t)length;
934 args->source = (const char *)(mySource + length);
935 *err = U_TRUNCATED_CHAR_FOUND;
936 return 0xffff;
937 }
938
939 /* Don't even try to do a direct cast because the value may be on an odd address. */
940 myUChar = ((UChar32)mySource[3] << 24)
941 | ((UChar32)mySource[2] << 16)
942 | ((UChar32)mySource[1] << 8)
943 | ((UChar32)mySource[0]);
944
945 args->source = (const char *)(mySource + 4);
946 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
947 return myUChar;
948 }
949
950 uprv_memcpy(args->converter->toUBytes, mySource, 4);
951 args->converter->toULength = 4;
952
953 *err = U_ILLEGAL_CHAR_FOUND;
954 return 0xffff;
955 }
956
957 static const UConverterImpl _UTF32LEImpl = {
958 UCNV_UTF32_LittleEndian,
959
960 NULL,
961 NULL,
962
963 NULL,
964 NULL,
965 NULL,
966
967 T_UConverter_toUnicode_UTF32_LE,
968 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
969 T_UConverter_fromUnicode_UTF32_LE,
970 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
971 T_UConverter_getNextUChar_UTF32_LE,
972
973 NULL,
974 NULL,
975 NULL,
976 NULL,
977 ucnv_getNonSurrogateUnicodeSet
978 };
979
980 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
981 static const UConverterStaticData _UTF32LEStaticData = {
982 sizeof(UConverterStaticData),
983 "UTF-32LE",
984 1234,
985 UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
986 { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
987 0,
988 0,
989 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
990 };
991
992
993 const UConverterSharedData _UTF32LEData = {
994 sizeof(UConverterSharedData), ~((uint32_t) 0),
995 NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl,
996 0
997 };
998
999 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
1000
1001 /*
1002 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
1003 * accordingly.
1004 *
1005 * State values:
1006 * 0 initial state
1007 * 1 saw 00
1008 * 2 saw 00 00
1009 * 3 saw 00 00 FE
1010 * 4 -
1011 * 5 saw FF
1012 * 6 saw FF FE
1013 * 7 saw FF FE 00
1014 * 8 UTF-32BE mode
1015 * 9 UTF-32LE mode
1016 *
1017 * During detection: state&3==number of matching bytes so far.
1018 *
1019 * On output, emit U+FEFF as the first code point.
1020 */
1021
1022 static void
1023 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
1024 if(choice<=UCNV_RESET_TO_UNICODE) {
1025 /* reset toUnicode: state=0 */
1026 cnv->mode=0;
1027 }
1028 if(choice!=UCNV_RESET_TO_UNICODE) {
1029 /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1030 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1031 }
1032 }
1033
1034 static void
1035 _UTF32Open(UConverter *cnv,
1036 const char *name,
1037 const char *locale,
1038 uint32_t options,
1039 UErrorCode *pErrorCode) {
1040 _UTF32Reset(cnv, UCNV_RESET_BOTH);
1041 }
1042
1043 static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff, (char)0xff, (char)0xfe, 0, 0 };
1044
1045 static void
1046 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1047 UErrorCode *pErrorCode) {
1048 UConverter *cnv=pArgs->converter;
1049 const char *source=pArgs->source;
1050 const char *sourceLimit=pArgs->sourceLimit;
1051 int32_t *offsets=pArgs->offsets;
1052
1053 int32_t state, offsetDelta;
1054 char b;
1055
1056 state=cnv->mode;
1057
1058 /*
1059 * If we detect a BOM in this buffer, then we must add the BOM size to the
1060 * offsets because the actual converter function will not see and count the BOM.
1061 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1062 */
1063 offsetDelta=0;
1064
1065 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1066 switch(state) {
1067 case 0:
1068 b=*source;
1069 if(b==0) {
1070 state=1; /* could be 00 00 FE FF */
1071 } else if(b==(char)0xff) {
1072 state=5; /* could be FF FE 00 00 */
1073 } else {
1074 state=8; /* default to UTF-32BE */
1075 continue;
1076 }
1077 ++source;
1078 break;
1079 case 1:
1080 case 2:
1081 case 3:
1082 case 5:
1083 case 6:
1084 case 7:
1085 if(*source==utf32BOM[state]) {
1086 ++state;
1087 ++source;
1088 if(state==4) {
1089 state=8; /* detect UTF-32BE */
1090 offsetDelta=(int32_t)(source-pArgs->source);
1091 } else if(state==8) {
1092 state=9; /* detect UTF-32LE */
1093 offsetDelta=(int32_t)(source-pArgs->source);
1094 }
1095 } else {
1096 /* switch to UTF-32BE and pass the previous bytes */
1097 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
1098
1099 /* reset the source */
1100 source=pArgs->source;
1101
1102 if(count==(state&3)) {
1103 /* simple: all in the same buffer, just reset source */
1104 } else {
1105 UBool oldFlush=pArgs->flush;
1106
1107 /* some of the bytes are from a previous buffer, replay those first */
1108 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1109 pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
1110 pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
1111
1112 /* no offsets: bytes from previous buffer, and not enough for output */
1113 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1114
1115 /* restore real pointers; pArgs->source will be set in case 8/9 */
1116 pArgs->sourceLimit=sourceLimit;
1117 pArgs->flush=oldFlush;
1118 }
1119 state=8;
1120 continue;
1121 }
1122 break;
1123 case 8:
1124 /* call UTF-32BE */
1125 pArgs->source=source;
1126 if(offsets==NULL) {
1127 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1128 } else {
1129 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
1130 }
1131 source=pArgs->source;
1132 break;
1133 case 9:
1134 /* call UTF-32LE */
1135 pArgs->source=source;
1136 if(offsets==NULL) {
1137 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1138 } else {
1139 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
1140 }
1141 source=pArgs->source;
1142 break;
1143 default:
1144 break; /* does not occur */
1145 }
1146 }
1147
1148 /* add BOM size to offsets - see comment at offsetDelta declaration */
1149 if(offsets!=NULL && offsetDelta!=0) {
1150 int32_t *offsetsLimit=pArgs->offsets;
1151 while(offsets<offsetsLimit) {
1152 *offsets++ += offsetDelta;
1153 }
1154 }
1155
1156 pArgs->source=source;
1157
1158 if(source==sourceLimit && pArgs->flush) {
1159 /* handle truncated input */
1160 switch(state) {
1161 case 0:
1162 break; /* no input at all, nothing to do */
1163 case 8:
1164 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1165 break;
1166 case 9:
1167 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1168 break;
1169 default:
1170 /* handle 0<state<8: call UTF-32BE with too-short input */
1171 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1172 pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
1173
1174 /* no offsets: not enough for output */
1175 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1176 pArgs->source=source;
1177 pArgs->sourceLimit=sourceLimit;
1178 state=8;
1179 break;
1180 }
1181 }
1182
1183 cnv->mode=state;
1184 }
1185
1186 static UChar32
1187 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
1188 UErrorCode *pErrorCode) {
1189 switch(pArgs->converter->mode) {
1190 case 8:
1191 return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
1192 case 9:
1193 return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
1194 default:
1195 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1196 }
1197 }
1198
1199 static const UConverterImpl _UTF32Impl = {
1200 UCNV_UTF32,
1201
1202 NULL,
1203 NULL,
1204
1205 _UTF32Open,
1206 NULL,
1207 _UTF32Reset,
1208
1209 _UTF32ToUnicodeWithOffsets,
1210 _UTF32ToUnicodeWithOffsets,
1211 #if U_IS_BIG_ENDIAN
1212 T_UConverter_fromUnicode_UTF32_BE,
1213 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
1214 #else
1215 T_UConverter_fromUnicode_UTF32_LE,
1216 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
1217 #endif
1218 _UTF32GetNextUChar,
1219
1220 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1221 NULL,
1222 NULL,
1223 NULL,
1224 ucnv_getNonSurrogateUnicodeSet
1225 };
1226
1227 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 */
1228 static const UConverterStaticData _UTF32StaticData = {
1229 sizeof(UConverterStaticData),
1230 "UTF-32",
1231 1236,
1232 UCNV_IBM, UCNV_UTF32, 4, 4,
1233 #if U_IS_BIG_ENDIAN
1234 { 0, 0, 0xff, 0xfd }, 4,
1235 #else
1236 { 0xfd, 0xff, 0, 0 }, 4,
1237 #endif
1238 FALSE, FALSE,
1239 0,
1240 0,
1241 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1242 };
1243
1244 const UConverterSharedData _UTF32Data = {
1245 sizeof(UConverterSharedData), ~((uint32_t) 0),
1246 NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl,
1247 0
1248 };
1249
1250 #endif