2 **********************************************************************
3 * Copyright (C) 2000-2003, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnvlat1.cpp
8 * tab size: 8 (not used)
11 * created on: 2000feb07
12 * created by: Markus W. Scherer
15 #include "unicode/utypes.h"
16 #include "unicode/ucnv.h"
17 #include "unicode/ucnv_err.h"
18 #include "unicode/uset.h"
22 /* control optimizations according to the platform */
23 #define LATIN1_UNROLL_TO_UNICODE 1
24 #define LATIN1_UNROLL_FROM_UNICODE 1
25 #define ASCII_UNROLL_TO_UNICODE 1
27 /* ISO 8859-1 --------------------------------------------------------------- */
29 /* This is a table-less and callback-less version of _MBCSSingleToBMPWithOffsets(). */
/*
 * _Latin1ToUnicodeWithOffsets: ISO 8859-1 bytes to UChars.
 *
 * pArgs      - supplies source, sourceLimit, target, targetLimit and offsets;
 *              the advanced source, target and offsets pointers are written
 *              back into it at the end.
 * pErrorCode - receives U_BUFFER_OVERFLOW_ERROR on the "target will be full"
 *              path.
 *
 * NOTE(review): this listing omits several lines of the function
 * (declarations, braces, the loop bodies); the comments below describe only
 * the statements that are visible.
 */
31 _Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
32 UErrorCode
*pErrorCode
) {
33 const uint8_t *source
;
35 int32_t targetCapacity
, length
;
40 /* set up the local pointers */
41 source
=(const uint8_t *)pArgs
->source
;
/* number of output units between target and targetLimit */
43 targetCapacity
=pArgs
->targetLimit
-pArgs
->target
;
44 offsets
=pArgs
->offsets
;
49 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
50 * for the minimum of the sourceLength and targetCapacity
/* number of source bytes available */
52 length
=(const uint8_t *)pArgs
->sourceLimit
-source
;
/*
 * Clamp the work to min(length, targetCapacity).
 * NOTE(review): the overflow assignment below is presumably the else branch
 * of this test; the else line itself is missing from the listing.
 */
53 if(length
<=targetCapacity
) {
54 targetCapacity
=length
;
56 /* target will be full */
57 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
58 length
=targetCapacity
;
/* unrolled path, compiled only when LATIN1_UNROLL_TO_UNICODE is non-zero */
61 #if LATIN1_UNROLL_TO_UNICODE
62 if(targetCapacity
>=16) {
/* count = number of complete groups of 16 units */
65 loops
=count
=targetCapacity
>>4;
/* keep only the remainder (low 4 bits) in both counters */
66 length
=targetCapacity
&=0xf;
/* record one source index per output unit, 16 per group */
88 *offsets
++=sourceIndex
++;
89 *offsets
++=sourceIndex
++;
90 *offsets
++=sourceIndex
++;
91 *offsets
++=sourceIndex
++;
92 *offsets
++=sourceIndex
++;
93 *offsets
++=sourceIndex
++;
94 *offsets
++=sourceIndex
++;
95 *offsets
++=sourceIndex
++;
96 *offsets
++=sourceIndex
++;
97 *offsets
++=sourceIndex
++;
98 *offsets
++=sourceIndex
++;
99 *offsets
++=sourceIndex
++;
100 *offsets
++=sourceIndex
++;
101 *offsets
++=sourceIndex
++;
102 *offsets
++=sourceIndex
++;
103 *offsets
++=sourceIndex
++;
109 /* conversion loop */
110 while(targetCapacity
>0) {
115 /* write back the updated pointers */
116 pArgs
->source
=(const char *)source
;
117 pArgs
->target
=target
;
/* store the source index for an output unit and advance both */
122 *offsets
++=sourceIndex
++;
125 pArgs
->offsets
=offsets
;
129 /* This is a table-less and callback-less version of _MBCSSingleGetNextUChar(). */
/*
 * _Latin1GetNextUChar: consume one ISO 8859-1 byte from pArgs->source.
 *
 * When at least one byte lies before pArgs->sourceLimit, pArgs->source is
 * advanced by one byte. The "empty input" path stores
 * U_INDEX_OUTOFBOUNDS_ERROR in *pErrorCode.
 *
 * NOTE(review): the return statements and the else line are missing from
 * this listing; presumably the byte value is returned on the first path.
 */
131 _Latin1GetNextUChar(UConverterToUnicodeArgs
*pArgs
,
132 UErrorCode
*pErrorCode
) {
133 const uint8_t *source
=(const uint8_t *)pArgs
->source
;
/* is there at least one input byte? */
134 if(source
<(const uint8_t *)pArgs
->sourceLimit
) {
/* step past the byte that is being consumed */
135 pArgs
->source
=(const char *)(source
+1);
139 /* no output because of empty input */
140 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
144 /* This is a table-less version of _MBCSSingleFromBMPWithOffsets(). */
/*
 * _Latin1FromUnicodeWithOffsets: UChars to single bytes, shared by the
 * Latin-1 and US-ASCII converters (both implementation tables in this file
 * list this function).
 *
 * pArgs      - supplies converter, source, sourceLimit, target, targetLimit,
 *              offsets and flush; source, target and offsets are written
 *              back at the end.
 * pErrorCode - set to U_INVALID_CHAR_FOUND, U_ILLEGAL_CHAR_FOUND,
 *              U_BUFFER_OVERFLOW_ERROR or U_TRUNCATED_CHAR_FOUND on the
 *              paths shown below.
 *
 * A pending lead surrogate is carried between calls in
 * cnv->fromUSurrogateLead. Unconvertible input is handed to
 * cnv->fromUCharErrorBehaviour.
 *
 * NOTE(review): this listing omits many lines of the function
 * (declarations, else lines, braces, range tests); the comments below
 * describe only the statements that are visible.
 */
146 _Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
147 UErrorCode
*pErrorCode
) {
149 const UChar
*source
, *sourceLimit
, *lastSource
;
151 int32_t targetCapacity
, length
;
158 UConverterCallbackReason reason
;
161 /* set up the local pointers */
162 cnv
=pArgs
->converter
;
163 source
=pArgs
->source
;
164 sourceLimit
=pArgs
->sourceLimit
;
165 target
=(uint8_t *)pArgs
->target
;
/* number of output bytes between target and targetLimit */
166 targetCapacity
=pArgs
->targetLimit
-pArgs
->target
;
167 offsets
=pArgs
->offsets
;
/*
 * Pick the upper bound by checking which shared data the converter uses.
 * NOTE(review): the else line between the two assignments is missing from
 * the listing.
 */
169 if(cnv
->sharedData
==&_Latin1Data
) {
170 max
=0xff; /* Latin-1 */
172 max
=0x7f; /* US-ASCII */
175 /* get the converter state from UConverter */
176 c
=cnv
->fromUSurrogateLead
;
178 /* sourceIndex=-1 if the current character began in the previous buffer */
179 sourceIndex
= c
==0 ? 0 : -1;
183 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
184 * for the minimum of the sourceLength and targetCapacity
/* number of source code units available */
186 length
=sourceLimit
-source
;
187 if(length
<targetCapacity
) {
188 targetCapacity
=length
;
191 /* conversion loop */
192 if(c
!=0 && targetCapacity
>0) {
196 #if LATIN1_UNROLL_FROM_UNICODE
197 /* unroll the loop with the most common case */
199 if(targetCapacity
>=16) {
200 int32_t count
, loops
;
/* count = number of complete groups of 16 units */
203 loops
=count
=targetCapacity
>>4;
/*
 * Copy 16 code units, truncating each to a byte, and OR them all into
 * oredChars. NOTE(review): the test of oredChars is not visible here;
 * presumably it validates the whole group at once (see the
 * "were all 16 entries really valid?" comment below).
 */
205 oredChars
=u
=*source
++;
206 *target
++=(uint8_t)u
;
207 oredChars
|=u
=*source
++;
208 *target
++=(uint8_t)u
;
209 oredChars
|=u
=*source
++;
210 *target
++=(uint8_t)u
;
211 oredChars
|=u
=*source
++;
212 *target
++=(uint8_t)u
;
213 oredChars
|=u
=*source
++;
214 *target
++=(uint8_t)u
;
215 oredChars
|=u
=*source
++;
216 *target
++=(uint8_t)u
;
217 oredChars
|=u
=*source
++;
218 *target
++=(uint8_t)u
;
219 oredChars
|=u
=*source
++;
220 *target
++=(uint8_t)u
;
221 oredChars
|=u
=*source
++;
222 *target
++=(uint8_t)u
;
223 oredChars
|=u
=*source
++;
224 *target
++=(uint8_t)u
;
225 oredChars
|=u
=*source
++;
226 *target
++=(uint8_t)u
;
227 oredChars
|=u
=*source
++;
228 *target
++=(uint8_t)u
;
229 oredChars
|=u
=*source
++;
230 *target
++=(uint8_t)u
;
231 oredChars
|=u
=*source
++;
232 *target
++=(uint8_t)u
;
233 oredChars
|=u
=*source
++;
234 *target
++=(uint8_t)u
;
235 oredChars
|=u
=*source
++;
236 *target
++=(uint8_t)u
;
238 /* were all 16 entries really valid? */
240 /* no, return to the first of these 16 */
/* subtract the units handled by the unrolled groups */
247 targetCapacity
-=16*count
;
/* move lastSource past the units handled by the unrolled groups */
250 lastSource
+=16*count
;
/* record one source index per output byte, 16 per group */
252 *offsets
++=sourceIndex
++;
253 *offsets
++=sourceIndex
++;
254 *offsets
++=sourceIndex
++;
255 *offsets
++=sourceIndex
++;
256 *offsets
++=sourceIndex
++;
257 *offsets
++=sourceIndex
++;
258 *offsets
++=sourceIndex
++;
259 *offsets
++=sourceIndex
++;
260 *offsets
++=sourceIndex
++;
261 *offsets
++=sourceIndex
++;
262 *offsets
++=sourceIndex
++;
263 *offsets
++=sourceIndex
++;
264 *offsets
++=sourceIndex
++;
265 *offsets
++=sourceIndex
++;
266 *offsets
++=sourceIndex
++;
267 *offsets
++=sourceIndex
++;
276 while(targetCapacity
>0) {
278 * Get a correct Unicode code point:
279 * a single UChar for a BMP code point or
280 * a matched surrogate pair for a "surrogate code point".
284 /* convert the Unicode code point */
285 *target
++=(uint8_t)c
;
288 /* normal end of conversion: prepare for a new character */
/* classify the unconvertible code unit: non-surrogate, lead, or trail */
291 if(!UTF_IS_SURROGATE(c
)) {
292 /* callback(unassigned) */
293 reason
=UCNV_UNASSIGNED
;
294 *pErrorCode
=U_INVALID_CHAR_FOUND
;
295 } else if(UTF_IS_SURROGATE_FIRST(c
)) {
297 if(source
<sourceLimit
) {
298 /* test the following code unit */
300 if(UTF_IS_SECOND_SURROGATE(trail
)) {
/* combine lead and trail into one supplementary code point */
302 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
303 /* this codepage does not map supplementary code points */
304 /* callback(unassigned) */
305 reason
=UCNV_UNASSIGNED
;
306 *pErrorCode
=U_INVALID_CHAR_FOUND
;
308 /* this is an unmatched lead code unit (1st surrogate) */
309 /* callback(illegal) */
311 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
318 /* this is an unmatched trail code unit (2nd surrogate) */
319 /* callback(illegal) */
321 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
324 /* call the callback function with all the preparations and post-processing */
325 /* get the number of code units for c to correctly advance sourceIndex after the callback call */
326 length
=UTF_CHAR_LENGTH(c
);
328 /* set offsets since the start or the last callback */
330 int32_t count
=(int32_t)(source
-lastSource
);
332 /* do not set the offset for the callback-causing character */
336 *offsets
++=sourceIndex
++;
339 /* offset and sourceIndex are now set for the current character */
342 /* update the arguments structure */
343 pArgs
->source
=source
;
344 pArgs
->target
=(char *)target
;
345 pArgs
->offsets
=offsets
;
347 /* set the converter state in UConverter to deal with the next character */
348 cnv
->fromUSurrogateLead
=0;
350 /* write the code point as code units */
352 UTF_APPEND_CHAR_UNSAFE(cnv
->invalidUCharBuffer
, i
, c
);
353 cnv
->invalidUCharLength
=(int8_t)i
;
356 /* call the callback function */
357 cnv
->fromUCharErrorBehaviour(cnv
->fromUContext
, pArgs
, cnv
->invalidUCharBuffer
, i
, c
, reason
, pErrorCode
);
359 /* get the converter state from UConverter */
360 c
=cnv
->fromUSurrogateLead
;
362 /* update target and deal with offsets if necessary */
363 offsets
=ucnv_updateCallbackOffsets(offsets
, ((uint8_t *)pArgs
->target
)-target
, sourceIndex
);
364 target
=(uint8_t *)pArgs
->target
;
366 /* update the source pointer and index */
367 sourceIndex
+=length
+(pArgs
->source
-source
);
368 source
=lastSource
=pArgs
->source
;
/* recompute the counter from the post-callback pointers */
369 targetCapacity
=(uint8_t *)pArgs
->targetLimit
-target
;
370 length
=sourceLimit
-source
;
371 if(length
<targetCapacity
) {
372 targetCapacity
=length
;
376 * If the callback overflowed the target, then we need to
377 * stop here with an overflow indication.
379 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
) {
381 } else if(U_FAILURE(*pErrorCode
)) {
/* the callback left output in the converter's charErrorBuffer */
385 } else if(cnv
->charErrorBufferLength
>0) {
387 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
391 #if LATIN1_UNROLL_FROM_UNICODE
/* input remains but the target has reached targetLimit */
397 if(U_SUCCESS(*pErrorCode
) && source
<sourceLimit
&& target
>=(uint8_t *)pArgs
->targetLimit
) {
399 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
402 /* set offsets since the start or the last callback */
404 size_t count
=source
-lastSource
;
406 *offsets
++=sourceIndex
++;
/* flush requested and all input consumed */
411 if(pArgs
->flush
&& source
>=sourceLimit
) {
412 /* reset the state for the next conversion */
413 if(c
!=0 && U_SUCCESS(*pErrorCode
)) {
414 /* a Unicode code point remains incomplete (only a first surrogate) */
415 *pErrorCode
=U_TRUNCATED_CHAR_FOUND
;
417 cnv
->fromUSurrogateLead
=0;
419 /* set the converter state back into UConverter */
420 cnv
->fromUSurrogateLead
=(UChar
)c
;
423 /* write back the updated pointers */
424 pArgs
->source
=source
;
425 pArgs
->target
=(char *)target
;
426 pArgs
->offsets
=offsets
;
/*
 * _Latin1GetUnicodeSet: adds the code point range 0..0xff to `set` via
 * uset_addRange(). The cnv, which and pErrorCode parameters are not used in
 * the visible lines.
 * NOTE(review): the parameter line declaring `set` is missing from this
 * listing.
 */
430 _Latin1GetUnicodeSet(const UConverter
*cnv
,
432 UConverterUnicodeSet which
,
433 UErrorCode
*pErrorCode
) {
434 uset_addRange(set
, 0, 0xff);
/*
 * Implementation table for the Latin-1 converter.
 * Each conversion direction lists its WithOffsets function twice.
 * NOTE(review): presumably these fill the plain and the with-offsets slots
 * of UConverterImpl; the other initializer entries are not visible in this
 * listing - confirm against the struct definition.
 */
437 static const UConverterImpl _Latin1Impl
={
447 _Latin1ToUnicodeWithOffsets
,
448 _Latin1ToUnicodeWithOffsets
,
449 _Latin1FromUnicodeWithOffsets
,
450 _Latin1FromUnicodeWithOffsets
,
/*
 * Static converter description for Latin-1.
 * NOTE(review): field meanings are not visible in this listing. Presumably
 * 819 is the IBM codepage number (it is paired with UCNV_IBM), the two 1s
 * are the min/max bytes per character, and { 0x1a, ... } with length 1 is
 * the substitution character - confirm against UConverterStaticData.
 */
460 static const UConverterStaticData _Latin1StaticData
={
461 sizeof(UConverterStaticData
),
463 819, UCNV_IBM
, UCNV_LATIN_1
, 1, 1,
464 { 0x1a, 0, 0, 0 }, 1, FALSE
, FALSE
,
467 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared data object for the Latin-1 converter; it points at
 * _Latin1StaticData and _Latin1Impl. _Latin1FromUnicodeWithOffsets compares
 * cnv->sharedData against the address of this object to select the 0xff
 * upper bound.
 * NOTE(review): the meaning of the ~((uint32_t) 0), NULL and FALSE entries
 * is not visible here - confirm against UConverterSharedData.
 */
470 const UConverterSharedData _Latin1Data
={
471 sizeof(UConverterSharedData
), ~((uint32_t) 0),
472 NULL
, NULL
, &_Latin1StaticData
, FALSE
, &_Latin1Impl
,
476 /* US-ASCII ----------------------------------------------------------------- */
478 /* This is a table-less version of _MBCSSingleToBMPWithOffsets(). */
/*
 * _ASCIIToUnicodeWithOffsets: US-ASCII bytes to UChars.
 *
 * pArgs      - supplies converter, source, sourceLimit, target, targetLimit
 *              and offsets; source, target and offsets are written back at
 *              the end.
 * pErrorCode - set to U_ILLEGAL_CHAR_FOUND before the callback is invoked,
 *              and to U_BUFFER_OVERFLOW_ERROR on the overflow paths shown
 *              below.
 *
 * Bytes are copied to the target first and then compared against 0x7f; the
 * "back out the illegal character" path hands the byte to
 * cnv->fromCharErrorBehaviour.
 *
 * NOTE(review): this listing omits many lines of the function
 * (declarations, else lines, braces, range tests); the comments below
 * describe only the statements that are visible.
 */
480 _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
481 UErrorCode
*pErrorCode
) {
482 const uint8_t *source
, *sourceLimit
, *lastSource
;
484 int32_t targetCapacity
, length
;
489 /* set up the local pointers */
490 source
=(const uint8_t *)pArgs
->source
;
491 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
492 target
=pArgs
->target
;
/* number of output units between target and targetLimit */
493 targetCapacity
=pArgs
->targetLimit
-pArgs
->target
;
494 offsets
=pArgs
->offsets
;
496 /* sourceIndex=-1 if the current character began in the previous buffer */
501 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
502 * for the minimum of the sourceLength and targetCapacity
/* number of source bytes available */
504 length
=sourceLimit
-source
;
505 if(length
<targetCapacity
) {
506 targetCapacity
=length
;
509 #if ASCII_UNROLL_TO_UNICODE
510 /* unroll the loop with the most common case */
512 if(targetCapacity
>=16) {
513 int32_t count
, loops
;
/* count = number of complete groups of 16 units */
516 loops
=count
=targetCapacity
>>4;
/*
 * Copy 16 bytes to the target and OR the stored values into oredChars.
 * NOTE(review): the test of oredChars is not visible here; presumably it
 * validates the whole group at once (see the
 * "were all 16 entries really valid?" comment below).
 */
518 oredChars
=*target
++=*source
++;
519 oredChars
|=*target
++=*source
++;
520 oredChars
|=*target
++=*source
++;
521 oredChars
|=*target
++=*source
++;
522 oredChars
|=*target
++=*source
++;
523 oredChars
|=*target
++=*source
++;
524 oredChars
|=*target
++=*source
++;
525 oredChars
|=*target
++=*source
++;
526 oredChars
|=*target
++=*source
++;
527 oredChars
|=*target
++=*source
++;
528 oredChars
|=*target
++=*source
++;
529 oredChars
|=*target
++=*source
++;
530 oredChars
|=*target
++=*source
++;
531 oredChars
|=*target
++=*source
++;
532 oredChars
|=*target
++=*source
++;
533 oredChars
|=*target
++=*source
++;
535 /* were all 16 entries really valid? */
537 /* no, return to the first of these 16 */
/* subtract the units handled by the unrolled groups */
544 targetCapacity
-=16*count
;
/* move lastSource past the units handled by the unrolled groups */
547 lastSource
+=16*count
;
/* record one source index per output unit, 16 per group */
549 *offsets
++=sourceIndex
++;
550 *offsets
++=sourceIndex
++;
551 *offsets
++=sourceIndex
++;
552 *offsets
++=sourceIndex
++;
553 *offsets
++=sourceIndex
++;
554 *offsets
++=sourceIndex
++;
555 *offsets
++=sourceIndex
++;
556 *offsets
++=sourceIndex
++;
557 *offsets
++=sourceIndex
++;
558 *offsets
++=sourceIndex
++;
559 *offsets
++=sourceIndex
++;
560 *offsets
++=sourceIndex
++;
561 *offsets
++=sourceIndex
++;
562 *offsets
++=sourceIndex
++;
563 *offsets
++=sourceIndex
++;
564 *offsets
++=sourceIndex
++;
571 /* conversion loop */
572 while(targetCapacity
>0) {
/* copy one byte, then test the stored value against the 0x7f bound */
573 if((*target
++=*source
++)<=0x7f) {
578 /* back out the illegal character */
581 /* call the callback function with all the preparations and post-processing */
582 cnv
=pArgs
->converter
;
584 /* callback(illegal) */
585 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
587 /* set offsets since the start or the last callback */
589 int32_t count
=(int32_t)(source
-lastSource
);
591 /* predecrement: do not set the offset for the callback-causing character */
593 *offsets
++=sourceIndex
++;
595 /* offset and sourceIndex are now set for the current character */
598 /* update the arguments structure */
599 pArgs
->source
=(const char *)source
;
600 pArgs
->target
=target
;
601 pArgs
->offsets
=offsets
;
603 /* copy the current bytes to invalidCharBuffer */
604 cnv
->invalidCharBuffer
[0]=*(source
-1);
605 cnv
->invalidCharLength
=1;
607 /* call the callback function */
608 cnv
->fromCharErrorBehaviour(cnv
->toUContext
, pArgs
, cnv
->invalidCharBuffer
, 1, UCNV_ILLEGAL
, pErrorCode
);
610 /* update target and deal with offsets if necessary */
611 offsets
=ucnv_updateCallbackOffsets(offsets
, pArgs
->target
-target
, sourceIndex
);
612 target
=pArgs
->target
;
614 /* update the source pointer and index */
615 sourceIndex
+=1+((const uint8_t *)pArgs
->source
-source
);
616 source
=lastSource
=(const uint8_t *)pArgs
->source
;
/* recompute the counter from the post-callback pointers */
617 targetCapacity
=pArgs
->targetLimit
-target
;
618 length
=sourceLimit
-source
;
619 if(length
<targetCapacity
) {
620 targetCapacity
=length
;
624 * If the callback overflowed the target, then we need to
625 * stop here with an overflow indication.
627 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
) {
629 } else if(U_FAILURE(*pErrorCode
)) {
/* the callback left output in the converter's UCharErrorBuffer */
632 } else if(cnv
->UCharErrorBufferLength
>0) {
634 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
638 #if ASCII_UNROLL_TO_UNICODE
/* input remains but the target has reached targetLimit */
644 if(U_SUCCESS(*pErrorCode
) && source
<sourceLimit
&& target
>=pArgs
->targetLimit
) {
646 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
649 /* set offsets since the start or the last callback */
651 size_t count
=source
-lastSource
;
653 *offsets
++=sourceIndex
++;
658 /* write back the updated pointers */
659 pArgs
->source
=(const char *)source
;
660 pArgs
->target
=target
;
661 pArgs
->offsets
=offsets
;
664 /* This is a table-less version of _MBCSSingleGetNextUChar(). */
/*
 * _ASCIIGetNextUChar: fetch the next character from US-ASCII input.
 *
 * Loops over the bytes before pArgs->sourceLimit. On the illegal-byte path
 * it sets U_ILLEGAL_CHAR_FOUND, redirects pArgs->target/targetLimit to a
 * local buffer and calls cnv->fromCharErrorBehaviour. Whatever the callback
 * wrote is returned through ucnv_getUChar32KeepOverflow(). With no output,
 * *pErrorCode becomes U_INDEX_OUTOFBOUNDS_ERROR.
 *
 * NOTE(review): the byte read, its range test and the plain return are
 * missing from this listing; the comments below describe only the
 * statements that are visible.
 */
666 _ASCIIGetNextUChar(UConverterToUnicodeArgs
*pArgs
,
667 UErrorCode
*pErrorCode
) {
/* scratch target handed to the error callback */
668 UChar buffer
[UTF_MAX_CHAR_LENGTH
];
669 const uint8_t *source
;
672 /* set up the local pointers */
673 source
=(const uint8_t *)pArgs
->source
;
675 /* conversion loop */
676 while(source
<(const uint8_t *)pArgs
->sourceLimit
) {
/* publish the current input position */
678 pArgs
->source
=(const char *)source
;
682 /* call the callback function with all the preparations and post-processing */
683 UConverter
*cnv
=pArgs
->converter
;
685 /* callback(illegal) */
686 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
688 /* update the arguments structure */
689 pArgs
->target
=buffer
;
690 pArgs
->targetLimit
=buffer
+UTF_MAX_CHAR_LENGTH
;
692 /* copy the current byte to invalidCharBuffer */
693 cnv
->invalidCharBuffer
[0]=(char)b
;
694 cnv
->invalidCharLength
=1;
696 /* call the callback function */
697 cnv
->fromCharErrorBehaviour(cnv
->toUContext
, pArgs
, cnv
->invalidCharBuffer
, 1, UCNV_ILLEGAL
, pErrorCode
);
699 /* update the source pointer */
700 source
=(const uint8_t *)pArgs
->source
;
703 * return the first character if the callback wrote some
704 * we do not need to goto finish because the converter state is already set
706 if(U_SUCCESS(*pErrorCode
)) {
/* number of UChars the callback wrote into buffer */
707 int32_t length
=pArgs
->target
-buffer
;
709 return ucnv_getUChar32KeepOverflow(cnv
, buffer
, length
);
711 /* else (callback did not write anything) continue */
712 } else if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
) {
/* overflow is cleared and the full buffer length is passed on */
713 *pErrorCode
=U_ZERO_ERROR
;
714 return ucnv_getUChar32KeepOverflow(cnv
, buffer
, UTF_MAX_CHAR_LENGTH
);
717 /* ### what if a callback set an error but _also_ generated output?! */
723 /* no output because of empty input or only skipping callbacks */
724 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
/*
 * _ASCIIGetUnicodeSet: adds the code point range 0..0x7f to `set` via
 * uset_addRange(). The cnv, which and pErrorCode parameters are not used in
 * the visible lines.
 * NOTE(review): the parameter line declaring `set` is missing from this
 * listing.
 */
729 _ASCIIGetUnicodeSet(const UConverter
*cnv
,
731 UConverterUnicodeSet which
,
732 UErrorCode
*pErrorCode
) {
733 uset_addRange(set
, 0, 0x7f);
/*
 * Implementation table for the US-ASCII converter.
 * The to-Unicode direction uses the ASCII-specific function; the
 * from-Unicode direction reuses _Latin1FromUnicodeWithOffsets.
 * NOTE(review): presumably each pair fills the plain and the with-offsets
 * slots of UConverterImpl; the other initializer entries are not visible in
 * this listing - confirm against the struct definition.
 */
736 static const UConverterImpl _ASCIIImpl
={
746 _ASCIIToUnicodeWithOffsets
,
747 _ASCIIToUnicodeWithOffsets
,
748 _Latin1FromUnicodeWithOffsets
,
749 _Latin1FromUnicodeWithOffsets
,
/*
 * Static converter description for US-ASCII.
 * NOTE(review): field meanings are not visible in this listing. Presumably
 * 367 is the IBM codepage number (it is paired with UCNV_IBM), the two 1s
 * are the min/max bytes per character, and { 0x1a, ... } with length 1 is
 * the substitution character - confirm against UConverterStaticData.
 */
759 static const UConverterStaticData _ASCIIStaticData
={
760 sizeof(UConverterStaticData
),
762 367, UCNV_IBM
, UCNV_US_ASCII
, 1, 1,
763 { 0x1a, 0, 0, 0 }, 1, FALSE
, FALSE
,
766 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared data object for the US-ASCII converter; it points at
 * _ASCIIStaticData and _ASCIIImpl.
 * NOTE(review): the meaning of the ~((uint32_t) 0), NULL and FALSE entries
 * is not visible here - confirm against UConverterSharedData.
 */
769 const UConverterSharedData _ASCIIData
={
770 sizeof(UConverterSharedData
), ~((uint32_t) 0),
771 NULL
, NULL
, &_ASCIIStaticData
, FALSE
, &_ASCIIImpl
,