2 **********************************************************************
3 * Copyright (C) 2000-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnvlat1.cpp
8 * tab size: 8 (not used)
11 * created on: 2000feb07
12 * created by: Markus W. Scherer
15 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_CONVERSION
19 #include "unicode/ucnv.h"
20 #include "unicode/uset.h"
21 #include "unicode/utf8.h"
25 /* control optimizations according to the platform */
26 #define LATIN1_UNROLL_FROM_UNICODE 1
28 /* ISO 8859-1 --------------------------------------------------------------- */
30 /* This is a table-less and callback-less version of ucnv_MBCSSingleToBMPWithOffsets(). */
32 _Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
33 UErrorCode
*pErrorCode
) {
34 const uint8_t *source
;
36 int32_t targetCapacity
, length
;
41 /* set up the local pointers */
42 source
=(const uint8_t *)pArgs
->source
;
44 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
45 offsets
=pArgs
->offsets
;
50 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
51 * for the minimum of the sourceLength and targetCapacity
53 length
=(int32_t)((const uint8_t *)pArgs
->sourceLimit
-source
);
54 if(length
<=targetCapacity
) {
55 targetCapacity
=length
;
57 /* target will be full */
58 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
59 length
=targetCapacity
;
62 if(targetCapacity
>=8) {
63 /* This loop is unrolled for speed and improved pipelining. */
66 loops
=count
=targetCapacity
>>3;
67 length
=targetCapacity
&=0x7;
83 offsets
[0]=sourceIndex
++;
84 offsets
[1]=sourceIndex
++;
85 offsets
[2]=sourceIndex
++;
86 offsets
[3]=sourceIndex
++;
87 offsets
[4]=sourceIndex
++;
88 offsets
[5]=sourceIndex
++;
89 offsets
[6]=sourceIndex
++;
90 offsets
[7]=sourceIndex
++;
97 while(targetCapacity
>0) {
102 /* write back the updated pointers */
103 pArgs
->source
=(const char *)source
;
104 pArgs
->target
=target
;
109 *offsets
++=sourceIndex
++;
112 pArgs
->offsets
=offsets
;
116 /* This is a table-less and callback-less version of ucnv_MBCSSingleGetNextUChar(). */
118 _Latin1GetNextUChar(UConverterToUnicodeArgs
*pArgs
,
119 UErrorCode
*pErrorCode
) {
120 const uint8_t *source
=(const uint8_t *)pArgs
->source
;
121 if(source
<(const uint8_t *)pArgs
->sourceLimit
) {
122 pArgs
->source
=(const char *)(source
+1);
126 /* no output because of empty input */
127 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
131 /* This is a table-less version of ucnv_MBCSSingleFromBMPWithOffsets(). */
133 _Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
134 UErrorCode
*pErrorCode
) {
136 const UChar
*source
, *sourceLimit
;
137 uint8_t *target
, *oldTarget
;
138 int32_t targetCapacity
, length
;
146 /* set up the local pointers */
147 cnv
=pArgs
->converter
;
148 source
=pArgs
->source
;
149 sourceLimit
=pArgs
->sourceLimit
;
150 target
=oldTarget
=(uint8_t *)pArgs
->target
;
151 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
152 offsets
=pArgs
->offsets
;
154 if(cnv
->sharedData
==&_Latin1Data
) {
155 max
=0xff; /* Latin-1 */
157 max
=0x7f; /* US-ASCII */
160 /* get the converter state from UConverter */
163 /* sourceIndex=-1 if the current character began in the previous buffer */
164 sourceIndex
= cp
==0 ? 0 : -1;
167 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
168 * for the minimum of the sourceLength and targetCapacity
170 length
=(int32_t)(sourceLimit
-source
);
171 if(length
<targetCapacity
) {
172 targetCapacity
=length
;
175 /* conversion loop */
176 if(cp
!=0 && targetCapacity
>0) {
180 #if LATIN1_UNROLL_FROM_UNICODE
181 /* unroll the loop with the most common case */
182 if(targetCapacity
>=16) {
183 int32_t count
, loops
;
186 loops
=count
=targetCapacity
>>4;
188 oredChars
=u
=*source
++;
189 *target
++=(uint8_t)u
;
190 oredChars
|=u
=*source
++;
191 *target
++=(uint8_t)u
;
192 oredChars
|=u
=*source
++;
193 *target
++=(uint8_t)u
;
194 oredChars
|=u
=*source
++;
195 *target
++=(uint8_t)u
;
196 oredChars
|=u
=*source
++;
197 *target
++=(uint8_t)u
;
198 oredChars
|=u
=*source
++;
199 *target
++=(uint8_t)u
;
200 oredChars
|=u
=*source
++;
201 *target
++=(uint8_t)u
;
202 oredChars
|=u
=*source
++;
203 *target
++=(uint8_t)u
;
204 oredChars
|=u
=*source
++;
205 *target
++=(uint8_t)u
;
206 oredChars
|=u
=*source
++;
207 *target
++=(uint8_t)u
;
208 oredChars
|=u
=*source
++;
209 *target
++=(uint8_t)u
;
210 oredChars
|=u
=*source
++;
211 *target
++=(uint8_t)u
;
212 oredChars
|=u
=*source
++;
213 *target
++=(uint8_t)u
;
214 oredChars
|=u
=*source
++;
215 *target
++=(uint8_t)u
;
216 oredChars
|=u
=*source
++;
217 *target
++=(uint8_t)u
;
218 oredChars
|=u
=*source
++;
219 *target
++=(uint8_t)u
;
221 /* were all 16 entries really valid? */
223 /* no, return to the first of these 16 */
230 targetCapacity
-=16*count
;
235 *offsets
++=sourceIndex
++;
236 *offsets
++=sourceIndex
++;
237 *offsets
++=sourceIndex
++;
238 *offsets
++=sourceIndex
++;
239 *offsets
++=sourceIndex
++;
240 *offsets
++=sourceIndex
++;
241 *offsets
++=sourceIndex
++;
242 *offsets
++=sourceIndex
++;
243 *offsets
++=sourceIndex
++;
244 *offsets
++=sourceIndex
++;
245 *offsets
++=sourceIndex
++;
246 *offsets
++=sourceIndex
++;
247 *offsets
++=sourceIndex
++;
248 *offsets
++=sourceIndex
++;
249 *offsets
++=sourceIndex
++;
250 *offsets
++=sourceIndex
++;
257 /* conversion loop */
259 while(targetCapacity
>0 && (c
=*source
++)<=max
) {
260 /* convert the Unicode code point */
261 *target
++=(uint8_t)c
;
267 if(!U_IS_SURROGATE(cp
)) {
268 /* callback(unassigned) */
269 } else if(U_IS_SURROGATE_LEAD(cp
)) {
271 if(source
<sourceLimit
) {
272 /* test the following code unit */
274 if(U16_IS_TRAIL(trail
)) {
276 cp
=U16_GET_SUPPLEMENTARY(cp
, trail
);
277 /* this codepage does not map supplementary code points */
278 /* callback(unassigned) */
280 /* this is an unmatched lead code unit (1st surrogate) */
281 /* callback(illegal) */
289 /* this is an unmatched trail code unit (2nd surrogate) */
290 /* callback(illegal) */
293 *pErrorCode
= U_IS_SURROGATE(cp
) ? U_ILLEGAL_CHAR_FOUND
: U_INVALID_CHAR_FOUND
;
298 /* set offsets since the start */
300 size_t count
=target
-oldTarget
;
302 *offsets
++=sourceIndex
++;
307 if(U_SUCCESS(*pErrorCode
) && source
<sourceLimit
&& target
>=(uint8_t *)pArgs
->targetLimit
) {
309 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
312 /* write back the updated pointers */
313 pArgs
->source
=source
;
314 pArgs
->target
=(char *)target
;
315 pArgs
->offsets
=offsets
;
318 /* Convert UTF-8 to Latin-1. Adapted from ucnv_SBCSFromUTF8(). */
320 ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs
*pFromUArgs
,
321 UConverterToUnicodeArgs
*pToUArgs
,
322 UErrorCode
*pErrorCode
) {
324 const uint8_t *source
, *sourceLimit
;
326 int32_t targetCapacity
;
331 /* set up the local pointers */
332 utf8
=pToUArgs
->converter
;
333 source
=(uint8_t *)pToUArgs
->source
;
334 sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
;
335 target
=(uint8_t *)pFromUArgs
->target
;
336 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-pFromUArgs
->target
);
338 /* get the converter state from the UTF-8 UConverter */
339 c
=(UChar32
)utf8
->toUnicodeStatus
;
340 if(c
!=0 && source
<sourceLimit
) {
341 if(targetCapacity
==0) {
342 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
344 } else if(c
>=0xc2 && c
<=0xc3 && (t1
=(uint8_t)(*source
-0x80)) <= 0x3f) {
346 *target
++=(uint8_t)(((c
&3)<<6)|t1
);
349 utf8
->toUnicodeStatus
=0;
352 /* complicated, illegal or unmappable input: fall back to the pivoting implementation */
353 *pErrorCode
=U_USING_DEFAULT_WARNING
;
359 * Make sure that the last byte sequence before sourceLimit is complete
360 * or runs into a lead byte.
361 * In the conversion loop compare source with sourceLimit only once
362 * per multi-byte character.
363 * For Latin-1, adjust sourceLimit only for 1 trail byte because
364 * the conversion loop handles at most 2-byte sequences.
366 if(source
<sourceLimit
&& U8_IS_LEAD(*(sourceLimit
-1))) {
370 /* conversion loop */
371 while(source
<sourceLimit
) {
372 if(targetCapacity
>0) {
376 *target
++=(uint8_t)b
;
378 } else if( /* handle U+0080..U+00FF inline */
379 b
>=0xc2 && b
<=0xc3 &&
380 (t1
=(uint8_t)(*source
-0x80)) <= 0x3f
383 *target
++=(uint8_t)(((b
&3)<<6)|t1
);
386 /* complicated, illegal or unmappable input: fall back to the pivoting implementation */
387 pToUArgs
->source
=(char *)(source
-1);
388 pFromUArgs
->target
=(char *)target
;
389 *pErrorCode
=U_USING_DEFAULT_WARNING
;
394 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
400 * The sourceLimit may have been adjusted before the conversion loop
401 * to stop before a truncated sequence.
402 * If so, then collect the truncated sequence now.
403 * For Latin-1, this is at most one lead byte, because sourceLimit is
404 * adjusted for only one trail byte (the loop handles at most 2-byte sequences).
406 if(U_SUCCESS(*pErrorCode
) && source
<(sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
)) {
407 utf8
->toUnicodeStatus
=utf8
->toUBytes
[0]=b
=*source
++;
409 utf8
->mode
=U8_COUNT_TRAIL_BYTES(b
)+1;
412 /* write back the updated pointers */
413 pToUArgs
->source
=(char *)source
;
414 pFromUArgs
->target
=(char *)target
;
418 _Latin1GetUnicodeSet(const UConverter
*cnv
,
420 UConverterUnicodeSet which
,
421 UErrorCode
*pErrorCode
) {
422 sa
->addRange(sa
->set
, 0, 0xff);
425 static const UConverterImpl _Latin1Impl
={
435 _Latin1ToUnicodeWithOffsets
,
436 _Latin1ToUnicodeWithOffsets
,
437 _Latin1FromUnicodeWithOffsets
,
438 _Latin1FromUnicodeWithOffsets
,
445 _Latin1GetUnicodeSet
,
451 static const UConverterStaticData _Latin1StaticData
={
452 sizeof(UConverterStaticData
),
454 819, UCNV_IBM
, UCNV_LATIN_1
, 1, 1,
455 { 0x1a, 0, 0, 0 }, 1, FALSE
, FALSE
,
458 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
461 const UConverterSharedData _Latin1Data
={
462 sizeof(UConverterSharedData
), ~((uint32_t) 0),
463 NULL
, NULL
, &_Latin1StaticData
, FALSE
, &_Latin1Impl
,
467 /* US-ASCII ----------------------------------------------------------------- */
469 /* This is a table-less version of ucnv_MBCSSingleToBMPWithOffsets(). */
471 _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
472 UErrorCode
*pErrorCode
) {
473 const uint8_t *source
, *sourceLimit
;
474 UChar
*target
, *oldTarget
;
475 int32_t targetCapacity
, length
;
482 /* set up the local pointers */
483 source
=(const uint8_t *)pArgs
->source
;
484 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
485 target
=oldTarget
=pArgs
->target
;
486 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
487 offsets
=pArgs
->offsets
;
489 /* sourceIndex=-1 if the current character began in the previous buffer */
493 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
494 * for the minimum of the sourceLength and targetCapacity
496 length
=(int32_t)(sourceLimit
-source
);
497 if(length
<targetCapacity
) {
498 targetCapacity
=length
;
501 if(targetCapacity
>=8) {
502 /* This loop is unrolled for speed and improved pipelining. */
503 int32_t count
, loops
;
506 loops
=count
=targetCapacity
>>3;
508 oredChars
=target
[0]=source
[0];
509 oredChars
|=target
[1]=source
[1];
510 oredChars
|=target
[2]=source
[2];
511 oredChars
|=target
[3]=source
[3];
512 oredChars
|=target
[4]=source
[4];
513 oredChars
|=target
[5]=source
[5];
514 oredChars
|=target
[6]=source
[6];
515 oredChars
|=target
[7]=source
[7];
517 /* were all 8 entries really valid? */
519 /* no, return to the first of these 8 */
526 targetCapacity
-=count
*8;
531 offsets
[0]=sourceIndex
++;
532 offsets
[1]=sourceIndex
++;
533 offsets
[2]=sourceIndex
++;
534 offsets
[3]=sourceIndex
++;
535 offsets
[4]=sourceIndex
++;
536 offsets
[5]=sourceIndex
++;
537 offsets
[6]=sourceIndex
++;
538 offsets
[7]=sourceIndex
++;
545 /* conversion loop */
547 while(targetCapacity
>0 && (c
=*source
++)<=0x7f) {
553 /* callback(illegal); copy the current bytes to toUBytes[] */
554 UConverter
*cnv
=pArgs
->converter
;
557 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
558 } else if(source
<sourceLimit
&& target
>=pArgs
->targetLimit
) {
560 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
563 /* set offsets since the start */
565 size_t count
=target
-oldTarget
;
567 *offsets
++=sourceIndex
++;
572 /* write back the updated pointers */
573 pArgs
->source
=(const char *)source
;
574 pArgs
->target
=target
;
575 pArgs
->offsets
=offsets
;
578 /* This is a table-less version of ucnv_MBCSSingleGetNextUChar(). */
580 _ASCIIGetNextUChar(UConverterToUnicodeArgs
*pArgs
,
581 UErrorCode
*pErrorCode
) {
582 const uint8_t *source
;
585 source
=(const uint8_t *)pArgs
->source
;
586 if(source
<(const uint8_t *)pArgs
->sourceLimit
) {
588 pArgs
->source
=(const char *)source
;
592 UConverter
*cnv
=pArgs
->converter
;
595 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
600 /* no output because of empty input */
601 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
605 /* "Convert" UTF-8 to US-ASCII: Validate and copy. */
607 ucnv_ASCIIFromUTF8(UConverterFromUnicodeArgs
*pFromUArgs
,
608 UConverterToUnicodeArgs
*pToUArgs
,
609 UErrorCode
*pErrorCode
) {
610 const uint8_t *source
, *sourceLimit
;
612 int32_t targetCapacity
, length
;
616 if(pToUArgs
->converter
->toUnicodeStatus
!=0) {
617 /* no handling of partial UTF-8 characters here, fall back to pivoting */
618 *pErrorCode
=U_USING_DEFAULT_WARNING
;
622 /* set up the local pointers */
623 source
=(const uint8_t *)pToUArgs
->source
;
624 sourceLimit
=(const uint8_t *)pToUArgs
->sourceLimit
;
625 target
=(uint8_t *)pFromUArgs
->target
;
626 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-pFromUArgs
->target
);
629 * since the conversion here is 1:1 uint8_t:uint8_t, we need only one counter
630 * for the minimum of the sourceLength and targetCapacity
632 length
=(int32_t)(sourceLimit
-source
);
633 if(length
<targetCapacity
) {
634 targetCapacity
=length
;
637 /* unroll the loop with the most common case */
638 if(targetCapacity
>=16) {
639 int32_t count
, loops
;
642 loops
=count
=targetCapacity
>>4;
644 oredChars
=*target
++=*source
++;
645 oredChars
|=*target
++=*source
++;
646 oredChars
|=*target
++=*source
++;
647 oredChars
|=*target
++=*source
++;
648 oredChars
|=*target
++=*source
++;
649 oredChars
|=*target
++=*source
++;
650 oredChars
|=*target
++=*source
++;
651 oredChars
|=*target
++=*source
++;
652 oredChars
|=*target
++=*source
++;
653 oredChars
|=*target
++=*source
++;
654 oredChars
|=*target
++=*source
++;
655 oredChars
|=*target
++=*source
++;
656 oredChars
|=*target
++=*source
++;
657 oredChars
|=*target
++=*source
++;
658 oredChars
|=*target
++=*source
++;
659 oredChars
|=*target
++=*source
++;
661 /* were all 16 entries really valid? */
663 /* no, return to the first of these 16 */
670 targetCapacity
-=16*count
;
673 /* conversion loop */
675 while(targetCapacity
>0 && (c
=*source
)<=0x7f) {
682 /* non-ASCII character, handle in standard converter */
683 *pErrorCode
=U_USING_DEFAULT_WARNING
;
684 } else if(source
<sourceLimit
&& target
>=(const uint8_t *)pFromUArgs
->targetLimit
) {
686 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
689 /* write back the updated pointers */
690 pToUArgs
->source
=(const char *)source
;
691 pFromUArgs
->target
=(char *)target
;
695 _ASCIIGetUnicodeSet(const UConverter
*cnv
,
697 UConverterUnicodeSet which
,
698 UErrorCode
*pErrorCode
) {
699 sa
->addRange(sa
->set
, 0, 0x7f);
702 static const UConverterImpl _ASCIIImpl
={
712 _ASCIIToUnicodeWithOffsets
,
713 _ASCIIToUnicodeWithOffsets
,
714 _Latin1FromUnicodeWithOffsets
,
715 _Latin1FromUnicodeWithOffsets
,
728 static const UConverterStaticData _ASCIIStaticData
={
729 sizeof(UConverterStaticData
),
731 367, UCNV_IBM
, UCNV_US_ASCII
, 1, 1,
732 { 0x1a, 0, 0, 0 }, 1, FALSE
, FALSE
,
735 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
738 const UConverterSharedData _ASCIIData
={
739 sizeof(UConverterSharedData
), ~((uint32_t) 0),
740 NULL
, NULL
, &_ASCIIStaticData
, FALSE
, &_ASCIIImpl
,