2 **********************************************************************
3 * Copyright (C) 2000-2007, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnvlat1.cpp
8 * tab size: 8 (not used)
11 * created on: 2000feb07
12 * created by: Markus W. Scherer
15 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_CONVERSION
19 #include "unicode/ucnv.h"
20 #include "unicode/uset.h"
24 /* control optimizations according to the platform */
25 #define LATIN1_UNROLL_FROM_UNICODE 1
27 /* ISO 8859-1 --------------------------------------------------------------- */
29 /* This is a table-less and callback-less version of ucnv_MBCSSingleToBMPWithOffsets(). */
31 _Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
32 UErrorCode
*pErrorCode
) {
33 const uint8_t *source
;
35 int32_t targetCapacity
, length
;
40 /* set up the local pointers */
41 source
=(const uint8_t *)pArgs
->source
;
43 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
44 offsets
=pArgs
->offsets
;
49 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
50 * for the minimum of the sourceLength and targetCapacity
52 length
=(int32_t)((const uint8_t *)pArgs
->sourceLimit
-source
);
53 if(length
<=targetCapacity
) {
54 targetCapacity
=length
;
56 /* target will be full */
57 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
58 length
=targetCapacity
;
61 if(targetCapacity
>=8) {
62 /* This loop is unrolled for speed and improved pipelining. */
65 loops
=count
=targetCapacity
>>3;
66 length
=targetCapacity
&=0x7;
82 offsets
[0]=sourceIndex
++;
83 offsets
[1]=sourceIndex
++;
84 offsets
[2]=sourceIndex
++;
85 offsets
[3]=sourceIndex
++;
86 offsets
[4]=sourceIndex
++;
87 offsets
[5]=sourceIndex
++;
88 offsets
[6]=sourceIndex
++;
89 offsets
[7]=sourceIndex
++;
96 while(targetCapacity
>0) {
101 /* write back the updated pointers */
102 pArgs
->source
=(const char *)source
;
103 pArgs
->target
=target
;
108 *offsets
++=sourceIndex
++;
111 pArgs
->offsets
=offsets
;
115 /* This is a table-less and callback-less version of ucnv_MBCSSingleGetNextUChar(). */
117 _Latin1GetNextUChar(UConverterToUnicodeArgs
*pArgs
,
118 UErrorCode
*pErrorCode
) {
119 const uint8_t *source
=(const uint8_t *)pArgs
->source
;
120 if(source
<(const uint8_t *)pArgs
->sourceLimit
) {
121 pArgs
->source
=(const char *)(source
+1);
125 /* no output because of empty input */
126 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
130 /* This is a table-less version of ucnv_MBCSSingleFromBMPWithOffsets(). */
132 _Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
133 UErrorCode
*pErrorCode
) {
135 const UChar
*source
, *sourceLimit
;
136 uint8_t *target
, *oldTarget
;
137 int32_t targetCapacity
, length
;
145 /* set up the local pointers */
146 cnv
=pArgs
->converter
;
147 source
=pArgs
->source
;
148 sourceLimit
=pArgs
->sourceLimit
;
149 target
=oldTarget
=(uint8_t *)pArgs
->target
;
150 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
151 offsets
=pArgs
->offsets
;
153 if(cnv
->sharedData
==&_Latin1Data
) {
154 max
=0xff; /* Latin-1 */
156 max
=0x7f; /* US-ASCII */
159 /* get the converter state from UConverter */
162 /* sourceIndex=-1 if the current character began in the previous buffer */
163 sourceIndex
= cp
==0 ? 0 : -1;
166 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
167 * for the minimum of the sourceLength and targetCapacity
169 length
=(int32_t)(sourceLimit
-source
);
170 if(length
<targetCapacity
) {
171 targetCapacity
=length
;
174 /* conversion loop */
175 if(cp
!=0 && targetCapacity
>0) {
179 #if LATIN1_UNROLL_FROM_UNICODE
180 /* unroll the loop with the most common case */
181 if(targetCapacity
>=16) {
182 int32_t count
, loops
;
185 loops
=count
=targetCapacity
>>4;
187 oredChars
=u
=*source
++;
188 *target
++=(uint8_t)u
;
189 oredChars
|=u
=*source
++;
190 *target
++=(uint8_t)u
;
191 oredChars
|=u
=*source
++;
192 *target
++=(uint8_t)u
;
193 oredChars
|=u
=*source
++;
194 *target
++=(uint8_t)u
;
195 oredChars
|=u
=*source
++;
196 *target
++=(uint8_t)u
;
197 oredChars
|=u
=*source
++;
198 *target
++=(uint8_t)u
;
199 oredChars
|=u
=*source
++;
200 *target
++=(uint8_t)u
;
201 oredChars
|=u
=*source
++;
202 *target
++=(uint8_t)u
;
203 oredChars
|=u
=*source
++;
204 *target
++=(uint8_t)u
;
205 oredChars
|=u
=*source
++;
206 *target
++=(uint8_t)u
;
207 oredChars
|=u
=*source
++;
208 *target
++=(uint8_t)u
;
209 oredChars
|=u
=*source
++;
210 *target
++=(uint8_t)u
;
211 oredChars
|=u
=*source
++;
212 *target
++=(uint8_t)u
;
213 oredChars
|=u
=*source
++;
214 *target
++=(uint8_t)u
;
215 oredChars
|=u
=*source
++;
216 *target
++=(uint8_t)u
;
217 oredChars
|=u
=*source
++;
218 *target
++=(uint8_t)u
;
220 /* were all 16 entries really valid? */
222 /* no, return to the first of these 16 */
229 targetCapacity
-=16*count
;
234 *offsets
++=sourceIndex
++;
235 *offsets
++=sourceIndex
++;
236 *offsets
++=sourceIndex
++;
237 *offsets
++=sourceIndex
++;
238 *offsets
++=sourceIndex
++;
239 *offsets
++=sourceIndex
++;
240 *offsets
++=sourceIndex
++;
241 *offsets
++=sourceIndex
++;
242 *offsets
++=sourceIndex
++;
243 *offsets
++=sourceIndex
++;
244 *offsets
++=sourceIndex
++;
245 *offsets
++=sourceIndex
++;
246 *offsets
++=sourceIndex
++;
247 *offsets
++=sourceIndex
++;
248 *offsets
++=sourceIndex
++;
249 *offsets
++=sourceIndex
++;
256 /* conversion loop */
258 while(targetCapacity
>0 && (c
=*source
++)<=max
) {
259 /* convert the Unicode code point */
260 *target
++=(uint8_t)c
;
266 if(!U_IS_SURROGATE(cp
)) {
267 /* callback(unassigned) */
268 } else if(U_IS_SURROGATE_LEAD(cp
)) {
270 if(source
<sourceLimit
) {
271 /* test the following code unit */
273 if(U16_IS_TRAIL(trail
)) {
275 cp
=U16_GET_SUPPLEMENTARY(cp
, trail
);
276 /* this codepage does not map supplementary code points */
277 /* callback(unassigned) */
279 /* this is an unmatched lead code unit (1st surrogate) */
280 /* callback(illegal) */
288 /* this is an unmatched trail code unit (2nd surrogate) */
289 /* callback(illegal) */
292 *pErrorCode
= U_IS_SURROGATE(cp
) ? U_ILLEGAL_CHAR_FOUND
: U_INVALID_CHAR_FOUND
;
297 /* set offsets since the start */
299 size_t count
=target
-oldTarget
;
301 *offsets
++=sourceIndex
++;
306 if(U_SUCCESS(*pErrorCode
) && source
<sourceLimit
&& target
>=(uint8_t *)pArgs
->targetLimit
) {
308 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
311 /* write back the updated pointers */
312 pArgs
->source
=source
;
313 pArgs
->target
=(char *)target
;
314 pArgs
->offsets
=offsets
;
317 /* Convert UTF-8 to Latin-1. Adapted from ucnv_SBCSFromUTF8(). */
319 ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs
*pFromUArgs
,
320 UConverterToUnicodeArgs
*pToUArgs
,
321 UErrorCode
*pErrorCode
) {
323 const uint8_t *source
, *sourceLimit
;
325 int32_t targetCapacity
;
330 /* set up the local pointers */
331 utf8
=pToUArgs
->converter
;
332 source
=(uint8_t *)pToUArgs
->source
;
333 sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
;
334 target
=(uint8_t *)pFromUArgs
->target
;
335 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-pFromUArgs
->target
);
337 /* get the converter state from the UTF-8 UConverter */
338 c
=(UChar32
)utf8
->toUnicodeStatus
;
339 if(c
!=0 && source
<sourceLimit
) {
340 if(targetCapacity
==0) {
341 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
343 } else if(c
>=0xc2 && c
<=0xc3 && (t1
=(uint8_t)(*source
-0x80)) <= 0x3f) {
345 *target
++=(uint8_t)(((c
&3)<<6)|t1
);
348 utf8
->toUnicodeStatus
=0;
351 /* complicated, illegal or unmappable input: fall back to the pivoting implementation */
352 *pErrorCode
=U_USING_DEFAULT_WARNING
;
358 * Make sure that the last byte sequence before sourceLimit is complete
359 * or runs into a lead byte.
360 * In the conversion loop compare source with sourceLimit only once
361 * per multi-byte character.
362 * For Latin-1, adjust sourceLimit only for 1 trail byte because
363 * the conversion loop handles at most 2-byte sequences.
365 if(source
<sourceLimit
&& U8_IS_LEAD(*(sourceLimit
-1))) {
369 /* conversion loop */
370 while(source
<sourceLimit
) {
371 if(targetCapacity
>0) {
375 *target
++=(uint8_t)b
;
377 } else if( /* handle U+0080..U+00FF inline */
378 b
>=0xc2 && b
<=0xc3 &&
379 (t1
=(uint8_t)(*source
-0x80)) <= 0x3f
382 *target
++=(uint8_t)(((b
&3)<<6)|t1
);
385 /* complicated, illegal or unmappable input: fall back to the pivoting implementation */
386 pToUArgs
->source
=(char *)(source
-1);
387 pFromUArgs
->target
=(char *)target
;
388 *pErrorCode
=U_USING_DEFAULT_WARNING
;
393 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
399 * The sourceLimit may have been adjusted before the conversion loop
400 * to stop before a truncated sequence.
401 * If so, then collect the truncated sequence now.
402 * For Latin-1, there is at most one lead byte because of the
403 * smaller sourceLimit adjustment logic.
405 if(U_SUCCESS(*pErrorCode
) && source
<(sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
)) {
406 utf8
->toUnicodeStatus
=utf8
->toUBytes
[0]=b
=*source
++;
408 utf8
->mode
=utf8_countTrailBytes
[b
]+1;
411 /* write back the updated pointers */
412 pToUArgs
->source
=(char *)source
;
413 pFromUArgs
->target
=(char *)target
;
417 _Latin1GetUnicodeSet(const UConverter
*cnv
,
419 UConverterUnicodeSet which
,
420 UErrorCode
*pErrorCode
) {
421 sa
->addRange(sa
->set
, 0, 0xff);
424 static const UConverterImpl _Latin1Impl
={
434 _Latin1ToUnicodeWithOffsets
,
435 _Latin1ToUnicodeWithOffsets
,
436 _Latin1FromUnicodeWithOffsets
,
437 _Latin1FromUnicodeWithOffsets
,
444 _Latin1GetUnicodeSet
,
450 static const UConverterStaticData _Latin1StaticData
={
451 sizeof(UConverterStaticData
),
453 819, UCNV_IBM
, UCNV_LATIN_1
, 1, 1,
454 { 0x1a, 0, 0, 0 }, 1, FALSE
, FALSE
,
457 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
460 const UConverterSharedData _Latin1Data
={
461 sizeof(UConverterSharedData
), ~((uint32_t) 0),
462 NULL
, NULL
, &_Latin1StaticData
, FALSE
, &_Latin1Impl
,
466 /* US-ASCII ----------------------------------------------------------------- */
468 /* This is a table-less version of ucnv_MBCSSingleToBMPWithOffsets(). */
470 _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
471 UErrorCode
*pErrorCode
) {
472 const uint8_t *source
, *sourceLimit
;
473 UChar
*target
, *oldTarget
;
474 int32_t targetCapacity
, length
;
481 /* set up the local pointers */
482 source
=(const uint8_t *)pArgs
->source
;
483 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
484 target
=oldTarget
=pArgs
->target
;
485 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
486 offsets
=pArgs
->offsets
;
488 /* sourceIndex=-1 if the current character began in the previous buffer */
492 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
493 * for the minimum of the sourceLength and targetCapacity
495 length
=(int32_t)(sourceLimit
-source
);
496 if(length
<targetCapacity
) {
497 targetCapacity
=length
;
500 if(targetCapacity
>=8) {
501 /* This loop is unrolled for speed and improved pipelining. */
502 int32_t count
, loops
;
505 loops
=count
=targetCapacity
>>3;
507 oredChars
=target
[0]=source
[0];
508 oredChars
|=target
[1]=source
[1];
509 oredChars
|=target
[2]=source
[2];
510 oredChars
|=target
[3]=source
[3];
511 oredChars
|=target
[4]=source
[4];
512 oredChars
|=target
[5]=source
[5];
513 oredChars
|=target
[6]=source
[6];
514 oredChars
|=target
[7]=source
[7];
516 /* were all 8 entries really valid? */
518 /* no, return to the first of these 8 */
525 targetCapacity
-=count
*8;
530 offsets
[0]=sourceIndex
++;
531 offsets
[1]=sourceIndex
++;
532 offsets
[2]=sourceIndex
++;
533 offsets
[3]=sourceIndex
++;
534 offsets
[4]=sourceIndex
++;
535 offsets
[5]=sourceIndex
++;
536 offsets
[6]=sourceIndex
++;
537 offsets
[7]=sourceIndex
++;
544 /* conversion loop */
546 while(targetCapacity
>0 && (c
=*source
++)<=0x7f) {
552 /* callback(illegal); copy the current bytes to toUBytes[] */
553 UConverter
*cnv
=pArgs
->converter
;
556 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
557 } else if(source
<sourceLimit
&& target
>=pArgs
->targetLimit
) {
559 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
562 /* set offsets since the start */
564 size_t count
=target
-oldTarget
;
566 *offsets
++=sourceIndex
++;
571 /* write back the updated pointers */
572 pArgs
->source
=(const char *)source
;
573 pArgs
->target
=target
;
574 pArgs
->offsets
=offsets
;
577 /* This is a table-less version of ucnv_MBCSSingleGetNextUChar(). */
579 _ASCIIGetNextUChar(UConverterToUnicodeArgs
*pArgs
,
580 UErrorCode
*pErrorCode
) {
581 const uint8_t *source
;
584 source
=(const uint8_t *)pArgs
->source
;
585 if(source
<(const uint8_t *)pArgs
->sourceLimit
) {
587 pArgs
->source
=(const char *)source
;
591 UConverter
*cnv
=pArgs
->converter
;
594 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
599 /* no output because of empty input */
600 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
604 /* "Convert" UTF-8 to US-ASCII: Validate and copy. */
606 ucnv_ASCIIFromUTF8(UConverterFromUnicodeArgs
*pFromUArgs
,
607 UConverterToUnicodeArgs
*pToUArgs
,
608 UErrorCode
*pErrorCode
) {
609 const uint8_t *source
, *sourceLimit
;
611 int32_t targetCapacity
, length
;
615 if(pToUArgs
->converter
->toUnicodeStatus
!=0) {
616 /* no handling of partial UTF-8 characters here, fall back to pivoting */
617 *pErrorCode
=U_USING_DEFAULT_WARNING
;
621 /* set up the local pointers */
622 source
=(const uint8_t *)pToUArgs
->source
;
623 sourceLimit
=(const uint8_t *)pToUArgs
->sourceLimit
;
624 target
=(uint8_t *)pFromUArgs
->target
;
625 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-pFromUArgs
->target
);
628 * since the conversion here is 1:1 uint8_t:uint8_t, we need only one counter
629 * for the minimum of the sourceLength and targetCapacity
631 length
=(int32_t)(sourceLimit
-source
);
632 if(length
<targetCapacity
) {
633 targetCapacity
=length
;
636 /* unroll the loop with the most common case */
637 if(targetCapacity
>=16) {
638 int32_t count
, loops
;
641 loops
=count
=targetCapacity
>>4;
643 oredChars
=*target
++=*source
++;
644 oredChars
|=*target
++=*source
++;
645 oredChars
|=*target
++=*source
++;
646 oredChars
|=*target
++=*source
++;
647 oredChars
|=*target
++=*source
++;
648 oredChars
|=*target
++=*source
++;
649 oredChars
|=*target
++=*source
++;
650 oredChars
|=*target
++=*source
++;
651 oredChars
|=*target
++=*source
++;
652 oredChars
|=*target
++=*source
++;
653 oredChars
|=*target
++=*source
++;
654 oredChars
|=*target
++=*source
++;
655 oredChars
|=*target
++=*source
++;
656 oredChars
|=*target
++=*source
++;
657 oredChars
|=*target
++=*source
++;
658 oredChars
|=*target
++=*source
++;
660 /* were all 16 entries really valid? */
662 /* no, return to the first of these 16 */
669 targetCapacity
-=16*count
;
672 /* conversion loop */
674 while(targetCapacity
>0 && (c
=*source
)<=0x7f) {
681 /* non-ASCII character, handle in standard converter */
682 *pErrorCode
=U_USING_DEFAULT_WARNING
;
683 } else if(source
<sourceLimit
&& target
>=(const uint8_t *)pFromUArgs
->targetLimit
) {
685 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
688 /* write back the updated pointers */
689 pToUArgs
->source
=(const char *)source
;
690 pFromUArgs
->target
=(char *)target
;
694 _ASCIIGetUnicodeSet(const UConverter
*cnv
,
696 UConverterUnicodeSet which
,
697 UErrorCode
*pErrorCode
) {
698 sa
->addRange(sa
->set
, 0, 0x7f);
701 static const UConverterImpl _ASCIIImpl
={
711 _ASCIIToUnicodeWithOffsets
,
712 _ASCIIToUnicodeWithOffsets
,
713 _Latin1FromUnicodeWithOffsets
,
714 _Latin1FromUnicodeWithOffsets
,
727 static const UConverterStaticData _ASCIIStaticData
={
728 sizeof(UConverterStaticData
),
730 367, UCNV_IBM
, UCNV_US_ASCII
, 1, 1,
731 { 0x1a, 0, 0, 0 }, 1, FALSE
, FALSE
,
734 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
737 const UConverterSharedData _ASCIIData
={
738 sizeof(UConverterSharedData
), ~((uint32_t) 0),
739 NULL
, NULL
, &_ASCIIStaticData
, FALSE
, &_ASCIIImpl
,