1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2000-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * file name: ucnvlat1.cpp
10 * tab size: 8 (not used)
13 * created on: 2000feb07
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_CONVERSION
21 #include "unicode/ucnv.h"
22 #include "unicode/uset.h"
23 #include "unicode/utf8.h"
28 /* control optimizations according to the platform */
/*
 * Compile-time switch: when non-zero, the "#if LATIN1_UNROLL_FROM_UNICODE"
 * section inside _Latin1FromUnicodeWithOffsets() is compiled in. That section
 * copies 16 code units per iteration (it is entered only when
 * targetCapacity>=16) before the regular conversion loop runs.
 */
29 #define LATIN1_UNROLL_FROM_UNICODE 1
31 /* ISO 8859-1 --------------------------------------------------------------- */
33 /* This is a table-less and callback-less version of ucnv_MBCSSingleToBMPWithOffsets(). */
35 static void U_CALLCONV
36 _Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
37 UErrorCode
*pErrorCode
) {
38 const uint8_t *source
;
40 int32_t targetCapacity
, length
;
45 /* set up the local pointers */
46 source
=(const uint8_t *)pArgs
->source
;
48 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
49 offsets
=pArgs
->offsets
;
54 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
55 * for the minimum of the sourceLength and targetCapacity
57 length
=(int32_t)((const uint8_t *)pArgs
->sourceLimit
-source
);
58 if(length
<=targetCapacity
) {
59 targetCapacity
=length
;
61 /* target will be full */
62 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
63 length
=targetCapacity
;
66 if(targetCapacity
>=8) {
67 /* This loop is unrolled for speed and improved pipelining. */
70 loops
=count
=targetCapacity
>>3;
71 length
=targetCapacity
&=0x7;
87 offsets
[0]=sourceIndex
++;
88 offsets
[1]=sourceIndex
++;
89 offsets
[2]=sourceIndex
++;
90 offsets
[3]=sourceIndex
++;
91 offsets
[4]=sourceIndex
++;
92 offsets
[5]=sourceIndex
++;
93 offsets
[6]=sourceIndex
++;
94 offsets
[7]=sourceIndex
++;
100 /* conversion loop */
101 while(targetCapacity
>0) {
106 /* write back the updated pointers */
107 pArgs
->source
=(const char *)source
;
108 pArgs
->target
=target
;
113 *offsets
++=sourceIndex
++;
116 pArgs
->offsets
=offsets
;
120 /* This is a table-less and callback-less version of ucnv_MBCSSingleGetNextUChar(). */
121 static UChar32 U_CALLCONV
122 _Latin1GetNextUChar(UConverterToUnicodeArgs
*pArgs
,
123 UErrorCode
*pErrorCode
) {
124 const uint8_t *source
=(const uint8_t *)pArgs
->source
;
125 if(source
<(const uint8_t *)pArgs
->sourceLimit
) {
126 pArgs
->source
=(const char *)(source
+1);
130 /* no output because of empty input */
131 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
135 /* This is a table-less version of ucnv_MBCSSingleFromBMPWithOffsets(). */
136 static void U_CALLCONV
137 _Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
138 UErrorCode
*pErrorCode
) {
140 const UChar
*source
, *sourceLimit
;
141 uint8_t *target
, *oldTarget
;
142 int32_t targetCapacity
, length
;
150 /* set up the local pointers */
151 cnv
=pArgs
->converter
;
152 source
=pArgs
->source
;
153 sourceLimit
=pArgs
->sourceLimit
;
154 target
=oldTarget
=(uint8_t *)pArgs
->target
;
155 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
156 offsets
=pArgs
->offsets
;
158 if(cnv
->sharedData
==&_Latin1Data
) {
159 max
=0xff; /* Latin-1 */
161 max
=0x7f; /* US-ASCII */
164 /* get the converter state from UConverter */
167 /* sourceIndex=-1 if the current character began in the previous buffer */
168 sourceIndex
= cp
==0 ? 0 : -1;
171 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
172 * for the minimum of the sourceLength and targetCapacity
174 length
=(int32_t)(sourceLimit
-source
);
175 if(length
<targetCapacity
) {
176 targetCapacity
=length
;
179 /* conversion loop */
180 if(cp
!=0 && targetCapacity
>0) {
184 #if LATIN1_UNROLL_FROM_UNICODE
185 /* unroll the loop with the most common case */
186 if(targetCapacity
>=16) {
187 int32_t count
, loops
;
190 loops
=count
=targetCapacity
>>4;
192 oredChars
=u
=*source
++;
193 *target
++=(uint8_t)u
;
194 oredChars
|=u
=*source
++;
195 *target
++=(uint8_t)u
;
196 oredChars
|=u
=*source
++;
197 *target
++=(uint8_t)u
;
198 oredChars
|=u
=*source
++;
199 *target
++=(uint8_t)u
;
200 oredChars
|=u
=*source
++;
201 *target
++=(uint8_t)u
;
202 oredChars
|=u
=*source
++;
203 *target
++=(uint8_t)u
;
204 oredChars
|=u
=*source
++;
205 *target
++=(uint8_t)u
;
206 oredChars
|=u
=*source
++;
207 *target
++=(uint8_t)u
;
208 oredChars
|=u
=*source
++;
209 *target
++=(uint8_t)u
;
210 oredChars
|=u
=*source
++;
211 *target
++=(uint8_t)u
;
212 oredChars
|=u
=*source
++;
213 *target
++=(uint8_t)u
;
214 oredChars
|=u
=*source
++;
215 *target
++=(uint8_t)u
;
216 oredChars
|=u
=*source
++;
217 *target
++=(uint8_t)u
;
218 oredChars
|=u
=*source
++;
219 *target
++=(uint8_t)u
;
220 oredChars
|=u
=*source
++;
221 *target
++=(uint8_t)u
;
222 oredChars
|=u
=*source
++;
223 *target
++=(uint8_t)u
;
225 /* were all 16 entries really valid? */
227 /* no, return to the first of these 16 */
234 targetCapacity
-=16*count
;
239 *offsets
++=sourceIndex
++;
240 *offsets
++=sourceIndex
++;
241 *offsets
++=sourceIndex
++;
242 *offsets
++=sourceIndex
++;
243 *offsets
++=sourceIndex
++;
244 *offsets
++=sourceIndex
++;
245 *offsets
++=sourceIndex
++;
246 *offsets
++=sourceIndex
++;
247 *offsets
++=sourceIndex
++;
248 *offsets
++=sourceIndex
++;
249 *offsets
++=sourceIndex
++;
250 *offsets
++=sourceIndex
++;
251 *offsets
++=sourceIndex
++;
252 *offsets
++=sourceIndex
++;
253 *offsets
++=sourceIndex
++;
254 *offsets
++=sourceIndex
++;
261 /* conversion loop */
263 while(targetCapacity
>0 && (c
=*source
++)<=max
) {
264 /* convert the Unicode code point */
265 *target
++=(uint8_t)c
;
271 if(!U_IS_SURROGATE(cp
)) {
272 /* callback(unassigned) */
273 } else if(U_IS_SURROGATE_LEAD(cp
)) {
275 if(source
<sourceLimit
) {
276 /* test the following code unit */
278 if(U16_IS_TRAIL(trail
)) {
280 cp
=U16_GET_SUPPLEMENTARY(cp
, trail
);
281 /* this codepage does not map supplementary code points */
282 /* callback(unassigned) */
284 /* this is an unmatched lead code unit (1st surrogate) */
285 /* callback(illegal) */
293 /* this is an unmatched trail code unit (2nd surrogate) */
294 /* callback(illegal) */
297 *pErrorCode
= U_IS_SURROGATE(cp
) ? U_ILLEGAL_CHAR_FOUND
: U_INVALID_CHAR_FOUND
;
302 /* set offsets since the start */
304 size_t count
=target
-oldTarget
;
306 *offsets
++=sourceIndex
++;
311 if(U_SUCCESS(*pErrorCode
) && source
<sourceLimit
&& target
>=(uint8_t *)pArgs
->targetLimit
) {
313 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
316 /* write back the updated pointers */
317 pArgs
->source
=source
;
318 pArgs
->target
=(char *)target
;
319 pArgs
->offsets
=offsets
;
322 /* Convert UTF-8 to Latin-1. Adapted from ucnv_SBCSFromUTF8(). */
323 static void U_CALLCONV
324 ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs
*pFromUArgs
,
325 UConverterToUnicodeArgs
*pToUArgs
,
326 UErrorCode
*pErrorCode
) {
328 const uint8_t *source
, *sourceLimit
;
330 int32_t targetCapacity
;
335 /* set up the local pointers */
336 utf8
=pToUArgs
->converter
;
337 source
=(uint8_t *)pToUArgs
->source
;
338 sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
;
339 target
=(uint8_t *)pFromUArgs
->target
;
340 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-pFromUArgs
->target
);
342 /* get the converter state from the UTF-8 UConverter */
343 if (utf8
->toULength
> 0) {
344 c
=(UChar32
)utf8
->toUnicodeStatus
;
348 if(c
!=0 && source
<sourceLimit
) {
349 if(targetCapacity
==0) {
350 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
352 } else if(c
>=0xc2 && c
<=0xc3 && (t1
=(uint8_t)(*source
-0x80)) <= 0x3f) {
354 *target
++=(uint8_t)(((c
&3)<<6)|t1
);
357 utf8
->toUnicodeStatus
=0;
360 /* complicated, illegal or unmappable input: fall back to the pivoting implementation */
361 *pErrorCode
=U_USING_DEFAULT_WARNING
;
367 * Make sure that the last byte sequence before sourceLimit is complete
368 * or runs into a lead byte.
369 * In the conversion loop compare source with sourceLimit only once
370 * per multi-byte character.
371 * For Latin-1, adjust sourceLimit only for 1 trail byte because
372 * the conversion loop handles at most 2-byte sequences.
374 if(source
<sourceLimit
&& U8_IS_LEAD(*(sourceLimit
-1))) {
378 /* conversion loop */
379 while(source
<sourceLimit
) {
380 if(targetCapacity
>0) {
382 if(U8_IS_SINGLE(b
)) {
384 *target
++=(uint8_t)b
;
386 } else if( /* handle U+0080..U+00FF inline */
387 b
>=0xc2 && b
<=0xc3 &&
388 (t1
=(uint8_t)(*source
-0x80)) <= 0x3f
391 *target
++=(uint8_t)(((b
&3)<<6)|t1
);
394 /* complicated, illegal or unmappable input: fall back to the pivoting implementation */
395 pToUArgs
->source
=(char *)(source
-1);
396 pFromUArgs
->target
=(char *)target
;
397 *pErrorCode
=U_USING_DEFAULT_WARNING
;
402 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
408 * The sourceLimit may have been adjusted before the conversion loop
409 * to stop before a truncated sequence.
410 * If so, then collect the truncated sequence now.
411 * For Latin-1, there is at most one lead byte because of the
412 * smaller sourceLimit adjustment logic.
414 if(U_SUCCESS(*pErrorCode
) && source
<(sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
)) {
415 utf8
->toUnicodeStatus
=utf8
->toUBytes
[0]=b
=*source
++;
417 utf8
->mode
=U8_COUNT_BYTES(b
);
420 /* write back the updated pointers */
421 pToUArgs
->source
=(char *)source
;
422 pFromUArgs
->target
=(char *)target
;
425 static void U_CALLCONV
426 _Latin1GetUnicodeSet(const UConverter
*cnv
,
428 UConverterUnicodeSet which
,
429 UErrorCode
*pErrorCode
) {
433 sa
->addRange(sa
->set
, 0, 0xff);
438 static const UConverterImpl _Latin1Impl
={
448 _Latin1ToUnicodeWithOffsets
,
449 _Latin1ToUnicodeWithOffsets
,
450 _Latin1FromUnicodeWithOffsets
,
451 _Latin1FromUnicodeWithOffsets
,
458 _Latin1GetUnicodeSet
,
464 static const UConverterStaticData _Latin1StaticData
={
465 sizeof(UConverterStaticData
),
467 819, UCNV_IBM
, UCNV_LATIN_1
, 1, 1,
468 { 0x1a, 0, 0, 0 }, 1, FALSE
, FALSE
,
471 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared converter data object for ISO 8859-1 (Latin-1).
 * It is initialized by UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER from the
 * Latin-1 static data (_Latin1StaticData) and the Latin-1 function table
 * (_Latin1Impl).
 * _Latin1FromUnicodeWithOffsets() compares cnv->sharedData against the address
 * of this object to pick its upper bound for directly convertible code units:
 * 0xff (Latin-1) versus 0x7f (US-ASCII).
 */
474 const UConverterSharedData _Latin1Data
=
475 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Latin1StaticData
, &_Latin1Impl
);
477 /* US-ASCII ----------------------------------------------------------------- */
480 /* This is a table-less version of ucnv_MBCSSingleToBMPWithOffsets(). */
481 static void U_CALLCONV
482 _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
483 UErrorCode
*pErrorCode
) {
484 const uint8_t *source
, *sourceLimit
;
485 UChar
*target
, *oldTarget
;
486 int32_t targetCapacity
, length
;
493 /* set up the local pointers */
494 source
=(const uint8_t *)pArgs
->source
;
495 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
496 target
=oldTarget
=pArgs
->target
;
497 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
498 offsets
=pArgs
->offsets
;
500 /* sourceIndex=-1 if the current character began in the previous buffer */
504 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
505 * for the minimum of the sourceLength and targetCapacity
507 length
=(int32_t)(sourceLimit
-source
);
508 if(length
<targetCapacity
) {
509 targetCapacity
=length
;
512 if(targetCapacity
>=8) {
513 /* This loop is unrolled for speed and improved pipelining. */
514 int32_t count
, loops
;
517 loops
=count
=targetCapacity
>>3;
519 oredChars
=target
[0]=source
[0];
520 oredChars
|=target
[1]=source
[1];
521 oredChars
|=target
[2]=source
[2];
522 oredChars
|=target
[3]=source
[3];
523 oredChars
|=target
[4]=source
[4];
524 oredChars
|=target
[5]=source
[5];
525 oredChars
|=target
[6]=source
[6];
526 oredChars
|=target
[7]=source
[7];
528 /* were all 8 entries really valid? */
530 /* no, return to the first of these 8 */
537 targetCapacity
-=count
*8;
542 offsets
[0]=sourceIndex
++;
543 offsets
[1]=sourceIndex
++;
544 offsets
[2]=sourceIndex
++;
545 offsets
[3]=sourceIndex
++;
546 offsets
[4]=sourceIndex
++;
547 offsets
[5]=sourceIndex
++;
548 offsets
[6]=sourceIndex
++;
549 offsets
[7]=sourceIndex
++;
556 /* conversion loop */
558 while(targetCapacity
>0 && (c
=*source
++)<=0x7f) {
564 /* callback(illegal); copy the current bytes to toUBytes[] */
565 UConverter
*cnv
=pArgs
->converter
;
568 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
569 } else if(source
<sourceLimit
&& target
>=pArgs
->targetLimit
) {
571 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
574 /* set offsets since the start */
576 size_t count
=target
-oldTarget
;
578 *offsets
++=sourceIndex
++;
583 /* write back the updated pointers */
584 pArgs
->source
=(const char *)source
;
585 pArgs
->target
=target
;
586 pArgs
->offsets
=offsets
;
589 /* This is a table-less version of ucnv_MBCSSingleGetNextUChar(). */
590 static UChar32 U_CALLCONV
591 _ASCIIGetNextUChar(UConverterToUnicodeArgs
*pArgs
,
592 UErrorCode
*pErrorCode
) {
593 const uint8_t *source
;
596 source
=(const uint8_t *)pArgs
->source
;
597 if(source
<(const uint8_t *)pArgs
->sourceLimit
) {
599 pArgs
->source
=(const char *)source
;
603 UConverter
*cnv
=pArgs
->converter
;
606 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
611 /* no output because of empty input */
612 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
616 /* "Convert" UTF-8 to US-ASCII: Validate and copy. */
617 static void U_CALLCONV
618 ucnv_ASCIIFromUTF8(UConverterFromUnicodeArgs
*pFromUArgs
,
619 UConverterToUnicodeArgs
*pToUArgs
,
620 UErrorCode
*pErrorCode
) {
621 const uint8_t *source
, *sourceLimit
;
623 int32_t targetCapacity
, length
;
627 if(pToUArgs
->converter
->toULength
> 0) {
628 /* no handling of partial UTF-8 characters here, fall back to pivoting */
629 *pErrorCode
=U_USING_DEFAULT_WARNING
;
633 /* set up the local pointers */
634 source
=(const uint8_t *)pToUArgs
->source
;
635 sourceLimit
=(const uint8_t *)pToUArgs
->sourceLimit
;
636 target
=(uint8_t *)pFromUArgs
->target
;
637 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-pFromUArgs
->target
);
640 * since the conversion here is 1:1 uint8_t:uint8_t, we need only one counter
641 * for the minimum of the sourceLength and targetCapacity
643 length
=(int32_t)(sourceLimit
-source
);
644 if(length
<targetCapacity
) {
645 targetCapacity
=length
;
648 /* unroll the loop with the most common case */
649 if(targetCapacity
>=16) {
650 int32_t count
, loops
;
653 loops
=count
=targetCapacity
>>4;
655 oredChars
=*target
++=*source
++;
656 oredChars
|=*target
++=*source
++;
657 oredChars
|=*target
++=*source
++;
658 oredChars
|=*target
++=*source
++;
659 oredChars
|=*target
++=*source
++;
660 oredChars
|=*target
++=*source
++;
661 oredChars
|=*target
++=*source
++;
662 oredChars
|=*target
++=*source
++;
663 oredChars
|=*target
++=*source
++;
664 oredChars
|=*target
++=*source
++;
665 oredChars
|=*target
++=*source
++;
666 oredChars
|=*target
++=*source
++;
667 oredChars
|=*target
++=*source
++;
668 oredChars
|=*target
++=*source
++;
669 oredChars
|=*target
++=*source
++;
670 oredChars
|=*target
++=*source
++;
672 /* were all 16 entries really valid? */
674 /* no, return to the first of these 16 */
681 targetCapacity
-=16*count
;
684 /* conversion loop */
686 while(targetCapacity
>0 && (c
=*source
)<=0x7f) {
693 /* non-ASCII character, handle in standard converter */
694 *pErrorCode
=U_USING_DEFAULT_WARNING
;
695 } else if(source
<sourceLimit
&& target
>=(const uint8_t *)pFromUArgs
->targetLimit
) {
697 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
700 /* write back the updated pointers */
701 pToUArgs
->source
=(const char *)source
;
702 pFromUArgs
->target
=(char *)target
;
705 static void U_CALLCONV
706 _ASCIIGetUnicodeSet(const UConverter
*cnv
,
708 UConverterUnicodeSet which
,
709 UErrorCode
*pErrorCode
) {
713 sa
->addRange(sa
->set
, 0, 0x7f);
717 static const UConverterImpl _ASCIIImpl
={
727 _ASCIIToUnicodeWithOffsets
,
728 _ASCIIToUnicodeWithOffsets
,
729 _Latin1FromUnicodeWithOffsets
,
730 _Latin1FromUnicodeWithOffsets
,
743 static const UConverterStaticData _ASCIIStaticData
={
744 sizeof(UConverterStaticData
),
746 367, UCNV_IBM
, UCNV_US_ASCII
, 1, 1,
747 { 0x1a, 0, 0, 0 }, 1, FALSE
, FALSE
,
750 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared converter data object for US-ASCII.
 * It is initialized by UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER from the
 * US-ASCII static data (_ASCIIStaticData) and the US-ASCII function table
 * (_ASCIIImpl).
 * Note that _ASCIIImpl lists _Latin1FromUnicodeWithOffsets for the
 * from-Unicode direction, so that function serves both converters.
 */
753 const UConverterSharedData _ASCIIData
=
754 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ASCIIStaticData
, &_ASCIIImpl
);