1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2000-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * file name: ucnvlat1.cpp
10 * tab size: 8 (not used)
13 * created on: 2000feb07
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_CONVERSION
21 #include "unicode/ucnv.h"
22 #include "unicode/uset.h"
23 #include "unicode/utf8.h"
27 /* control optimizations according to the platform */
/*
 * Compile-time switch. A nonzero value enables the section guarded by the
 * LATIN1_UNROLL_FROM_UNICODE preprocessor test inside
 * _Latin1FromUnicodeWithOffsets(), which handles 16 code units per iteration.
 */
28 #define LATIN1_UNROLL_FROM_UNICODE 1
30 /* ISO 8859-1 --------------------------------------------------------------- */
32 /* This is a table-less and callback-less version of ucnv_MBCSSingleToBMPWithOffsets(). */
/*
 * Converts ISO 8859-1 bytes from pArgs->source to pArgs->target.
 * The mapping is 1:1, so a single counter is used: the smaller of the
 * remaining source length and the remaining target capacity.
 * Sets *pErrorCode to U_BUFFER_OVERFLOW_ERROR when the source is longer than
 * the target capacity.
 * Consecutive sourceIndex values are stored through the offsets pointer.
 * On exit the updated source, target and offsets pointers are written back
 * to pArgs.
 * NOTE(review): the declarations of target, offsets, sourceIndex, count and
 * loops are not visible in this listing.
 */
34 static void U_CALLCONV
35 _Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
36 UErrorCode
*pErrorCode
) {
37 const uint8_t *source
;
39 int32_t targetCapacity
, length
;
44 /* set up the local pointers */
45 source
=(const uint8_t *)pArgs
->source
;
47 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
48 offsets
=pArgs
->offsets
;
53 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
54 * for the minimum of the sourceLength and targetCapacity
56 length
=(int32_t)((const uint8_t *)pArgs
->sourceLimit
-source
);
/* everything fits: only length units need to be converted */
57 if(length
<=targetCapacity
) {
58 targetCapacity
=length
;
60 /* target will be full */
61 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
62 length
=targetCapacity
;
65 if(targetCapacity
>=8) {
66 /* This loop is unrolled for speed and improved pipelining. */
/*
 * targetCapacity>>3 is the number of full blocks of 8; the low three bits
 * stay in both targetCapacity and length for the remainder handling below.
 */
69 loops
=count
=targetCapacity
>>3;
70 length
=targetCapacity
&=0x7;
/* one offset per converted unit, 8 per block */
86 offsets
[0]=sourceIndex
++;
87 offsets
[1]=sourceIndex
++;
88 offsets
[2]=sourceIndex
++;
89 offsets
[3]=sourceIndex
++;
90 offsets
[4]=sourceIndex
++;
91 offsets
[5]=sourceIndex
++;
92 offsets
[6]=sourceIndex
++;
93 offsets
[7]=sourceIndex
++;
/* remainder loop: runs while targetCapacity is still positive */
100 while(targetCapacity
>0) {
105 /* write back the updated pointers */
106 pArgs
->source
=(const char *)source
;
107 pArgs
->target
=target
;
/* offsets for the remainder, then publish the advanced offsets pointer */
112 *offsets
++=sourceIndex
++;
115 pArgs
->offsets
=offsets
;
119 /* This is a table-less and callback-less version of ucnv_MBCSSingleGetNextUChar(). */
/*
 * Reads the next ISO 8859-1 byte from pArgs->source.
 * When at least one byte is available, pArgs->source is advanced by one.
 * When the input is empty, *pErrorCode is set to U_INDEX_OUTOFBOUNDS_ERROR.
 * NOTE(review): the return statements are not visible in this listing;
 * confirm the returned values against the full source.
 */
120 static UChar32 U_CALLCONV
121 _Latin1GetNextUChar(UConverterToUnicodeArgs
*pArgs
,
122 UErrorCode
*pErrorCode
) {
123 const uint8_t *source
=(const uint8_t *)pArgs
->source
;
124 if(source
<(const uint8_t *)pArgs
->sourceLimit
) {
/* consume exactly one byte */
125 pArgs
->source
=(const char *)(source
+1);
129 /* no output because of empty input */
130 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
134 /* This is a table-less version of ucnv_MBCSSingleFromBMPWithOffsets(). */
/*
 * Converts UTF-16 code units from pArgs->source to single bytes in
 * pArgs->target. The function serves two converters: max is 0xff when the
 * converter uses _Latin1Data and 0x7f otherwise (US-ASCII).
 * Code units up to max are narrowed to uint8_t and copied. A code unit above
 * max stops the copy loop and leads to an error code:
 * U_ILLEGAL_CHAR_FOUND when cp is a surrogate, U_INVALID_CHAR_FOUND otherwise.
 * U_BUFFER_OVERFLOW_ERROR is set when no other error occurred, input remains
 * and the target has reached pArgs->targetLimit.
 * Offsets are filled with consecutive sourceIndex values, and the updated
 * source, target and offsets pointers are written back to pArgs.
 * NOTE(review): several declarations, loop heads, labels and the closing
 * preprocessor line are not visible in this listing.
 */
135 static void U_CALLCONV
136 _Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
137 UErrorCode
*pErrorCode
) {
139 const UChar
*source
, *sourceLimit
;
140 uint8_t *target
, *oldTarget
;
141 int32_t targetCapacity
, length
;
149 /* set up the local pointers */
150 cnv
=pArgs
->converter
;
151 source
=pArgs
->source
;
152 sourceLimit
=pArgs
->sourceLimit
;
/* oldTarget remembers the start so that offsets can be counted later */
153 target
=oldTarget
=(uint8_t *)pArgs
->target
;
154 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
155 offsets
=pArgs
->offsets
;
/* the shared-data pointer decides which of the two code pages is served */
157 if(cnv
->sharedData
==&_Latin1Data
) {
158 max
=0xff; /* Latin-1 */
160 max
=0x7f; /* US-ASCII */
163 /* get the converter state from UConverter */
166 /* sourceIndex=-1 if the current character began in the previous buffer */
167 sourceIndex
= cp
==0 ? 0 : -1;
170 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
171 * for the minimum of the sourceLength and targetCapacity
173 length
=(int32_t)(sourceLimit
-source
);
174 if(length
<targetCapacity
) {
175 targetCapacity
=length
;
178 /* conversion loop */
/* a nonzero cp is state carried over from an earlier call */
179 if(cp
!=0 && targetCapacity
>0) {
183 #if LATIN1_UNROLL_FROM_UNICODE
184 /* unroll the loop with the most common case */
185 if(targetCapacity
>=16) {
186 int32_t count
, loops
;
/* number of full blocks of 16 code units */
189 loops
=count
=targetCapacity
>>4;
/*
 * Each pair of statements copies one code unit and ORs it into oredChars,
 * so one test after the 16 copies can detect any out-of-range unit.
 */
191 oredChars
=u
=*source
++;
192 *target
++=(uint8_t)u
;
193 oredChars
|=u
=*source
++;
194 *target
++=(uint8_t)u
;
195 oredChars
|=u
=*source
++;
196 *target
++=(uint8_t)u
;
197 oredChars
|=u
=*source
++;
198 *target
++=(uint8_t)u
;
199 oredChars
|=u
=*source
++;
200 *target
++=(uint8_t)u
;
201 oredChars
|=u
=*source
++;
202 *target
++=(uint8_t)u
;
203 oredChars
|=u
=*source
++;
204 *target
++=(uint8_t)u
;
205 oredChars
|=u
=*source
++;
206 *target
++=(uint8_t)u
;
207 oredChars
|=u
=*source
++;
208 *target
++=(uint8_t)u
;
209 oredChars
|=u
=*source
++;
210 *target
++=(uint8_t)u
;
211 oredChars
|=u
=*source
++;
212 *target
++=(uint8_t)u
;
213 oredChars
|=u
=*source
++;
214 *target
++=(uint8_t)u
;
215 oredChars
|=u
=*source
++;
216 *target
++=(uint8_t)u
;
217 oredChars
|=u
=*source
++;
218 *target
++=(uint8_t)u
;
219 oredChars
|=u
=*source
++;
220 *target
++=(uint8_t)u
;
221 oredChars
|=u
=*source
++;
222 *target
++=(uint8_t)u
;
224 /* were all 16 entries really valid? */
226 /* no, return to the first of these 16 */
/* account for the code units handled in blocks of 16 */
233 targetCapacity
-=16*count
;
/* 16 offsets per block */
238 *offsets
++=sourceIndex
++;
239 *offsets
++=sourceIndex
++;
240 *offsets
++=sourceIndex
++;
241 *offsets
++=sourceIndex
++;
242 *offsets
++=sourceIndex
++;
243 *offsets
++=sourceIndex
++;
244 *offsets
++=sourceIndex
++;
245 *offsets
++=sourceIndex
++;
246 *offsets
++=sourceIndex
++;
247 *offsets
++=sourceIndex
++;
248 *offsets
++=sourceIndex
++;
249 *offsets
++=sourceIndex
++;
250 *offsets
++=sourceIndex
++;
251 *offsets
++=sourceIndex
++;
252 *offsets
++=sourceIndex
++;
253 *offsets
++=sourceIndex
++;
260 /* conversion loop */
/* stops at the first code unit above max, which has already been consumed */
262 while(targetCapacity
>0 && (c
=*source
++)<=max
) {
263 /* convert the Unicode code point */
264 *target
++=(uint8_t)c
;
/* classify the code point that could not be converted */
270 if(!U_IS_SURROGATE(cp
)) {
271 /* callback(unassigned) */
272 } else if(U_IS_SURROGATE_LEAD(cp
)) {
274 if(source
<sourceLimit
) {
275 /* test the following code unit */
277 if(U16_IS_TRAIL(trail
)) {
279 cp
=U16_GET_SUPPLEMENTARY(cp
, trail
);
280 /* this codepage does not map supplementary code points */
281 /* callback(unassigned) */
283 /* this is an unmatched lead code unit (1st surrogate) */
284 /* callback(illegal) */
292 /* this is an unmatched trail code unit (2nd surrogate) */
293 /* callback(illegal) */
/* a cp that is still a surrogate here was unpaired, hence illegal */
296 *pErrorCode
= U_IS_SURROGATE(cp
) ? U_ILLEGAL_CHAR_FOUND
: U_INVALID_CHAR_FOUND
;
301 /* set offsets since the start */
/* number of bytes written since oldTarget */
303 size_t count
=target
-oldTarget
;
305 *offsets
++=sourceIndex
++;
/* report overflow only when nothing else failed and input remains */
310 if(U_SUCCESS(*pErrorCode
) && source
<sourceLimit
&& target
>=(uint8_t *)pArgs
->targetLimit
) {
312 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
315 /* write back the updated pointers */
316 pArgs
->source
=source
;
317 pArgs
->target
=(char *)target
;
318 pArgs
->offsets
=offsets
;
321 /* Convert UTF-8 to Latin-1. Adapted from ucnv_SBCSFromUTF8(). */
/*
 * Converts UTF-8 bytes from pToUArgs->source directly to Latin-1 bytes in
 * pFromUArgs->target.
 * Handled inline: single bytes copied as they are, and two-byte sequences
 * whose lead byte is 0xc2 or 0xc3 followed by a trail byte in 0x80..0xbf,
 * which produce ((lead&3)<<6)|(trail-0x80), i.e. U+0080..U+00FF.
 * Any other input sets *pErrorCode to U_USING_DEFAULT_WARNING so that the
 * caller falls back to the pivoting implementation.
 * U_BUFFER_OVERFLOW_ERROR is set when the target has no room.
 * A lead byte pending in utf8->toUnicodeStatus from an earlier call is
 * completed first; a lead byte left at the very end of the input is stored
 * in the UTF-8 converter state for the next call.
 * NOTE(review): the declarations of utf8, target, c, b and t1 and several
 * interior lines are not visible in this listing.
 */
322 static void U_CALLCONV
323 ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs
*pFromUArgs
,
324 UConverterToUnicodeArgs
*pToUArgs
,
325 UErrorCode
*pErrorCode
) {
327 const uint8_t *source
, *sourceLimit
;
329 int32_t targetCapacity
;
334 /* set up the local pointers */
335 utf8
=pToUArgs
->converter
;
336 source
=(uint8_t *)pToUArgs
->source
;
337 sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
;
338 target
=(uint8_t *)pFromUArgs
->target
;
339 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-pFromUArgs
->target
);
341 /* get the converter state from the UTF-8 UConverter */
342 c
=(UChar32
)utf8
->toUnicodeStatus
;
/* a nonzero status is a lead byte saved by an earlier call */
343 if(c
!=0 && source
<sourceLimit
) {
344 if(targetCapacity
==0) {
345 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/* t1 is the trail byte minus 0x80; at most 0x3f means a valid trail byte */
347 } else if(c
>=0xc2 && c
<=0xc3 && (t1
=(uint8_t)(*source
-0x80)) <= 0x3f) {
349 *target
++=(uint8_t)(((c
&3)<<6)|t1
);
/* the pending sequence has been completed */
352 utf8
->toUnicodeStatus
=0;
355 /* complicated, illegal or unmappable input: fall back to the pivoting implementation */
356 *pErrorCode
=U_USING_DEFAULT_WARNING
;
362 * Make sure that the last byte sequence before sourceLimit is complete
363 * or runs into a lead byte.
364 * In the conversion loop compare source with sourceLimit only once
365 * per multi-byte character.
366 * For Latin-1, adjust sourceLimit only for 1 trail byte because
367 * the conversion loop handles at most 2-byte sequences.
369 if(source
<sourceLimit
&& U8_IS_LEAD(*(sourceLimit
-1))) {
373 /* conversion loop */
374 while(source
<sourceLimit
) {
375 if(targetCapacity
>0) {
379 *target
++=(uint8_t)b
;
381 } else if( /* handle U+0080..U+00FF inline */
382 b
>=0xc2 && b
<=0xc3 &&
383 (t1
=(uint8_t)(*source
-0x80)) <= 0x3f
386 *target
++=(uint8_t)(((b
&3)<<6)|t1
);
389 /* complicated, illegal or unmappable input: fall back to the pivoting implementation */
/* source-1 hands the already consumed byte b back to the fallback */
390 pToUArgs
->source
=(char *)(source
-1);
391 pFromUArgs
->target
=(char *)target
;
392 *pErrorCode
=U_USING_DEFAULT_WARNING
;
397 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
403 * The sourceLimit may have been adjusted before the conversion loop
404 * to stop before a truncated sequence.
405 * If so, then collect the truncated sequence now.
406 * For Latin-1, there is at most exactly one lead byte because of the
407 * smaller sourceLimit adjustment logic.
/* the condition also restores sourceLimit to the real limit from pToUArgs */
409 if(U_SUCCESS(*pErrorCode
) && source
<(sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
)) {
410 utf8
->toUnicodeStatus
=utf8
->toUBytes
[0]=b
=*source
++;
/* mode holds the total byte count of the pending sequence */
412 utf8
->mode
=U8_COUNT_TRAIL_BYTES(b
)+1;
415 /* write back the updated pointers */
416 pToUArgs
->source
=(char *)source
;
417 pFromUArgs
->target
=(char *)target
;
/*
 * Reports the code points handled by the Latin-1 converter by adding the
 * range 0..0xff through sa->addRange().
 * NOTE(review): the declaration of the sa parameter is not visible in this
 * listing, and cnv, which and pErrorCode are not used in the visible lines.
 */
420 static void U_CALLCONV
421 _Latin1GetUnicodeSet(const UConverter
*cnv
,
423 UConverterUnicodeSet which
,
424 UErrorCode
*pErrorCode
) {
428 sa
->addRange(sa
->set
, 0, 0xff);
/*
 * Function table for the Latin-1 converter.
 * The to-Unicode and from-Unicode functions are each listed twice.
 * NOTE(review): presumably the paired entries are the plain and the
 * with-offsets slots of UConverterImpl; confirm against its declaration.
 * Other table entries are not visible in this listing.
 */
433 static const UConverterImpl _Latin1Impl
={
443 _Latin1ToUnicodeWithOffsets
,
444 _Latin1ToUnicodeWithOffsets
,
445 _Latin1FromUnicodeWithOffsets
,
446 _Latin1FromUnicodeWithOffsets
,
453 _Latin1GetUnicodeSet
,
/*
 * Static description of the Latin-1 converter; the first field is the
 * structure size.
 * NOTE(review): 819 is presumably the IBM code page number and 0x1a
 * presumably the substitution byte; confirm the field order against the
 * UConverterStaticData declaration. Some initializer lines are not visible
 * in this listing.
 */
459 static const UConverterStaticData _Latin1StaticData
={
460 sizeof(UConverterStaticData
),
462 819, UCNV_IBM
, UCNV_LATIN_1
, 1, 1,
463 { 0x1a, 0, 0, 0 }, 1, FALSE
, FALSE
,
466 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared data object for the Latin-1 converter, combining its static data
 * and its function table. _Latin1FromUnicodeWithOffsets() compares
 * cnv->sharedData with the address of this object to choose between the
 * Latin-1 and US-ASCII limits.
 */
469 const UConverterSharedData _Latin1Data
=
470 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Latin1StaticData
, &_Latin1Impl
);
472 /* US-ASCII ----------------------------------------------------------------- */
475 /* This is a table-less version of ucnv_MBCSSingleToBMPWithOffsets(). */
/*
 * Converts US-ASCII bytes from pArgs->source to pArgs->target.
 * Bytes up to 0x7f are copied 1:1. A byte above 0x7f stops the copy loop
 * and sets *pErrorCode to U_ILLEGAL_CHAR_FOUND.
 * Otherwise U_BUFFER_OVERFLOW_ERROR is set when input remains and the target
 * has reached pArgs->targetLimit.
 * Offsets are filled with consecutive sourceIndex values, and the updated
 * source, target and offsets pointers are written back to pArgs.
 * NOTE(review): the declarations of offsets, sourceIndex, c and oredChars
 * and several interior lines are not visible in this listing.
 */
476 static void U_CALLCONV
477 _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
478 UErrorCode
*pErrorCode
) {
479 const uint8_t *source
, *sourceLimit
;
480 UChar
*target
, *oldTarget
;
481 int32_t targetCapacity
, length
;
488 /* set up the local pointers */
489 source
=(const uint8_t *)pArgs
->source
;
490 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
/* oldTarget remembers the start so that offsets can be counted later */
491 target
=oldTarget
=pArgs
->target
;
492 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
493 offsets
=pArgs
->offsets
;
495 /* sourceIndex=-1 if the current character began in the previous buffer */
499 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
500 * for the minimum of the sourceLength and targetCapacity
502 length
=(int32_t)(sourceLimit
-source
);
503 if(length
<targetCapacity
) {
504 targetCapacity
=length
;
507 if(targetCapacity
>=8) {
508 /* This loop is unrolled for speed and improved pipelining. */
509 int32_t count
, loops
;
/* number of full blocks of 8 bytes */
512 loops
=count
=targetCapacity
>>3;
/*
 * Each statement copies one byte and ORs it into oredChars, so one test
 * after the 8 copies can detect any byte above 0x7f.
 */
514 oredChars
=target
[0]=source
[0];
515 oredChars
|=target
[1]=source
[1];
516 oredChars
|=target
[2]=source
[2];
517 oredChars
|=target
[3]=source
[3];
518 oredChars
|=target
[4]=source
[4];
519 oredChars
|=target
[5]=source
[5];
520 oredChars
|=target
[6]=source
[6];
521 oredChars
|=target
[7]=source
[7];
523 /* were all 8 entries really valid? */
525 /* no, return to the first of these 8 */
/* account for the bytes handled in blocks of 8 */
532 targetCapacity
-=count
*8;
/* 8 offsets per block */
537 offsets
[0]=sourceIndex
++;
538 offsets
[1]=sourceIndex
++;
539 offsets
[2]=sourceIndex
++;
540 offsets
[3]=sourceIndex
++;
541 offsets
[4]=sourceIndex
++;
542 offsets
[5]=sourceIndex
++;
543 offsets
[6]=sourceIndex
++;
544 offsets
[7]=sourceIndex
++;
551 /* conversion loop */
/* stops at the first byte above 0x7f, which has already been consumed */
553 while(targetCapacity
>0 && (c
=*source
++)<=0x7f) {
559 /* callback(illegal); copy the current bytes to toUBytes[] */
560 UConverter
*cnv
=pArgs
->converter
;
563 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
564 } else if(source
<sourceLimit
&& target
>=pArgs
->targetLimit
) {
566 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
569 /* set offsets since the start */
/* number of units written since oldTarget */
571 size_t count
=target
-oldTarget
;
573 *offsets
++=sourceIndex
++;
578 /* write back the updated pointers */
579 pArgs
->source
=(const char *)source
;
580 pArgs
->target
=target
;
581 pArgs
->offsets
=offsets
;
584 /* This is a table-less version of ucnv_MBCSSingleGetNextUChar(). */
/*
 * Reads the next US-ASCII byte from pArgs->source.
 * When input is available, the advanced source pointer is written back to
 * pArgs. On the illegal-byte path *pErrorCode is set to
 * U_ILLEGAL_CHAR_FOUND; on empty input it is set to
 * U_INDEX_OUTOFBOUNDS_ERROR.
 * NOTE(review): the byte read, the range test and the return statements
 * are not visible in this listing; confirm against the full source.
 */
585 static UChar32 U_CALLCONV
586 _ASCIIGetNextUChar(UConverterToUnicodeArgs
*pArgs
,
587 UErrorCode
*pErrorCode
) {
588 const uint8_t *source
;
591 source
=(const uint8_t *)pArgs
->source
;
592 if(source
<(const uint8_t *)pArgs
->sourceLimit
) {
594 pArgs
->source
=(const char *)source
;
/* error path: the converter object is fetched for its error state */
598 UConverter
*cnv
=pArgs
->converter
;
601 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
606 /* no output because of empty input */
607 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
611 /* "Convert" UTF-8 to US-ASCII: Validate and copy. */
/*
 * Copies UTF-8 input from pToUArgs->source to pFromUArgs->target as long as
 * every byte is at most 0x7f.
 * Sets *pErrorCode to U_USING_DEFAULT_WARNING, meaning the standard
 * converter should take over, when the UTF-8 converter holds a partial
 * character (toUnicodeStatus is nonzero) or when a byte above 0x7f is met.
 * Otherwise U_BUFFER_OVERFLOW_ERROR is set when input remains and the target
 * has reached pFromUArgs->targetLimit.
 * The updated source and target pointers are written back.
 * NOTE(review): the declarations of target, c and oredChars and several
 * interior lines are not visible in this listing.
 */
612 static void U_CALLCONV
613 ucnv_ASCIIFromUTF8(UConverterFromUnicodeArgs
*pFromUArgs
,
614 UConverterToUnicodeArgs
*pToUArgs
,
615 UErrorCode
*pErrorCode
) {
616 const uint8_t *source
, *sourceLimit
;
618 int32_t targetCapacity
, length
;
622 if(pToUArgs
->converter
->toUnicodeStatus
!=0) {
623 /* no handling of partial UTF-8 characters here, fall back to pivoting */
624 *pErrorCode
=U_USING_DEFAULT_WARNING
;
628 /* set up the local pointers */
629 source
=(const uint8_t *)pToUArgs
->source
;
630 sourceLimit
=(const uint8_t *)pToUArgs
->sourceLimit
;
631 target
=(uint8_t *)pFromUArgs
->target
;
632 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-pFromUArgs
->target
);
635 * since the conversion here is 1:1 uint8_t:uint8_t, we need only one counter
636 * for the minimum of the sourceLength and targetCapacity
638 length
=(int32_t)(sourceLimit
-source
);
639 if(length
<targetCapacity
) {
640 targetCapacity
=length
;
643 /* unroll the loop with the most common case */
644 if(targetCapacity
>=16) {
645 int32_t count
, loops
;
/* number of full blocks of 16 bytes */
648 loops
=count
=targetCapacity
>>4;
/*
 * Each statement copies one byte and ORs it into oredChars, so one test
 * after the 16 copies can detect any byte above 0x7f.
 */
650 oredChars
=*target
++=*source
++;
651 oredChars
|=*target
++=*source
++;
652 oredChars
|=*target
++=*source
++;
653 oredChars
|=*target
++=*source
++;
654 oredChars
|=*target
++=*source
++;
655 oredChars
|=*target
++=*source
++;
656 oredChars
|=*target
++=*source
++;
657 oredChars
|=*target
++=*source
++;
658 oredChars
|=*target
++=*source
++;
659 oredChars
|=*target
++=*source
++;
660 oredChars
|=*target
++=*source
++;
661 oredChars
|=*target
++=*source
++;
662 oredChars
|=*target
++=*source
++;
663 oredChars
|=*target
++=*source
++;
664 oredChars
|=*target
++=*source
++;
665 oredChars
|=*target
++=*source
++;
667 /* were all 16 entries really valid? */
669 /* no, return to the first of these 16 */
/* account for the bytes handled in blocks of 16 */
676 targetCapacity
-=16*count
;
679 /* conversion loop */
/* the byte is only peeked here, so a non-ASCII byte stays unconsumed */
681 while(targetCapacity
>0 && (c
=*source
)<=0x7f) {
688 /* non-ASCII character, handle in standard converter */
689 *pErrorCode
=U_USING_DEFAULT_WARNING
;
690 } else if(source
<sourceLimit
&& target
>=(const uint8_t *)pFromUArgs
->targetLimit
) {
692 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
695 /* write back the updated pointers */
696 pToUArgs
->source
=(const char *)source
;
697 pFromUArgs
->target
=(char *)target
;
/*
 * Reports the code points handled by the US-ASCII converter by adding the
 * range 0..0x7f through sa->addRange().
 * NOTE(review): the declaration of the sa parameter is not visible in this
 * listing, and cnv, which and pErrorCode are not used in the visible lines.
 */
700 static void U_CALLCONV
701 _ASCIIGetUnicodeSet(const UConverter
*cnv
,
703 UConverterUnicodeSet which
,
704 UErrorCode
*pErrorCode
) {
708 sa
->addRange(sa
->set
, 0, 0x7f);
/*
 * Function table for the US-ASCII converter.
 * The from-Unicode direction reuses _Latin1FromUnicodeWithOffsets(), which
 * selects the 0x7f limit when the converter does not use _Latin1Data.
 * NOTE(review): presumably the paired entries are the plain and the
 * with-offsets slots of UConverterImpl; confirm against its declaration.
 * Other table entries are not visible in this listing.
 */
712 static const UConverterImpl _ASCIIImpl
={
722 _ASCIIToUnicodeWithOffsets
,
723 _ASCIIToUnicodeWithOffsets
,
724 _Latin1FromUnicodeWithOffsets
,
725 _Latin1FromUnicodeWithOffsets
,
/*
 * Static description of the US-ASCII converter; the first field is the
 * structure size.
 * NOTE(review): 367 is presumably the IBM code page number and 0x1a
 * presumably the substitution byte; confirm the field order against the
 * UConverterStaticData declaration. Some initializer lines are not visible
 * in this listing.
 */
738 static const UConverterStaticData _ASCIIStaticData
={
739 sizeof(UConverterStaticData
),
741 367, UCNV_IBM
, UCNV_US_ASCII
, 1, 1,
742 { 0x1a, 0, 0, 0 }, 1, FALSE
, FALSE
,
745 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared data object for the US-ASCII converter, combining its static data
 * and its function table.
 */
748 const UConverterSharedData _ASCIIData
=
749 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ASCIIStaticData
, &_ASCIIImpl
);