2 **********************************************************************
3 * Copyright (C) 2000-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnvlat1.cpp
8 * tab size: 8 (not used)
11 * created on: 2000feb07
12 * created by: Markus W. Scherer
15 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_CONVERSION
19 #include "unicode/ucnv.h"
20 #include "unicode/uset.h"
24 /* control optimizations according to the platform */
/*
 * Compile-time switches for the hand-unrolled fast paths.
 * Each macro is tested with #if inside the matching conversion function,
 * directly before a branch that is taken when at least 16 units remain
 * (targetCapacity>=16).
 */
25 #define LATIN1_UNROLL_TO_UNICODE 1
26 #define LATIN1_UNROLL_FROM_UNICODE 1
27 #define ASCII_UNROLL_TO_UNICODE 1
29 /* ISO 8859-1 --------------------------------------------------------------- */
31 /* This is a table-less and callback-less version of ucnv_MBCSSingleToBMPWithOffsets(). */
/*
 * _Latin1ToUnicodeWithOffsets: to-Unicode conversion for ISO 8859-1.
 * The conversion is 1:1 between input bytes and output UChars, so a single
 * counter (the smaller of input length and target capacity) drives the work.
 *
 * pArgs      - conversion arguments; source, target, targetLimit,
 *              sourceLimit and offsets are read, and source, target and
 *              offsets are written back before returning.
 * pErrorCode - receives U_BUFFER_OVERFLOW_ERROR on the "target will be full"
 *              path.
 *
 * NOTE(review): this listing appears to have lost lines - the return type,
 * the declarations of target, offsets, sourceIndex, count and loops, the
 * statements that copy the data, and the closing braces are not visible.
 * Confirm every detail against the upstream file before relying on it.
 */
33 _Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
34 UErrorCode
*pErrorCode
) {
35 const uint8_t *source
;
37 int32_t targetCapacity
, length
;
42 /* set up the local pointers */
43 source
=(const uint8_t *)pArgs
->source
;
45 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
46 offsets
=pArgs
->offsets
;
51 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
52 * for the minimum of the sourceLength and targetCapacity
54 length
=(int32_t)((const uint8_t *)pArgs
->sourceLimit
-source
);
/* all input fits: shrink the work count to the input length */
55 if(length
<=targetCapacity
) {
56 targetCapacity
=length
;
/* NOTE(review): presumably the else branch (the else keyword is not visible here) - confirm */
58 /* target will be full */
59 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
60 length
=targetCapacity
;
63 #if LATIN1_UNROLL_TO_UNICODE
64 if(targetCapacity
>=16) {
/* count and loops hold the number of complete 16-unit groups */
67 loops
=count
=targetCapacity
>>4;
/* the low four bits are what is left for the tail loops below */
68 length
=targetCapacity
&=0xf;
/* sixteen consecutive source indexes are recorded, one per unit of a group */
90 *offsets
++=sourceIndex
++;
91 *offsets
++=sourceIndex
++;
92 *offsets
++=sourceIndex
++;
93 *offsets
++=sourceIndex
++;
94 *offsets
++=sourceIndex
++;
95 *offsets
++=sourceIndex
++;
96 *offsets
++=sourceIndex
++;
97 *offsets
++=sourceIndex
++;
98 *offsets
++=sourceIndex
++;
99 *offsets
++=sourceIndex
++;
100 *offsets
++=sourceIndex
++;
101 *offsets
++=sourceIndex
++;
102 *offsets
++=sourceIndex
++;
103 *offsets
++=sourceIndex
++;
104 *offsets
++=sourceIndex
++;
105 *offsets
++=sourceIndex
++;
111 /* conversion loop */
112 while(targetCapacity
>0) {
117 /* write back the updated pointers */
118 pArgs
->source
=(const char *)source
;
119 pArgs
->target
=target
;
/* NOTE(review): presumably inside a loop over the remaining length, guarded by an offsets check - not visible here */
124 *offsets
++=sourceIndex
++;
127 pArgs
->offsets
=offsets
;
131 /* This is a table-less and callback-less version of ucnv_MBCSSingleGetNextUChar(). */
/*
 * _Latin1GetNextUChar: single-character to-Unicode step for ISO 8859-1.
 * When at least one byte is available (source < sourceLimit) pArgs->source
 * is advanced by exactly one byte; with empty input *pErrorCode is set to
 * U_INDEX_OUTOFBOUNDS_ERROR.
 *
 * NOTE(review): the return type and the return statements are not visible in
 * this listing; presumably the consumed byte is returned as the code point -
 * confirm against the upstream file.
 */
133 _Latin1GetNextUChar(UConverterToUnicodeArgs
*pArgs
,
134 UErrorCode
*pErrorCode
) {
135 const uint8_t *source
=(const uint8_t *)pArgs
->source
;
136 if(source
<(const uint8_t *)pArgs
->sourceLimit
) {
/* consume exactly one input byte */
137 pArgs
->source
=(const char *)(source
+1);
141 /* no output because of empty input */
142 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
146 /* This is a table-less version of ucnv_MBCSSingleFromBMPWithOffsets(). */
/*
 * _Latin1FromUnicodeWithOffsets: from-Unicode conversion producing one byte
 * per UChar (the conversion is 1:1, see the note further down).
 *
 * The function is listed in both _Latin1Impl and _ASCIIImpl. The largest code
 * unit it converts directly (max) is 0xff when the converter's sharedData is
 * &_Latin1Data; the 0x7f (US-ASCII) assignment presumably sits in an else
 * branch that is not visible here.
 *
 * pArgs      - conversion arguments; converter, source, sourceLimit, target,
 *              targetLimit and offsets are read, and source, target and
 *              offsets are written back before returning.
 * pErrorCode - set to U_ILLEGAL_CHAR_FOUND when cp is a surrogate and to
 *              U_INVALID_CHAR_FOUND otherwise on the error path; set to
 *              U_BUFFER_OVERFLOW_ERROR when no error is pending, input
 *              remains and target has reached pArgs->targetLimit.
 *
 * NOTE(review): this listing appears to have lost lines - the return type,
 * the declarations of cnv, offsets, max, cp, c, u, oredChars, trail and
 * sourceIndex, several branches and the closing braces are not visible.
 * Confirm against the upstream file.
 */
148 _Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
149 UErrorCode
*pErrorCode
) {
151 const UChar
*source
, *sourceLimit
;
152 uint8_t *target
, *oldTarget
;
153 int32_t targetCapacity
, length
;
161 /* set up the local pointers */
162 cnv
=pArgs
->converter
;
163 source
=pArgs
->source
;
164 sourceLimit
=pArgs
->sourceLimit
;
/* oldTarget remembers the start so the number of bytes written can be computed later */
165 target
=oldTarget
=(uint8_t *)pArgs
->target
;
166 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
167 offsets
=pArgs
->offsets
;
/* one implementation serves two converters; only the upper bound differs */
169 if(cnv
->sharedData
==&_Latin1Data
) {
170 max
=0xff; /* Latin-1 */
172 max
=0x7f; /* US-ASCII */
175 /* get the converter state from UConverter */
178 /* sourceIndex=-1 if the current character began in the previous buffer */
179 sourceIndex
= cp
==0 ? 0 : -1;
182 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
183 * for the minimum of the sourceLength and targetCapacity
185 length
=(int32_t)(sourceLimit
-source
);
186 if(length
<targetCapacity
) {
187 targetCapacity
=length
;
190 /* conversion loop */
191 if(cp
!=0 && targetCapacity
>0) {
195 #if LATIN1_UNROLL_FROM_UNICODE
196 /* unroll the loop with the most common case */
197 if(targetCapacity
>=16) {
198 int32_t count
, loops
;
/* number of complete 16-unit groups */
201 loops
=count
=targetCapacity
>>4;
/*
 * Sixteen units are copied unconditionally (truncated to uint8_t) while
 * their values are OR-ed into oredChars, so the whole group can be
 * validated afterwards ("were all 16 entries really valid?").
 */
203 oredChars
=u
=*source
++;
204 *target
++=(uint8_t)u
;
205 oredChars
|=u
=*source
++;
206 *target
++=(uint8_t)u
;
207 oredChars
|=u
=*source
++;
208 *target
++=(uint8_t)u
;
209 oredChars
|=u
=*source
++;
210 *target
++=(uint8_t)u
;
211 oredChars
|=u
=*source
++;
212 *target
++=(uint8_t)u
;
213 oredChars
|=u
=*source
++;
214 *target
++=(uint8_t)u
;
215 oredChars
|=u
=*source
++;
216 *target
++=(uint8_t)u
;
217 oredChars
|=u
=*source
++;
218 *target
++=(uint8_t)u
;
219 oredChars
|=u
=*source
++;
220 *target
++=(uint8_t)u
;
221 oredChars
|=u
=*source
++;
222 *target
++=(uint8_t)u
;
223 oredChars
|=u
=*source
++;
224 *target
++=(uint8_t)u
;
225 oredChars
|=u
=*source
++;
226 *target
++=(uint8_t)u
;
227 oredChars
|=u
=*source
++;
228 *target
++=(uint8_t)u
;
229 oredChars
|=u
=*source
++;
230 *target
++=(uint8_t)u
;
231 oredChars
|=u
=*source
++;
232 *target
++=(uint8_t)u
;
233 oredChars
|=u
=*source
++;
234 *target
++=(uint8_t)u
;
236 /* were all 16 entries really valid? */
238 /* no, return to the first of these 16 */
/* NOTE(review): the statement that recomputes count before this subtraction is not visible here */
245 targetCapacity
-=16*count
;
/* sixteen consecutive source indexes are recorded, one per unit of a group */
250 *offsets
++=sourceIndex
++;
251 *offsets
++=sourceIndex
++;
252 *offsets
++=sourceIndex
++;
253 *offsets
++=sourceIndex
++;
254 *offsets
++=sourceIndex
++;
255 *offsets
++=sourceIndex
++;
256 *offsets
++=sourceIndex
++;
257 *offsets
++=sourceIndex
++;
258 *offsets
++=sourceIndex
++;
259 *offsets
++=sourceIndex
++;
260 *offsets
++=sourceIndex
++;
261 *offsets
++=sourceIndex
++;
262 *offsets
++=sourceIndex
++;
263 *offsets
++=sourceIndex
++;
264 *offsets
++=sourceIndex
++;
265 *offsets
++=sourceIndex
++;
272 /* conversion loop */
/*
 * The loop ends when the capacity is used up or at the first code unit above
 * max; in the latter case source has already been advanced past that unit.
 */
274 while(targetCapacity
>0 && (c
=*source
++)<=max
) {
275 /* convert the Unicode code point */
276 *target
++=(uint8_t)c
;
282 if(!U_IS_SURROGATE(cp
)) {
283 /* callback(unassigned) */
284 } else if(U_IS_SURROGATE_LEAD(cp
)) {
286 if(source
<sourceLimit
) {
287 /* test the following code unit */
289 if(U16_IS_TRAIL(trail
)) {
291 cp
=U16_GET_SUPPLEMENTARY(cp
, trail
);
292 /* this codepage does not map supplementary code points */
293 /* callback(unassigned) */
295 /* this is an unmatched lead code unit (1st surrogate) */
296 /* callback(illegal) */
304 /* this is an unmatched trail code unit (2nd surrogate) */
305 /* callback(illegal) */
/* a cp that is still a surrogate here is reported as illegal, any other cp as invalid */
308 *pErrorCode
= U_IS_SURROGATE(cp
) ? U_ILLEGAL_CHAR_FOUND
: U_INVALID_CHAR_FOUND
;
313 /* set offsets since the start */
/* number of bytes written by this call */
315 size_t count
=target
-oldTarget
;
317 *offsets
++=sourceIndex
++;
/* report overflow only when nothing else failed, input remains and the target is full */
322 if(U_SUCCESS(*pErrorCode
) && source
<sourceLimit
&& target
>=(uint8_t *)pArgs
->targetLimit
) {
324 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
327 /* write back the updated pointers */
328 pArgs
->source
=source
;
329 pArgs
->target
=(char *)target
;
330 pArgs
->offsets
=offsets
;
/*
 * _Latin1GetUnicodeSet: adds the code point range 0..0xff to a set by calling
 * sa->addRange on sa->set.
 * cnv, which and pErrorCode are not used by the visible statement.
 *
 * NOTE(review): the return type and the declaration of the sa parameter are
 * not visible in this listing - confirm against the upstream file.
 */
334 _Latin1GetUnicodeSet(const UConverter
*cnv
,
336 UConverterUnicodeSet which
,
337 UErrorCode
*pErrorCode
) {
338 sa
->addRange(sa
->set
, 0, 0xff);
/*
 * Function table for the ISO 8859-1 converter.
 * The to-Unicode and from-Unicode functions are each listed twice.
 * NOTE(review): presumably these fill the plain and the with-offsets slots of
 * UConverterImpl; the other members and the closing brace are not visible in
 * this listing - confirm against the struct definition.
 */
341 static const UConverterImpl _Latin1Impl
={
351 _Latin1ToUnicodeWithOffsets
,
352 _Latin1ToUnicodeWithOffsets
,
353 _Latin1FromUnicodeWithOffsets
,
354 _Latin1FromUnicodeWithOffsets
,
/*
 * Static description of the ISO 8859-1 converter: structure size, the values
 * 819 / UCNV_IBM / UCNV_LATIN_1, two 1s, a four-byte array starting with 0x1a
 * followed by 1, two FALSE flags and a zeroed reserved area.
 * NOTE(review): 819 is presumably the IBM code page number, the two 1s the
 * minimum and maximum bytes per character, and 0x1a the substitution byte
 * with its length - member names are not visible here, confirm against
 * UConverterStaticData. Some initializer lines and the closing brace are
 * missing from this listing.
 */
364 static const UConverterStaticData _Latin1StaticData
={
365 sizeof(UConverterStaticData
),
367 819, UCNV_IBM
, UCNV_LATIN_1
, 1, 1,
368 { 0x1a, 0, 0, 0 }, 1, FALSE
, FALSE
,
371 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data object for the ISO 8859-1 converter; it ties together
 * _Latin1StaticData and _Latin1Impl. _Latin1FromUnicodeWithOffsets compares
 * cnv->sharedData against the address of this object to select the Latin-1
 * upper bound.
 * NOTE(review): the meaning of the ~((uint32_t) 0), NULL and FALSE
 * initializers depends on the member order of UConverterSharedData, which is
 * not visible here; the rest of the initializer is missing from this listing.
 */
374 const UConverterSharedData _Latin1Data
={
375 sizeof(UConverterSharedData
), ~((uint32_t) 0),
376 NULL
, NULL
, &_Latin1StaticData
, FALSE
, &_Latin1Impl
,
380 /* US-ASCII ----------------------------------------------------------------- */
382 /* This is a table-less version of ucnv_MBCSSingleToBMPWithOffsets(). */
/*
 * _ASCIIToUnicodeWithOffsets: to-Unicode conversion for US-ASCII.
 * The conversion is 1:1 between input bytes and output UChars; bytes above
 * 0x7f end the tail conversion loop.
 *
 * pArgs      - conversion arguments; source, sourceLimit, target,
 *              targetLimit, offsets and (on the illegal-byte path) converter
 *              are read, and source, target and offsets are written back
 *              before returning.
 * pErrorCode - set to U_ILLEGAL_CHAR_FOUND on the illegal-byte path, or to
 *              U_BUFFER_OVERFLOW_ERROR when input remains and target has
 *              reached pArgs->targetLimit.
 *
 * NOTE(review): this listing appears to have lost lines - the return type,
 * the declarations of offsets, sourceIndex, oredChars and c, several
 * branches and the closing braces are not visible. Confirm against the
 * upstream file.
 */
384 _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
385 UErrorCode
*pErrorCode
) {
386 const uint8_t *source
, *sourceLimit
;
387 UChar
*target
, *oldTarget
;
388 int32_t targetCapacity
, length
;
395 /* set up the local pointers */
396 source
=(const uint8_t *)pArgs
->source
;
397 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
/* oldTarget remembers the start so the number of UChars written can be computed later */
398 target
=oldTarget
=pArgs
->target
;
399 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
400 offsets
=pArgs
->offsets
;
402 /* sourceIndex=-1 if the current character began in the previous buffer */
406 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
407 * for the minimum of the sourceLength and targetCapacity
409 length
=(int32_t)(sourceLimit
-source
);
410 if(length
<targetCapacity
) {
411 targetCapacity
=length
;
414 #if ASCII_UNROLL_TO_UNICODE
415 /* unroll the loop with the most common case */
416 if(targetCapacity
>=16) {
417 int32_t count
, loops
;
/* number of complete 16-unit groups */
420 loops
=count
=targetCapacity
>>4;
/*
 * Sixteen bytes are copied unconditionally while their values are OR-ed
 * into oredChars, so the whole group can be validated afterwards
 * ("were all 16 entries really valid?").
 */
422 oredChars
=*target
++=*source
++;
423 oredChars
|=*target
++=*source
++;
424 oredChars
|=*target
++=*source
++;
425 oredChars
|=*target
++=*source
++;
426 oredChars
|=*target
++=*source
++;
427 oredChars
|=*target
++=*source
++;
428 oredChars
|=*target
++=*source
++;
429 oredChars
|=*target
++=*source
++;
430 oredChars
|=*target
++=*source
++;
431 oredChars
|=*target
++=*source
++;
432 oredChars
|=*target
++=*source
++;
433 oredChars
|=*target
++=*source
++;
434 oredChars
|=*target
++=*source
++;
435 oredChars
|=*target
++=*source
++;
436 oredChars
|=*target
++=*source
++;
437 oredChars
|=*target
++=*source
++;
439 /* were all 16 entries really valid? */
441 /* no, return to the first of these 16 */
/* NOTE(review): the statement that recomputes count before this subtraction is not visible here */
448 targetCapacity
-=16*count
;
/* sixteen consecutive source indexes are recorded, one per unit of a group */
453 *offsets
++=sourceIndex
++;
454 *offsets
++=sourceIndex
++;
455 *offsets
++=sourceIndex
++;
456 *offsets
++=sourceIndex
++;
457 *offsets
++=sourceIndex
++;
458 *offsets
++=sourceIndex
++;
459 *offsets
++=sourceIndex
++;
460 *offsets
++=sourceIndex
++;
461 *offsets
++=sourceIndex
++;
462 *offsets
++=sourceIndex
++;
463 *offsets
++=sourceIndex
++;
464 *offsets
++=sourceIndex
++;
465 *offsets
++=sourceIndex
++;
466 *offsets
++=sourceIndex
++;
467 *offsets
++=sourceIndex
++;
468 *offsets
++=sourceIndex
++;
475 /* conversion loop */
/*
 * The loop ends when the capacity is used up or at the first byte above
 * 0x7f; in the latter case source has already been advanced past that byte.
 */
477 while(targetCapacity
>0 && (c
=*source
++)<=0x7f) {
483 /* callback(illegal); copy the current bytes to toUBytes[] */
484 UConverter
*cnv
=pArgs
->converter
;
487 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
/* no illegal byte, but input remains and the target is full */
488 } else if(source
<sourceLimit
&& target
>=pArgs
->targetLimit
) {
490 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
493 /* set offsets since the start */
/* number of UChars written by this call */
495 size_t count
=target
-oldTarget
;
497 *offsets
++=sourceIndex
++;
502 /* write back the updated pointers */
503 pArgs
->source
=(const char *)source
;
504 pArgs
->target
=target
;
505 pArgs
->offsets
=offsets
;
508 /* This is a table-less version of ucnv_MBCSSingleGetNextUChar(). */
/*
 * _ASCIIGetNextUChar: single-character to-Unicode step for US-ASCII.
 * When input is available (source < sourceLimit) the local source pointer is
 * written back to pArgs->source; one path fetches pArgs->converter and sets
 * U_ILLEGAL_CHAR_FOUND. With empty input *pErrorCode is set to
 * U_INDEX_OUTOFBOUNDS_ERROR.
 *
 * NOTE(review): the return type, the statement that reads the byte and
 * advances source, the test that selects the illegal-byte path (presumably a
 * byte above 0x7f) and the return statements are not visible in this
 * listing - confirm against the upstream file.
 */
510 _ASCIIGetNextUChar(UConverterToUnicodeArgs
*pArgs
,
511 UErrorCode
*pErrorCode
) {
512 const uint8_t *source
;
515 source
=(const uint8_t *)pArgs
->source
;
516 if(source
<(const uint8_t *)pArgs
->sourceLimit
) {
518 pArgs
->source
=(const char *)source
;
522 UConverter
*cnv
=pArgs
->converter
;
525 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
530 /* no output because of empty input */
531 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
/*
 * _ASCIIGetUnicodeSet: adds the code point range 0..0x7f to a set by calling
 * sa->addRange on sa->set.
 * cnv, which and pErrorCode are not used by the visible statement.
 *
 * NOTE(review): the return type and the declaration of the sa parameter are
 * not visible in this listing - confirm against the upstream file.
 */
536 _ASCIIGetUnicodeSet(const UConverter
*cnv
,
538 UConverterUnicodeSet which
,
539 UErrorCode
*pErrorCode
) {
540 sa
->addRange(sa
->set
, 0, 0x7f);
/*
 * Function table for the US-ASCII converter.
 * To-Unicode uses the ASCII-specific function; from-Unicode reuses
 * _Latin1FromUnicodeWithOffsets, which picks its upper bound from the
 * converter's sharedData. Each function is listed twice.
 * NOTE(review): presumably these fill the plain and the with-offsets slots of
 * UConverterImpl; the other members and the closing brace are not visible in
 * this listing - confirm against the struct definition.
 */
543 static const UConverterImpl _ASCIIImpl
={
553 _ASCIIToUnicodeWithOffsets
,
554 _ASCIIToUnicodeWithOffsets
,
555 _Latin1FromUnicodeWithOffsets
,
556 _Latin1FromUnicodeWithOffsets
,
/*
 * Static description of the US-ASCII converter: structure size, the values
 * 367 / UCNV_IBM / UCNV_US_ASCII, two 1s, a four-byte array starting with
 * 0x1a followed by 1, two FALSE flags and a zeroed reserved area.
 * NOTE(review): 367 is presumably the IBM code page number, the two 1s the
 * minimum and maximum bytes per character, and 0x1a the substitution byte
 * with its length - member names are not visible here, confirm against
 * UConverterStaticData. Some initializer lines and the closing brace are
 * missing from this listing.
 */
566 static const UConverterStaticData _ASCIIStaticData
={
567 sizeof(UConverterStaticData
),
569 367, UCNV_IBM
, UCNV_US_ASCII
, 1, 1,
570 { 0x1a, 0, 0, 0 }, 1, FALSE
, FALSE
,
573 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data object for the US-ASCII converter; it ties together
 * _ASCIIStaticData and _ASCIIImpl.
 * NOTE(review): the meaning of the ~((uint32_t) 0), NULL and FALSE
 * initializers depends on the member order of UConverterSharedData, which is
 * not visible here; the rest of the initializer is missing from this listing.
 */
576 const UConverterSharedData _ASCIIData
={
577 sizeof(UConverterSharedData
), ~((uint32_t) 0),
578 NULL
, NULL
, &_ASCIIStaticData
, FALSE
, &_ASCIIImpl
,