]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnvlat1.c
ICU-8.11.2.tar.gz
[apple/icu.git] / icuSources / common / ucnvlat1.c
CommitLineData
/*
**********************************************************************
*   Copyright (C) 2000-2004, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnvlat1.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2000feb07
*   created by: Markus W. Scherer
*/
14
15#include "unicode/utypes.h"
374ca955
A
16
17#if !UCONFIG_NO_CONVERSION
18
b75a7d8f 19#include "unicode/ucnv.h"
b75a7d8f
A
20#include "unicode/uset.h"
21#include "ucnv_bld.h"
22#include "ucnv_cnv.h"
23
/*
 * Control optimizations according to the platform.
 * Each switch guards one 16-way unrolled fast path further down in this file
 * (via #if); set a switch to 0 to compile only the simple per-unit loop.
 */
#define LATIN1_UNROLL_TO_UNICODE 1
#define LATIN1_UNROLL_FROM_UNICODE 1
#define ASCII_UNROLL_TO_UNICODE 1
28
/* ISO 8859-1 --------------------------------------------------------------- */
30
374ca955 31/* This is a table-less and callback-less version of ucnv_MBCSSingleToBMPWithOffsets(). */
b75a7d8f
A
32static void
33_Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
34 UErrorCode *pErrorCode) {
35 const uint8_t *source;
36 UChar *target;
37 int32_t targetCapacity, length;
38 int32_t *offsets;
39
40 int32_t sourceIndex;
41
42 /* set up the local pointers */
43 source=(const uint8_t *)pArgs->source;
44 target=pArgs->target;
73c04bcf 45 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
b75a7d8f
A
46 offsets=pArgs->offsets;
47
48 sourceIndex=0;
49
50 /*
51 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
52 * for the minimum of the sourceLength and targetCapacity
53 */
73c04bcf 54 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
b75a7d8f
A
55 if(length<=targetCapacity) {
56 targetCapacity=length;
57 } else {
58 /* target will be full */
59 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
60 length=targetCapacity;
61 }
62
63#if LATIN1_UNROLL_TO_UNICODE
64 if(targetCapacity>=16) {
65 int32_t count, loops;
66
67 loops=count=targetCapacity>>4;
68 length=targetCapacity&=0xf;
69 do {
70 *target++=*source++;
71 *target++=*source++;
72 *target++=*source++;
73 *target++=*source++;
74 *target++=*source++;
75 *target++=*source++;
76 *target++=*source++;
77 *target++=*source++;
78 *target++=*source++;
79 *target++=*source++;
80 *target++=*source++;
81 *target++=*source++;
82 *target++=*source++;
83 *target++=*source++;
84 *target++=*source++;
85 *target++=*source++;
86 } while(--count>0);
87
88 if(offsets!=NULL) {
89 do {
90 *offsets++=sourceIndex++;
91 *offsets++=sourceIndex++;
92 *offsets++=sourceIndex++;
93 *offsets++=sourceIndex++;
94 *offsets++=sourceIndex++;
95 *offsets++=sourceIndex++;
96 *offsets++=sourceIndex++;
97 *offsets++=sourceIndex++;
98 *offsets++=sourceIndex++;
99 *offsets++=sourceIndex++;
100 *offsets++=sourceIndex++;
101 *offsets++=sourceIndex++;
102 *offsets++=sourceIndex++;
103 *offsets++=sourceIndex++;
104 *offsets++=sourceIndex++;
105 *offsets++=sourceIndex++;
106 } while(--loops>0);
107 }
108 }
109#endif
110
111 /* conversion loop */
112 while(targetCapacity>0) {
113 *target++=*source++;
114 --targetCapacity;
115 }
116
117 /* write back the updated pointers */
118 pArgs->source=(const char *)source;
119 pArgs->target=target;
120
121 /* set offsets */
122 if(offsets!=NULL) {
123 while(length>0) {
124 *offsets++=sourceIndex++;
125 --length;
126 }
127 pArgs->offsets=offsets;
128 }
129}
130
374ca955 131/* This is a table-less and callback-less version of ucnv_MBCSSingleGetNextUChar(). */
b75a7d8f
A
132static UChar32
133_Latin1GetNextUChar(UConverterToUnicodeArgs *pArgs,
134 UErrorCode *pErrorCode) {
135 const uint8_t *source=(const uint8_t *)pArgs->source;
136 if(source<(const uint8_t *)pArgs->sourceLimit) {
137 pArgs->source=(const char *)(source+1);
138 return *source;
139 }
140
141 /* no output because of empty input */
142 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
143 return 0xffff;
144}
145
374ca955 146/* This is a table-less version of ucnv_MBCSSingleFromBMPWithOffsets(). */
b75a7d8f
A
147static void
148_Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
149 UErrorCode *pErrorCode) {
150 UConverter *cnv;
374ca955
A
151 const UChar *source, *sourceLimit;
152 uint8_t *target, *oldTarget;
b75a7d8f
A
153 int32_t targetCapacity, length;
154 int32_t *offsets;
155
374ca955
A
156 UChar32 cp;
157 UChar c, max;
b75a7d8f
A
158
159 int32_t sourceIndex;
160
b75a7d8f
A
161 /* set up the local pointers */
162 cnv=pArgs->converter;
163 source=pArgs->source;
164 sourceLimit=pArgs->sourceLimit;
374ca955 165 target=oldTarget=(uint8_t *)pArgs->target;
73c04bcf 166 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
b75a7d8f
A
167 offsets=pArgs->offsets;
168
169 if(cnv->sharedData==&_Latin1Data) {
170 max=0xff; /* Latin-1 */
171 } else {
172 max=0x7f; /* US-ASCII */
173 }
174
175 /* get the converter state from UConverter */
374ca955 176 cp=cnv->fromUChar32;
b75a7d8f
A
177
178 /* sourceIndex=-1 if the current character began in the previous buffer */
374ca955 179 sourceIndex= cp==0 ? 0 : -1;
b75a7d8f
A
180
181 /*
182 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
183 * for the minimum of the sourceLength and targetCapacity
184 */
73c04bcf 185 length=(int32_t)(sourceLimit-source);
b75a7d8f
A
186 if(length<targetCapacity) {
187 targetCapacity=length;
188 }
189
190 /* conversion loop */
374ca955 191 if(cp!=0 && targetCapacity>0) {
b75a7d8f
A
192 goto getTrail;
193 }
194
195#if LATIN1_UNROLL_FROM_UNICODE
196 /* unroll the loop with the most common case */
b75a7d8f
A
197 if(targetCapacity>=16) {
198 int32_t count, loops;
199 UChar u, oredChars;
200
201 loops=count=targetCapacity>>4;
202 do {
203 oredChars=u=*source++;
204 *target++=(uint8_t)u;
205 oredChars|=u=*source++;
206 *target++=(uint8_t)u;
207 oredChars|=u=*source++;
208 *target++=(uint8_t)u;
209 oredChars|=u=*source++;
210 *target++=(uint8_t)u;
211 oredChars|=u=*source++;
212 *target++=(uint8_t)u;
213 oredChars|=u=*source++;
214 *target++=(uint8_t)u;
215 oredChars|=u=*source++;
216 *target++=(uint8_t)u;
217 oredChars|=u=*source++;
218 *target++=(uint8_t)u;
219 oredChars|=u=*source++;
220 *target++=(uint8_t)u;
221 oredChars|=u=*source++;
222 *target++=(uint8_t)u;
223 oredChars|=u=*source++;
224 *target++=(uint8_t)u;
225 oredChars|=u=*source++;
226 *target++=(uint8_t)u;
227 oredChars|=u=*source++;
228 *target++=(uint8_t)u;
229 oredChars|=u=*source++;
230 *target++=(uint8_t)u;
231 oredChars|=u=*source++;
232 *target++=(uint8_t)u;
233 oredChars|=u=*source++;
234 *target++=(uint8_t)u;
235
236 /* were all 16 entries really valid? */
237 if(oredChars>max) {
238 /* no, return to the first of these 16 */
239 source-=16;
240 target-=16;
241 break;
242 }
243 } while(--count>0);
244 count=loops-count;
245 targetCapacity-=16*count;
246
247 if(offsets!=NULL) {
374ca955 248 oldTarget+=16*count;
b75a7d8f
A
249 while(count>0) {
250 *offsets++=sourceIndex++;
251 *offsets++=sourceIndex++;
252 *offsets++=sourceIndex++;
253 *offsets++=sourceIndex++;
254 *offsets++=sourceIndex++;
255 *offsets++=sourceIndex++;
256 *offsets++=sourceIndex++;
257 *offsets++=sourceIndex++;
258 *offsets++=sourceIndex++;
259 *offsets++=sourceIndex++;
260 *offsets++=sourceIndex++;
261 *offsets++=sourceIndex++;
262 *offsets++=sourceIndex++;
263 *offsets++=sourceIndex++;
264 *offsets++=sourceIndex++;
265 *offsets++=sourceIndex++;
266 --count;
267 }
268 }
b75a7d8f
A
269 }
270#endif
271
374ca955
A
272 /* conversion loop */
273 c=0;
274 while(targetCapacity>0 && (c=*source++)<=max) {
275 /* convert the Unicode code point */
276 *target++=(uint8_t)c;
277 --targetCapacity;
278 }
279
280 if(c>max) {
281 cp=c;
282 if(!U_IS_SURROGATE(cp)) {
283 /* callback(unassigned) */
284 } else if(U_IS_SURROGATE_LEAD(cp)) {
b75a7d8f 285getTrail:
374ca955
A
286 if(source<sourceLimit) {
287 /* test the following code unit */
288 UChar trail=*source;
289 if(U16_IS_TRAIL(trail)) {
290 ++source;
291 cp=U16_GET_SUPPLEMENTARY(cp, trail);
292 /* this codepage does not map supplementary code points */
293 /* callback(unassigned) */
b75a7d8f 294 } else {
374ca955
A
295 /* this is an unmatched lead code unit (1st surrogate) */
296 /* callback(illegal) */
b75a7d8f
A
297 }
298 } else {
374ca955
A
299 /* no more input */
300 cnv->fromUChar32=cp;
301 goto noMoreInput;
b75a7d8f 302 }
374ca955
A
303 } else {
304 /* this is an unmatched trail code unit (2nd surrogate) */
305 /* callback(illegal) */
b75a7d8f 306 }
b75a7d8f 307
374ca955
A
308 *pErrorCode= U_IS_SURROGATE(cp) ? U_ILLEGAL_CHAR_FOUND : U_INVALID_CHAR_FOUND;
309 cnv->fromUChar32=cp;
b75a7d8f 310 }
374ca955 311noMoreInput:
b75a7d8f 312
374ca955 313 /* set offsets since the start */
b75a7d8f 314 if(offsets!=NULL) {
374ca955 315 size_t count=target-oldTarget;
b75a7d8f
A
316 while(count>0) {
317 *offsets++=sourceIndex++;
318 --count;
319 }
320 }
321
374ca955
A
322 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
323 /* target is full */
324 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
b75a7d8f
A
325 }
326
327 /* write back the updated pointers */
328 pArgs->source=source;
329 pArgs->target=(char *)target;
330 pArgs->offsets=offsets;
331}
332
333static void
334_Latin1GetUnicodeSet(const UConverter *cnv,
73c04bcf 335 const USetAdder *sa,
b75a7d8f
A
336 UConverterUnicodeSet which,
337 UErrorCode *pErrorCode) {
374ca955 338 sa->addRange(sa->set, 0, 0xff);
b75a7d8f
A
339}
340
341static const UConverterImpl _Latin1Impl={
342 UCNV_LATIN_1,
343
344 NULL,
345 NULL,
346
347 NULL,
348 NULL,
349 NULL,
350
351 _Latin1ToUnicodeWithOffsets,
352 _Latin1ToUnicodeWithOffsets,
353 _Latin1FromUnicodeWithOffsets,
354 _Latin1FromUnicodeWithOffsets,
355 _Latin1GetNextUChar,
356
357 NULL,
358 NULL,
359 NULL,
360 NULL,
361 _Latin1GetUnicodeSet
362};
363
364static const UConverterStaticData _Latin1StaticData={
365 sizeof(UConverterStaticData),
366 "ISO-8859-1",
367 819, UCNV_IBM, UCNV_LATIN_1, 1, 1,
368 { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE,
369 0,
370 0,
371 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
372};
373
374const UConverterSharedData _Latin1Data={
375 sizeof(UConverterSharedData), ~((uint32_t) 0),
376 NULL, NULL, &_Latin1StaticData, FALSE, &_Latin1Impl,
377 0
378};
379
/* US-ASCII ----------------------------------------------------------------- */
381
374ca955 382/* This is a table-less version of ucnv_MBCSSingleToBMPWithOffsets(). */
b75a7d8f
A
383static void
384_ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
385 UErrorCode *pErrorCode) {
374ca955
A
386 const uint8_t *source, *sourceLimit;
387 UChar *target, *oldTarget;
b75a7d8f
A
388 int32_t targetCapacity, length;
389 int32_t *offsets;
390
391 int32_t sourceIndex;
392
374ca955
A
393 uint8_t c;
394
b75a7d8f
A
395 /* set up the local pointers */
396 source=(const uint8_t *)pArgs->source;
397 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
374ca955 398 target=oldTarget=pArgs->target;
73c04bcf 399 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
b75a7d8f
A
400 offsets=pArgs->offsets;
401
402 /* sourceIndex=-1 if the current character began in the previous buffer */
403 sourceIndex=0;
b75a7d8f
A
404
405 /*
406 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
407 * for the minimum of the sourceLength and targetCapacity
408 */
73c04bcf 409 length=(int32_t)(sourceLimit-source);
b75a7d8f
A
410 if(length<targetCapacity) {
411 targetCapacity=length;
412 }
413
414#if ASCII_UNROLL_TO_UNICODE
415 /* unroll the loop with the most common case */
b75a7d8f
A
416 if(targetCapacity>=16) {
417 int32_t count, loops;
418 UChar oredChars;
419
420 loops=count=targetCapacity>>4;
421 do {
422 oredChars=*target++=*source++;
423 oredChars|=*target++=*source++;
424 oredChars|=*target++=*source++;
425 oredChars|=*target++=*source++;
426 oredChars|=*target++=*source++;
427 oredChars|=*target++=*source++;
428 oredChars|=*target++=*source++;
429 oredChars|=*target++=*source++;
430 oredChars|=*target++=*source++;
431 oredChars|=*target++=*source++;
432 oredChars|=*target++=*source++;
433 oredChars|=*target++=*source++;
434 oredChars|=*target++=*source++;
435 oredChars|=*target++=*source++;
436 oredChars|=*target++=*source++;
437 oredChars|=*target++=*source++;
438
439 /* were all 16 entries really valid? */
440 if(oredChars>0x7f) {
441 /* no, return to the first of these 16 */
442 source-=16;
443 target-=16;
444 break;
445 }
446 } while(--count>0);
447 count=loops-count;
448 targetCapacity-=16*count;
449
450 if(offsets!=NULL) {
374ca955 451 oldTarget+=16*count;
b75a7d8f
A
452 while(count>0) {
453 *offsets++=sourceIndex++;
454 *offsets++=sourceIndex++;
455 *offsets++=sourceIndex++;
456 *offsets++=sourceIndex++;
457 *offsets++=sourceIndex++;
458 *offsets++=sourceIndex++;
459 *offsets++=sourceIndex++;
460 *offsets++=sourceIndex++;
461 *offsets++=sourceIndex++;
462 *offsets++=sourceIndex++;
463 *offsets++=sourceIndex++;
464 *offsets++=sourceIndex++;
465 *offsets++=sourceIndex++;
466 *offsets++=sourceIndex++;
467 *offsets++=sourceIndex++;
468 *offsets++=sourceIndex++;
469 --count;
470 }
471 }
472 }
473#endif
474
475 /* conversion loop */
374ca955
A
476 c=0;
477 while(targetCapacity>0 && (c=*source++)<=0x7f) {
478 *target++=c;
479 --targetCapacity;
b75a7d8f
A
480 }
481
374ca955
A
482 if(c>0x7f) {
483 /* callback(illegal); copy the current bytes to toUBytes[] */
484 UConverter *cnv=pArgs->converter;
485 cnv->toUBytes[0]=c;
486 cnv->toULength=1;
487 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
488 } else if(source<sourceLimit && target>=pArgs->targetLimit) {
b75a7d8f
A
489 /* target is full */
490 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
491 }
492
374ca955 493 /* set offsets since the start */
b75a7d8f 494 if(offsets!=NULL) {
374ca955 495 size_t count=target-oldTarget;
b75a7d8f
A
496 while(count>0) {
497 *offsets++=sourceIndex++;
498 --count;
499 }
500 }
501
502 /* write back the updated pointers */
503 pArgs->source=(const char *)source;
504 pArgs->target=target;
505 pArgs->offsets=offsets;
506}
507
374ca955 508/* This is a table-less version of ucnv_MBCSSingleGetNextUChar(). */
b75a7d8f
A
509static UChar32
510_ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs,
511 UErrorCode *pErrorCode) {
b75a7d8f
A
512 const uint8_t *source;
513 uint8_t b;
514
b75a7d8f 515 source=(const uint8_t *)pArgs->source;
374ca955 516 if(source<(const uint8_t *)pArgs->sourceLimit) {
b75a7d8f
A
517 b=*source++;
518 pArgs->source=(const char *)source;
519 if(b<=0x7f) {
520 return b;
521 } else {
b75a7d8f 522 UConverter *cnv=pArgs->converter;
374ca955
A
523 cnv->toUBytes[0]=b;
524 cnv->toULength=1;
b75a7d8f 525 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
374ca955 526 return 0xffff;
b75a7d8f
A
527 }
528 }
529
374ca955 530 /* no output because of empty input */
b75a7d8f
A
531 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
532 return 0xffff;
533}
534
535static void
536_ASCIIGetUnicodeSet(const UConverter *cnv,
73c04bcf 537 const USetAdder *sa,
b75a7d8f
A
538 UConverterUnicodeSet which,
539 UErrorCode *pErrorCode) {
374ca955 540 sa->addRange(sa->set, 0, 0x7f);
b75a7d8f
A
541}
542
543static const UConverterImpl _ASCIIImpl={
544 UCNV_US_ASCII,
545
546 NULL,
547 NULL,
548
549 NULL,
550 NULL,
551 NULL,
552
553 _ASCIIToUnicodeWithOffsets,
554 _ASCIIToUnicodeWithOffsets,
555 _Latin1FromUnicodeWithOffsets,
556 _Latin1FromUnicodeWithOffsets,
557 _ASCIIGetNextUChar,
558
559 NULL,
560 NULL,
561 NULL,
562 NULL,
563 _ASCIIGetUnicodeSet
564};
565
566static const UConverterStaticData _ASCIIStaticData={
567 sizeof(UConverterStaticData),
568 "US-ASCII",
569 367, UCNV_IBM, UCNV_US_ASCII, 1, 1,
570 { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE,
571 0,
572 0,
573 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
574};
575
576const UConverterSharedData _ASCIIData={
577 sizeof(UConverterSharedData), ~((uint32_t) 0),
578 NULL, NULL, &_ASCIIStaticData, FALSE, &_ASCIIImpl,
579 0
580};
374ca955
A
581
582#endif