]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f A |
3 | /* |
4 | ********************************************************************** | |
2ca993e8 | 5 | * Copyright (C) 2000-2015, International Business Machines |
b75a7d8f A |
6 | * Corporation and others. All Rights Reserved. |
7 | ********************************************************************** | |
8 | * file name: ucnvlat1.cpp | |
f3c0d7a5 | 9 | * encoding: UTF-8 |
b75a7d8f A |
10 | * tab size: 8 (not used) |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2000feb07 | |
14 | * created by: Markus W. Scherer | |
15 | */ | |
16 | ||
17 | #include "unicode/utypes.h" | |
374ca955 A |
18 | |
19 | #if !UCONFIG_NO_CONVERSION | |
20 | ||
b75a7d8f | 21 | #include "unicode/ucnv.h" |
b75a7d8f | 22 | #include "unicode/uset.h" |
4388f060 | 23 | #include "unicode/utf8.h" |
b75a7d8f A |
24 | #include "ucnv_bld.h" |
25 | #include "ucnv_cnv.h" | |
26 | ||
27 | /* control optimizations according to the platform */ | |
b75a7d8f | 28 | #define LATIN1_UNROLL_FROM_UNICODE 1 |
b75a7d8f A |
29 | |
30 | /* ISO 8859-1 --------------------------------------------------------------- */ | |
31 | ||
374ca955 | 32 | /* This is a table-less and callback-less version of ucnv_MBCSSingleToBMPWithOffsets(). */ |
f3c0d7a5 A |
33 | U_CDECL_BEGIN |
34 | static void U_CALLCONV | |
b75a7d8f A |
35 | _Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
36 | UErrorCode *pErrorCode) { | |
37 | const uint8_t *source; | |
38 | UChar *target; | |
39 | int32_t targetCapacity, length; | |
40 | int32_t *offsets; | |
41 | ||
42 | int32_t sourceIndex; | |
43 | ||
44 | /* set up the local pointers */ | |
45 | source=(const uint8_t *)pArgs->source; | |
46 | target=pArgs->target; | |
73c04bcf | 47 | targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
b75a7d8f A |
48 | offsets=pArgs->offsets; |
49 | ||
50 | sourceIndex=0; | |
51 | ||
52 | /* | |
53 | * since the conversion here is 1:1 UChar:uint8_t, we need only one counter | |
54 | * for the minimum of the sourceLength and targetCapacity | |
55 | */ | |
73c04bcf | 56 | length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); |
b75a7d8f A |
57 | if(length<=targetCapacity) { |
58 | targetCapacity=length; | |
59 | } else { | |
60 | /* target will be full */ | |
61 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
62 | length=targetCapacity; | |
63 | } | |
64 | ||
46f4442e A |
65 | if(targetCapacity>=8) { |
66 | /* This loop is unrolled for speed and improved pipelining. */ | |
b75a7d8f A |
67 | int32_t count, loops; |
68 | ||
46f4442e A |
69 | loops=count=targetCapacity>>3; |
70 | length=targetCapacity&=0x7; | |
b75a7d8f | 71 | do { |
46f4442e A |
72 | target[0]=source[0]; |
73 | target[1]=source[1]; | |
74 | target[2]=source[2]; | |
75 | target[3]=source[3]; | |
76 | target[4]=source[4]; | |
77 | target[5]=source[5]; | |
78 | target[6]=source[6]; | |
79 | target[7]=source[7]; | |
80 | target+=8; | |
81 | source+=8; | |
b75a7d8f A |
82 | } while(--count>0); |
83 | ||
84 | if(offsets!=NULL) { | |
85 | do { | |
46f4442e A |
86 | offsets[0]=sourceIndex++; |
87 | offsets[1]=sourceIndex++; | |
88 | offsets[2]=sourceIndex++; | |
89 | offsets[3]=sourceIndex++; | |
90 | offsets[4]=sourceIndex++; | |
91 | offsets[5]=sourceIndex++; | |
92 | offsets[6]=sourceIndex++; | |
93 | offsets[7]=sourceIndex++; | |
94 | offsets+=8; | |
b75a7d8f A |
95 | } while(--loops>0); |
96 | } | |
97 | } | |
b75a7d8f A |
98 | |
99 | /* conversion loop */ | |
100 | while(targetCapacity>0) { | |
101 | *target++=*source++; | |
102 | --targetCapacity; | |
103 | } | |
104 | ||
105 | /* write back the updated pointers */ | |
106 | pArgs->source=(const char *)source; | |
107 | pArgs->target=target; | |
108 | ||
109 | /* set offsets */ | |
110 | if(offsets!=NULL) { | |
111 | while(length>0) { | |
112 | *offsets++=sourceIndex++; | |
113 | --length; | |
114 | } | |
115 | pArgs->offsets=offsets; | |
116 | } | |
117 | } | |
118 | ||
374ca955 | 119 | /* This is a table-less and callback-less version of ucnv_MBCSSingleGetNextUChar(). */ |
f3c0d7a5 | 120 | static UChar32 U_CALLCONV |
b75a7d8f A |
121 | _Latin1GetNextUChar(UConverterToUnicodeArgs *pArgs, |
122 | UErrorCode *pErrorCode) { | |
123 | const uint8_t *source=(const uint8_t *)pArgs->source; | |
124 | if(source<(const uint8_t *)pArgs->sourceLimit) { | |
125 | pArgs->source=(const char *)(source+1); | |
126 | return *source; | |
127 | } | |
128 | ||
129 | /* no output because of empty input */ | |
130 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
131 | return 0xffff; | |
132 | } | |
133 | ||
374ca955 | 134 | /* This is a table-less version of ucnv_MBCSSingleFromBMPWithOffsets(). */ |
f3c0d7a5 | 135 | static void U_CALLCONV |
b75a7d8f A |
136 | _Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, |
137 | UErrorCode *pErrorCode) { | |
138 | UConverter *cnv; | |
374ca955 A |
139 | const UChar *source, *sourceLimit; |
140 | uint8_t *target, *oldTarget; | |
b75a7d8f A |
141 | int32_t targetCapacity, length; |
142 | int32_t *offsets; | |
143 | ||
374ca955 A |
144 | UChar32 cp; |
145 | UChar c, max; | |
b75a7d8f A |
146 | |
147 | int32_t sourceIndex; | |
148 | ||
b75a7d8f A |
149 | /* set up the local pointers */ |
150 | cnv=pArgs->converter; | |
151 | source=pArgs->source; | |
152 | sourceLimit=pArgs->sourceLimit; | |
374ca955 | 153 | target=oldTarget=(uint8_t *)pArgs->target; |
73c04bcf | 154 | targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
b75a7d8f A |
155 | offsets=pArgs->offsets; |
156 | ||
157 | if(cnv->sharedData==&_Latin1Data) { | |
158 | max=0xff; /* Latin-1 */ | |
159 | } else { | |
160 | max=0x7f; /* US-ASCII */ | |
161 | } | |
162 | ||
163 | /* get the converter state from UConverter */ | |
374ca955 | 164 | cp=cnv->fromUChar32; |
b75a7d8f A |
165 | |
166 | /* sourceIndex=-1 if the current character began in the previous buffer */ | |
374ca955 | 167 | sourceIndex= cp==0 ? 0 : -1; |
b75a7d8f A |
168 | |
169 | /* | |
170 | * since the conversion here is 1:1 UChar:uint8_t, we need only one counter | |
171 | * for the minimum of the sourceLength and targetCapacity | |
172 | */ | |
73c04bcf | 173 | length=(int32_t)(sourceLimit-source); |
b75a7d8f A |
174 | if(length<targetCapacity) { |
175 | targetCapacity=length; | |
176 | } | |
177 | ||
178 | /* conversion loop */ | |
374ca955 | 179 | if(cp!=0 && targetCapacity>0) { |
b75a7d8f A |
180 | goto getTrail; |
181 | } | |
182 | ||
183 | #if LATIN1_UNROLL_FROM_UNICODE | |
184 | /* unroll the loop with the most common case */ | |
b75a7d8f A |
185 | if(targetCapacity>=16) { |
186 | int32_t count, loops; | |
187 | UChar u, oredChars; | |
188 | ||
189 | loops=count=targetCapacity>>4; | |
190 | do { | |
191 | oredChars=u=*source++; | |
192 | *target++=(uint8_t)u; | |
193 | oredChars|=u=*source++; | |
194 | *target++=(uint8_t)u; | |
195 | oredChars|=u=*source++; | |
196 | *target++=(uint8_t)u; | |
197 | oredChars|=u=*source++; | |
198 | *target++=(uint8_t)u; | |
199 | oredChars|=u=*source++; | |
200 | *target++=(uint8_t)u; | |
201 | oredChars|=u=*source++; | |
202 | *target++=(uint8_t)u; | |
203 | oredChars|=u=*source++; | |
204 | *target++=(uint8_t)u; | |
205 | oredChars|=u=*source++; | |
206 | *target++=(uint8_t)u; | |
207 | oredChars|=u=*source++; | |
208 | *target++=(uint8_t)u; | |
209 | oredChars|=u=*source++; | |
210 | *target++=(uint8_t)u; | |
211 | oredChars|=u=*source++; | |
212 | *target++=(uint8_t)u; | |
213 | oredChars|=u=*source++; | |
214 | *target++=(uint8_t)u; | |
215 | oredChars|=u=*source++; | |
216 | *target++=(uint8_t)u; | |
217 | oredChars|=u=*source++; | |
218 | *target++=(uint8_t)u; | |
219 | oredChars|=u=*source++; | |
220 | *target++=(uint8_t)u; | |
221 | oredChars|=u=*source++; | |
222 | *target++=(uint8_t)u; | |
223 | ||
224 | /* were all 16 entries really valid? */ | |
225 | if(oredChars>max) { | |
226 | /* no, return to the first of these 16 */ | |
227 | source-=16; | |
228 | target-=16; | |
229 | break; | |
230 | } | |
231 | } while(--count>0); | |
232 | count=loops-count; | |
233 | targetCapacity-=16*count; | |
234 | ||
235 | if(offsets!=NULL) { | |
374ca955 | 236 | oldTarget+=16*count; |
b75a7d8f A |
237 | while(count>0) { |
238 | *offsets++=sourceIndex++; | |
239 | *offsets++=sourceIndex++; | |
240 | *offsets++=sourceIndex++; | |
241 | *offsets++=sourceIndex++; | |
242 | *offsets++=sourceIndex++; | |
243 | *offsets++=sourceIndex++; | |
244 | *offsets++=sourceIndex++; | |
245 | *offsets++=sourceIndex++; | |
246 | *offsets++=sourceIndex++; | |
247 | *offsets++=sourceIndex++; | |
248 | *offsets++=sourceIndex++; | |
249 | *offsets++=sourceIndex++; | |
250 | *offsets++=sourceIndex++; | |
251 | *offsets++=sourceIndex++; | |
252 | *offsets++=sourceIndex++; | |
253 | *offsets++=sourceIndex++; | |
254 | --count; | |
255 | } | |
256 | } | |
b75a7d8f A |
257 | } |
258 | #endif | |
259 | ||
374ca955 A |
260 | /* conversion loop */ |
261 | c=0; | |
262 | while(targetCapacity>0 && (c=*source++)<=max) { | |
263 | /* convert the Unicode code point */ | |
264 | *target++=(uint8_t)c; | |
265 | --targetCapacity; | |
266 | } | |
267 | ||
268 | if(c>max) { | |
269 | cp=c; | |
270 | if(!U_IS_SURROGATE(cp)) { | |
271 | /* callback(unassigned) */ | |
272 | } else if(U_IS_SURROGATE_LEAD(cp)) { | |
b75a7d8f | 273 | getTrail: |
374ca955 A |
274 | if(source<sourceLimit) { |
275 | /* test the following code unit */ | |
276 | UChar trail=*source; | |
277 | if(U16_IS_TRAIL(trail)) { | |
278 | ++source; | |
279 | cp=U16_GET_SUPPLEMENTARY(cp, trail); | |
280 | /* this codepage does not map supplementary code points */ | |
281 | /* callback(unassigned) */ | |
b75a7d8f | 282 | } else { |
374ca955 A |
283 | /* this is an unmatched lead code unit (1st surrogate) */ |
284 | /* callback(illegal) */ | |
b75a7d8f A |
285 | } |
286 | } else { | |
374ca955 A |
287 | /* no more input */ |
288 | cnv->fromUChar32=cp; | |
289 | goto noMoreInput; | |
b75a7d8f | 290 | } |
374ca955 A |
291 | } else { |
292 | /* this is an unmatched trail code unit (2nd surrogate) */ | |
293 | /* callback(illegal) */ | |
b75a7d8f | 294 | } |
b75a7d8f | 295 | |
374ca955 A |
296 | *pErrorCode= U_IS_SURROGATE(cp) ? U_ILLEGAL_CHAR_FOUND : U_INVALID_CHAR_FOUND; |
297 | cnv->fromUChar32=cp; | |
b75a7d8f | 298 | } |
374ca955 | 299 | noMoreInput: |
b75a7d8f | 300 | |
374ca955 | 301 | /* set offsets since the start */ |
b75a7d8f | 302 | if(offsets!=NULL) { |
374ca955 | 303 | size_t count=target-oldTarget; |
b75a7d8f A |
304 | while(count>0) { |
305 | *offsets++=sourceIndex++; | |
306 | --count; | |
307 | } | |
308 | } | |
309 | ||
374ca955 A |
310 | if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) { |
311 | /* target is full */ | |
312 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
b75a7d8f A |
313 | } |
314 | ||
315 | /* write back the updated pointers */ | |
316 | pArgs->source=source; | |
317 | pArgs->target=(char *)target; | |
318 | pArgs->offsets=offsets; | |
319 | } | |
320 | ||
46f4442e | 321 | /* Convert UTF-8 to Latin-1. Adapted from ucnv_SBCSFromUTF8(). */ |
f3c0d7a5 | 322 | static void U_CALLCONV |
46f4442e A |
323 | ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, |
324 | UConverterToUnicodeArgs *pToUArgs, | |
325 | UErrorCode *pErrorCode) { | |
326 | UConverter *utf8; | |
327 | const uint8_t *source, *sourceLimit; | |
328 | uint8_t *target; | |
329 | int32_t targetCapacity; | |
330 | ||
331 | UChar32 c; | |
332 | uint8_t b, t1; | |
333 | ||
334 | /* set up the local pointers */ | |
335 | utf8=pToUArgs->converter; | |
336 | source=(uint8_t *)pToUArgs->source; | |
337 | sourceLimit=(uint8_t *)pToUArgs->sourceLimit; | |
338 | target=(uint8_t *)pFromUArgs->target; | |
339 | targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); | |
340 | ||
341 | /* get the converter state from the UTF-8 UConverter */ | |
342 | c=(UChar32)utf8->toUnicodeStatus; | |
343 | if(c!=0 && source<sourceLimit) { | |
344 | if(targetCapacity==0) { | |
345 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
346 | return; | |
347 | } else if(c>=0xc2 && c<=0xc3 && (t1=(uint8_t)(*source-0x80)) <= 0x3f) { | |
348 | ++source; | |
349 | *target++=(uint8_t)(((c&3)<<6)|t1); | |
350 | --targetCapacity; | |
351 | ||
352 | utf8->toUnicodeStatus=0; | |
353 | utf8->toULength=0; | |
354 | } else { | |
355 | /* complicated, illegal or unmappable input: fall back to the pivoting implementation */ | |
356 | *pErrorCode=U_USING_DEFAULT_WARNING; | |
357 | return; | |
358 | } | |
359 | } | |
360 | ||
361 | /* | |
362 | * Make sure that the last byte sequence before sourceLimit is complete | |
363 | * or runs into a lead byte. | |
364 | * In the conversion loop compare source with sourceLimit only once | |
365 | * per multi-byte character. | |
366 | * For Latin-1, adjust sourceLimit only for 1 trail byte because | |
367 | * the conversion loop handles at most 2-byte sequences. | |
368 | */ | |
369 | if(source<sourceLimit && U8_IS_LEAD(*(sourceLimit-1))) { | |
370 | --sourceLimit; | |
371 | } | |
372 | ||
373 | /* conversion loop */ | |
374 | while(source<sourceLimit) { | |
375 | if(targetCapacity>0) { | |
376 | b=*source++; | |
377 | if((int8_t)b>=0) { | |
378 | /* convert ASCII */ | |
379 | *target++=(uint8_t)b; | |
380 | --targetCapacity; | |
381 | } else if( /* handle U+0080..U+00FF inline */ | |
382 | b>=0xc2 && b<=0xc3 && | |
383 | (t1=(uint8_t)(*source-0x80)) <= 0x3f | |
384 | ) { | |
385 | ++source; | |
386 | *target++=(uint8_t)(((b&3)<<6)|t1); | |
387 | --targetCapacity; | |
388 | } else { | |
389 | /* complicated, illegal or unmappable input: fall back to the pivoting implementation */ | |
390 | pToUArgs->source=(char *)(source-1); | |
391 | pFromUArgs->target=(char *)target; | |
392 | *pErrorCode=U_USING_DEFAULT_WARNING; | |
393 | return; | |
394 | } | |
395 | } else { | |
396 | /* target is full */ | |
397 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
398 | break; | |
399 | } | |
400 | } | |
401 | ||
402 | /* | |
403 | * The sourceLimit may have been adjusted before the conversion loop | |
404 | * to stop before a truncated sequence. | |
405 | * If so, then collect the truncated sequence now. | |
406 | * For Latin-1, there is at most exactly one lead byte because of the | |
407 | * smaller sourceLimit adjustment logic. | |
408 | */ | |
409 | if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) { | |
410 | utf8->toUnicodeStatus=utf8->toUBytes[0]=b=*source++; | |
411 | utf8->toULength=1; | |
51004dcb | 412 | utf8->mode=U8_COUNT_TRAIL_BYTES(b)+1; |
46f4442e A |
413 | } |
414 | ||
415 | /* write back the updated pointers */ | |
416 | pToUArgs->source=(char *)source; | |
417 | pFromUArgs->target=(char *)target; | |
418 | } | |
419 | ||
f3c0d7a5 | 420 | static void U_CALLCONV |
b75a7d8f | 421 | _Latin1GetUnicodeSet(const UConverter *cnv, |
73c04bcf | 422 | const USetAdder *sa, |
b75a7d8f A |
423 | UConverterUnicodeSet which, |
424 | UErrorCode *pErrorCode) { | |
f3c0d7a5 A |
425 | (void)cnv; |
426 | (void)which; | |
427 | (void)pErrorCode; | |
374ca955 | 428 | sa->addRange(sa->set, 0, 0xff); |
b75a7d8f | 429 | } |
f3c0d7a5 A |
430 | U_CDECL_END |
431 | ||
b75a7d8f A |
432 | |
433 | static const UConverterImpl _Latin1Impl={ | |
434 | UCNV_LATIN_1, | |
435 | ||
436 | NULL, | |
437 | NULL, | |
438 | ||
439 | NULL, | |
440 | NULL, | |
441 | NULL, | |
442 | ||
443 | _Latin1ToUnicodeWithOffsets, | |
444 | _Latin1ToUnicodeWithOffsets, | |
445 | _Latin1FromUnicodeWithOffsets, | |
446 | _Latin1FromUnicodeWithOffsets, | |
447 | _Latin1GetNextUChar, | |
448 | ||
449 | NULL, | |
450 | NULL, | |
451 | NULL, | |
452 | NULL, | |
46f4442e A |
453 | _Latin1GetUnicodeSet, |
454 | ||
455 | NULL, | |
456 | ucnv_Latin1FromUTF8 | |
b75a7d8f A |
457 | }; |
458 | ||
459 | static const UConverterStaticData _Latin1StaticData={ | |
460 | sizeof(UConverterStaticData), | |
461 | "ISO-8859-1", | |
462 | 819, UCNV_IBM, UCNV_LATIN_1, 1, 1, | |
463 | { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE, | |
464 | 0, | |
465 | 0, | |
466 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ | |
467 | }; | |
468 | ||
2ca993e8 A |
469 | const UConverterSharedData _Latin1Data= |
470 | UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Latin1StaticData, &_Latin1Impl); | |
b75a7d8f A |
471 | |
472 | /* US-ASCII ----------------------------------------------------------------- */ | |
473 | ||
f3c0d7a5 | 474 | U_CDECL_BEGIN |
374ca955 | 475 | /* This is a table-less version of ucnv_MBCSSingleToBMPWithOffsets(). */ |
f3c0d7a5 | 476 | static void U_CALLCONV |
b75a7d8f A |
477 | _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
478 | UErrorCode *pErrorCode) { | |
374ca955 A |
479 | const uint8_t *source, *sourceLimit; |
480 | UChar *target, *oldTarget; | |
b75a7d8f A |
481 | int32_t targetCapacity, length; |
482 | int32_t *offsets; | |
483 | ||
484 | int32_t sourceIndex; | |
485 | ||
374ca955 A |
486 | uint8_t c; |
487 | ||
b75a7d8f A |
488 | /* set up the local pointers */ |
489 | source=(const uint8_t *)pArgs->source; | |
490 | sourceLimit=(const uint8_t *)pArgs->sourceLimit; | |
374ca955 | 491 | target=oldTarget=pArgs->target; |
73c04bcf | 492 | targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
b75a7d8f A |
493 | offsets=pArgs->offsets; |
494 | ||
495 | /* sourceIndex=-1 if the current character began in the previous buffer */ | |
496 | sourceIndex=0; | |
b75a7d8f A |
497 | |
498 | /* | |
499 | * since the conversion here is 1:1 UChar:uint8_t, we need only one counter | |
500 | * for the minimum of the sourceLength and targetCapacity | |
501 | */ | |
73c04bcf | 502 | length=(int32_t)(sourceLimit-source); |
b75a7d8f A |
503 | if(length<targetCapacity) { |
504 | targetCapacity=length; | |
505 | } | |
506 | ||
46f4442e A |
507 | if(targetCapacity>=8) { |
508 | /* This loop is unrolled for speed and improved pipelining. */ | |
b75a7d8f A |
509 | int32_t count, loops; |
510 | UChar oredChars; | |
511 | ||
46f4442e | 512 | loops=count=targetCapacity>>3; |
b75a7d8f | 513 | do { |
46f4442e A |
514 | oredChars=target[0]=source[0]; |
515 | oredChars|=target[1]=source[1]; | |
516 | oredChars|=target[2]=source[2]; | |
517 | oredChars|=target[3]=source[3]; | |
518 | oredChars|=target[4]=source[4]; | |
519 | oredChars|=target[5]=source[5]; | |
520 | oredChars|=target[6]=source[6]; | |
521 | oredChars|=target[7]=source[7]; | |
b75a7d8f A |
522 | |
523 | /* were all 16 entries really valid? */ | |
524 | if(oredChars>0x7f) { | |
525 | /* no, return to the first of these 16 */ | |
b75a7d8f A |
526 | break; |
527 | } | |
46f4442e A |
528 | source+=8; |
529 | target+=8; | |
b75a7d8f A |
530 | } while(--count>0); |
531 | count=loops-count; | |
46f4442e | 532 | targetCapacity-=count*8; |
b75a7d8f A |
533 | |
534 | if(offsets!=NULL) { | |
46f4442e | 535 | oldTarget+=count*8; |
b75a7d8f | 536 | while(count>0) { |
46f4442e A |
537 | offsets[0]=sourceIndex++; |
538 | offsets[1]=sourceIndex++; | |
539 | offsets[2]=sourceIndex++; | |
540 | offsets[3]=sourceIndex++; | |
541 | offsets[4]=sourceIndex++; | |
542 | offsets[5]=sourceIndex++; | |
543 | offsets[6]=sourceIndex++; | |
544 | offsets[7]=sourceIndex++; | |
545 | offsets+=8; | |
b75a7d8f A |
546 | --count; |
547 | } | |
548 | } | |
549 | } | |
b75a7d8f A |
550 | |
551 | /* conversion loop */ | |
374ca955 A |
552 | c=0; |
553 | while(targetCapacity>0 && (c=*source++)<=0x7f) { | |
554 | *target++=c; | |
555 | --targetCapacity; | |
b75a7d8f A |
556 | } |
557 | ||
374ca955 A |
558 | if(c>0x7f) { |
559 | /* callback(illegal); copy the current bytes to toUBytes[] */ | |
560 | UConverter *cnv=pArgs->converter; | |
561 | cnv->toUBytes[0]=c; | |
562 | cnv->toULength=1; | |
563 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; | |
564 | } else if(source<sourceLimit && target>=pArgs->targetLimit) { | |
b75a7d8f A |
565 | /* target is full */ |
566 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
567 | } | |
568 | ||
374ca955 | 569 | /* set offsets since the start */ |
b75a7d8f | 570 | if(offsets!=NULL) { |
374ca955 | 571 | size_t count=target-oldTarget; |
b75a7d8f A |
572 | while(count>0) { |
573 | *offsets++=sourceIndex++; | |
574 | --count; | |
575 | } | |
576 | } | |
577 | ||
578 | /* write back the updated pointers */ | |
579 | pArgs->source=(const char *)source; | |
580 | pArgs->target=target; | |
581 | pArgs->offsets=offsets; | |
582 | } | |
583 | ||
374ca955 | 584 | /* This is a table-less version of ucnv_MBCSSingleGetNextUChar(). */ |
f3c0d7a5 | 585 | static UChar32 U_CALLCONV |
b75a7d8f A |
586 | _ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs, |
587 | UErrorCode *pErrorCode) { | |
b75a7d8f A |
588 | const uint8_t *source; |
589 | uint8_t b; | |
590 | ||
b75a7d8f | 591 | source=(const uint8_t *)pArgs->source; |
374ca955 | 592 | if(source<(const uint8_t *)pArgs->sourceLimit) { |
b75a7d8f A |
593 | b=*source++; |
594 | pArgs->source=(const char *)source; | |
595 | if(b<=0x7f) { | |
596 | return b; | |
597 | } else { | |
b75a7d8f | 598 | UConverter *cnv=pArgs->converter; |
374ca955 A |
599 | cnv->toUBytes[0]=b; |
600 | cnv->toULength=1; | |
b75a7d8f | 601 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
374ca955 | 602 | return 0xffff; |
b75a7d8f A |
603 | } |
604 | } | |
605 | ||
374ca955 | 606 | /* no output because of empty input */ |
b75a7d8f A |
607 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
608 | return 0xffff; | |
609 | } | |
610 | ||
46f4442e | 611 | /* "Convert" UTF-8 to US-ASCII: Validate and copy. */ |
f3c0d7a5 | 612 | static void U_CALLCONV |
46f4442e A |
613 | ucnv_ASCIIFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, |
614 | UConverterToUnicodeArgs *pToUArgs, | |
615 | UErrorCode *pErrorCode) { | |
616 | const uint8_t *source, *sourceLimit; | |
617 | uint8_t *target; | |
618 | int32_t targetCapacity, length; | |
619 | ||
620 | uint8_t c; | |
621 | ||
622 | if(pToUArgs->converter->toUnicodeStatus!=0) { | |
623 | /* no handling of partial UTF-8 characters here, fall back to pivoting */ | |
624 | *pErrorCode=U_USING_DEFAULT_WARNING; | |
625 | return; | |
626 | } | |
627 | ||
628 | /* set up the local pointers */ | |
629 | source=(const uint8_t *)pToUArgs->source; | |
630 | sourceLimit=(const uint8_t *)pToUArgs->sourceLimit; | |
631 | target=(uint8_t *)pFromUArgs->target; | |
632 | targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); | |
633 | ||
634 | /* | |
635 | * since the conversion here is 1:1 uint8_t:uint8_t, we need only one counter | |
636 | * for the minimum of the sourceLength and targetCapacity | |
637 | */ | |
638 | length=(int32_t)(sourceLimit-source); | |
639 | if(length<targetCapacity) { | |
640 | targetCapacity=length; | |
641 | } | |
642 | ||
643 | /* unroll the loop with the most common case */ | |
644 | if(targetCapacity>=16) { | |
645 | int32_t count, loops; | |
646 | uint8_t oredChars; | |
647 | ||
648 | loops=count=targetCapacity>>4; | |
649 | do { | |
650 | oredChars=*target++=*source++; | |
651 | oredChars|=*target++=*source++; | |
652 | oredChars|=*target++=*source++; | |
653 | oredChars|=*target++=*source++; | |
654 | oredChars|=*target++=*source++; | |
655 | oredChars|=*target++=*source++; | |
656 | oredChars|=*target++=*source++; | |
657 | oredChars|=*target++=*source++; | |
658 | oredChars|=*target++=*source++; | |
659 | oredChars|=*target++=*source++; | |
660 | oredChars|=*target++=*source++; | |
661 | oredChars|=*target++=*source++; | |
662 | oredChars|=*target++=*source++; | |
663 | oredChars|=*target++=*source++; | |
664 | oredChars|=*target++=*source++; | |
665 | oredChars|=*target++=*source++; | |
666 | ||
667 | /* were all 16 entries really valid? */ | |
668 | if(oredChars>0x7f) { | |
669 | /* no, return to the first of these 16 */ | |
670 | source-=16; | |
671 | target-=16; | |
672 | break; | |
673 | } | |
674 | } while(--count>0); | |
675 | count=loops-count; | |
676 | targetCapacity-=16*count; | |
677 | } | |
678 | ||
679 | /* conversion loop */ | |
680 | c=0; | |
681 | while(targetCapacity>0 && (c=*source)<=0x7f) { | |
682 | ++source; | |
683 | *target++=c; | |
684 | --targetCapacity; | |
685 | } | |
686 | ||
687 | if(c>0x7f) { | |
688 | /* non-ASCII character, handle in standard converter */ | |
689 | *pErrorCode=U_USING_DEFAULT_WARNING; | |
690 | } else if(source<sourceLimit && target>=(const uint8_t *)pFromUArgs->targetLimit) { | |
691 | /* target is full */ | |
692 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
693 | } | |
694 | ||
695 | /* write back the updated pointers */ | |
696 | pToUArgs->source=(const char *)source; | |
697 | pFromUArgs->target=(char *)target; | |
698 | } | |
699 | ||
f3c0d7a5 | 700 | static void U_CALLCONV |
b75a7d8f | 701 | _ASCIIGetUnicodeSet(const UConverter *cnv, |
73c04bcf | 702 | const USetAdder *sa, |
b75a7d8f A |
703 | UConverterUnicodeSet which, |
704 | UErrorCode *pErrorCode) { | |
f3c0d7a5 A |
705 | (void)cnv; |
706 | (void)which; | |
707 | (void)pErrorCode; | |
374ca955 | 708 | sa->addRange(sa->set, 0, 0x7f); |
b75a7d8f | 709 | } |
f3c0d7a5 | 710 | U_CDECL_END |
b75a7d8f A |
711 | |
712 | static const UConverterImpl _ASCIIImpl={ | |
713 | UCNV_US_ASCII, | |
714 | ||
715 | NULL, | |
716 | NULL, | |
717 | ||
718 | NULL, | |
719 | NULL, | |
720 | NULL, | |
721 | ||
722 | _ASCIIToUnicodeWithOffsets, | |
723 | _ASCIIToUnicodeWithOffsets, | |
724 | _Latin1FromUnicodeWithOffsets, | |
725 | _Latin1FromUnicodeWithOffsets, | |
726 | _ASCIIGetNextUChar, | |
727 | ||
728 | NULL, | |
729 | NULL, | |
730 | NULL, | |
731 | NULL, | |
46f4442e A |
732 | _ASCIIGetUnicodeSet, |
733 | ||
734 | NULL, | |
735 | ucnv_ASCIIFromUTF8 | |
b75a7d8f A |
736 | }; |
737 | ||
738 | static const UConverterStaticData _ASCIIStaticData={ | |
739 | sizeof(UConverterStaticData), | |
740 | "US-ASCII", | |
741 | 367, UCNV_IBM, UCNV_US_ASCII, 1, 1, | |
742 | { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE, | |
743 | 0, | |
744 | 0, | |
745 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ | |
746 | }; | |
747 | ||
2ca993e8 A |
748 | const UConverterSharedData _ASCIIData= |
749 | UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ASCIIStaticData, &_ASCIIImpl); | |
374ca955 A |
750 | |
751 | #endif |