]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ********************************************************************** | |
51004dcb | 3 | * Copyright (C) 2000-2012, International Business Machines |
b75a7d8f A |
4 | * Corporation and others. All Rights Reserved. |
5 | ********************************************************************** | |
6 | * file name: ucnvlat1.cpp | |
7 | * encoding: US-ASCII | |
8 | * tab size: 8 (not used) | |
9 | * indentation:4 | |
10 | * | |
11 | * created on: 2000feb07 | |
12 | * created by: Markus W. Scherer | |
13 | */ | |
14 | ||
15 | #include "unicode/utypes.h" | |
374ca955 A |
16 | |
17 | #if !UCONFIG_NO_CONVERSION | |
18 | ||
b75a7d8f | 19 | #include "unicode/ucnv.h" |
b75a7d8f | 20 | #include "unicode/uset.h" |
4388f060 | 21 | #include "unicode/utf8.h" |
b75a7d8f A |
22 | #include "ucnv_bld.h" |
23 | #include "ucnv_cnv.h" | |
24 | ||
25 | /* control optimizations according to the platform */ | |
b75a7d8f | 26 | #define LATIN1_UNROLL_FROM_UNICODE 1 |
b75a7d8f A |
27 | |
28 | /* ISO 8859-1 --------------------------------------------------------------- */ | |
29 | ||
374ca955 | 30 | /* This is a table-less and callback-less version of ucnv_MBCSSingleToBMPWithOffsets(). */ |
b75a7d8f A |
31 | static void |
32 | _Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, | |
33 | UErrorCode *pErrorCode) { | |
34 | const uint8_t *source; | |
35 | UChar *target; | |
36 | int32_t targetCapacity, length; | |
37 | int32_t *offsets; | |
38 | ||
39 | int32_t sourceIndex; | |
40 | ||
41 | /* set up the local pointers */ | |
42 | source=(const uint8_t *)pArgs->source; | |
43 | target=pArgs->target; | |
73c04bcf | 44 | targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
b75a7d8f A |
45 | offsets=pArgs->offsets; |
46 | ||
47 | sourceIndex=0; | |
48 | ||
49 | /* | |
50 | * since the conversion here is 1:1 UChar:uint8_t, we need only one counter | |
51 | * for the minimum of the sourceLength and targetCapacity | |
52 | */ | |
73c04bcf | 53 | length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); |
b75a7d8f A |
54 | if(length<=targetCapacity) { |
55 | targetCapacity=length; | |
56 | } else { | |
57 | /* target will be full */ | |
58 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
59 | length=targetCapacity; | |
60 | } | |
61 | ||
46f4442e A |
62 | if(targetCapacity>=8) { |
63 | /* This loop is unrolled for speed and improved pipelining. */ | |
b75a7d8f A |
64 | int32_t count, loops; |
65 | ||
46f4442e A |
66 | loops=count=targetCapacity>>3; |
67 | length=targetCapacity&=0x7; | |
b75a7d8f | 68 | do { |
46f4442e A |
69 | target[0]=source[0]; |
70 | target[1]=source[1]; | |
71 | target[2]=source[2]; | |
72 | target[3]=source[3]; | |
73 | target[4]=source[4]; | |
74 | target[5]=source[5]; | |
75 | target[6]=source[6]; | |
76 | target[7]=source[7]; | |
77 | target+=8; | |
78 | source+=8; | |
b75a7d8f A |
79 | } while(--count>0); |
80 | ||
81 | if(offsets!=NULL) { | |
82 | do { | |
46f4442e A |
83 | offsets[0]=sourceIndex++; |
84 | offsets[1]=sourceIndex++; | |
85 | offsets[2]=sourceIndex++; | |
86 | offsets[3]=sourceIndex++; | |
87 | offsets[4]=sourceIndex++; | |
88 | offsets[5]=sourceIndex++; | |
89 | offsets[6]=sourceIndex++; | |
90 | offsets[7]=sourceIndex++; | |
91 | offsets+=8; | |
b75a7d8f A |
92 | } while(--loops>0); |
93 | } | |
94 | } | |
b75a7d8f A |
95 | |
96 | /* conversion loop */ | |
97 | while(targetCapacity>0) { | |
98 | *target++=*source++; | |
99 | --targetCapacity; | |
100 | } | |
101 | ||
102 | /* write back the updated pointers */ | |
103 | pArgs->source=(const char *)source; | |
104 | pArgs->target=target; | |
105 | ||
106 | /* set offsets */ | |
107 | if(offsets!=NULL) { | |
108 | while(length>0) { | |
109 | *offsets++=sourceIndex++; | |
110 | --length; | |
111 | } | |
112 | pArgs->offsets=offsets; | |
113 | } | |
114 | } | |
115 | ||
374ca955 | 116 | /* This is a table-less and callback-less version of ucnv_MBCSSingleGetNextUChar(). */ |
b75a7d8f A |
117 | static UChar32 |
118 | _Latin1GetNextUChar(UConverterToUnicodeArgs *pArgs, | |
119 | UErrorCode *pErrorCode) { | |
120 | const uint8_t *source=(const uint8_t *)pArgs->source; | |
121 | if(source<(const uint8_t *)pArgs->sourceLimit) { | |
122 | pArgs->source=(const char *)(source+1); | |
123 | return *source; | |
124 | } | |
125 | ||
126 | /* no output because of empty input */ | |
127 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
128 | return 0xffff; | |
129 | } | |
130 | ||
374ca955 | 131 | /* This is a table-less version of ucnv_MBCSSingleFromBMPWithOffsets(). */ |
b75a7d8f A |
132 | static void |
133 | _Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, | |
134 | UErrorCode *pErrorCode) { | |
135 | UConverter *cnv; | |
374ca955 A |
136 | const UChar *source, *sourceLimit; |
137 | uint8_t *target, *oldTarget; | |
b75a7d8f A |
138 | int32_t targetCapacity, length; |
139 | int32_t *offsets; | |
140 | ||
374ca955 A |
141 | UChar32 cp; |
142 | UChar c, max; | |
b75a7d8f A |
143 | |
144 | int32_t sourceIndex; | |
145 | ||
b75a7d8f A |
146 | /* set up the local pointers */ |
147 | cnv=pArgs->converter; | |
148 | source=pArgs->source; | |
149 | sourceLimit=pArgs->sourceLimit; | |
374ca955 | 150 | target=oldTarget=(uint8_t *)pArgs->target; |
73c04bcf | 151 | targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
b75a7d8f A |
152 | offsets=pArgs->offsets; |
153 | ||
154 | if(cnv->sharedData==&_Latin1Data) { | |
155 | max=0xff; /* Latin-1 */ | |
156 | } else { | |
157 | max=0x7f; /* US-ASCII */ | |
158 | } | |
159 | ||
160 | /* get the converter state from UConverter */ | |
374ca955 | 161 | cp=cnv->fromUChar32; |
b75a7d8f A |
162 | |
163 | /* sourceIndex=-1 if the current character began in the previous buffer */ | |
374ca955 | 164 | sourceIndex= cp==0 ? 0 : -1; |
b75a7d8f A |
165 | |
166 | /* | |
167 | * since the conversion here is 1:1 UChar:uint8_t, we need only one counter | |
168 | * for the minimum of the sourceLength and targetCapacity | |
169 | */ | |
73c04bcf | 170 | length=(int32_t)(sourceLimit-source); |
b75a7d8f A |
171 | if(length<targetCapacity) { |
172 | targetCapacity=length; | |
173 | } | |
174 | ||
175 | /* conversion loop */ | |
374ca955 | 176 | if(cp!=0 && targetCapacity>0) { |
b75a7d8f A |
177 | goto getTrail; |
178 | } | |
179 | ||
180 | #if LATIN1_UNROLL_FROM_UNICODE | |
181 | /* unroll the loop with the most common case */ | |
b75a7d8f A |
182 | if(targetCapacity>=16) { |
183 | int32_t count, loops; | |
184 | UChar u, oredChars; | |
185 | ||
186 | loops=count=targetCapacity>>4; | |
187 | do { | |
188 | oredChars=u=*source++; | |
189 | *target++=(uint8_t)u; | |
190 | oredChars|=u=*source++; | |
191 | *target++=(uint8_t)u; | |
192 | oredChars|=u=*source++; | |
193 | *target++=(uint8_t)u; | |
194 | oredChars|=u=*source++; | |
195 | *target++=(uint8_t)u; | |
196 | oredChars|=u=*source++; | |
197 | *target++=(uint8_t)u; | |
198 | oredChars|=u=*source++; | |
199 | *target++=(uint8_t)u; | |
200 | oredChars|=u=*source++; | |
201 | *target++=(uint8_t)u; | |
202 | oredChars|=u=*source++; | |
203 | *target++=(uint8_t)u; | |
204 | oredChars|=u=*source++; | |
205 | *target++=(uint8_t)u; | |
206 | oredChars|=u=*source++; | |
207 | *target++=(uint8_t)u; | |
208 | oredChars|=u=*source++; | |
209 | *target++=(uint8_t)u; | |
210 | oredChars|=u=*source++; | |
211 | *target++=(uint8_t)u; | |
212 | oredChars|=u=*source++; | |
213 | *target++=(uint8_t)u; | |
214 | oredChars|=u=*source++; | |
215 | *target++=(uint8_t)u; | |
216 | oredChars|=u=*source++; | |
217 | *target++=(uint8_t)u; | |
218 | oredChars|=u=*source++; | |
219 | *target++=(uint8_t)u; | |
220 | ||
221 | /* were all 16 entries really valid? */ | |
222 | if(oredChars>max) { | |
223 | /* no, return to the first of these 16 */ | |
224 | source-=16; | |
225 | target-=16; | |
226 | break; | |
227 | } | |
228 | } while(--count>0); | |
229 | count=loops-count; | |
230 | targetCapacity-=16*count; | |
231 | ||
232 | if(offsets!=NULL) { | |
374ca955 | 233 | oldTarget+=16*count; |
b75a7d8f A |
234 | while(count>0) { |
235 | *offsets++=sourceIndex++; | |
236 | *offsets++=sourceIndex++; | |
237 | *offsets++=sourceIndex++; | |
238 | *offsets++=sourceIndex++; | |
239 | *offsets++=sourceIndex++; | |
240 | *offsets++=sourceIndex++; | |
241 | *offsets++=sourceIndex++; | |
242 | *offsets++=sourceIndex++; | |
243 | *offsets++=sourceIndex++; | |
244 | *offsets++=sourceIndex++; | |
245 | *offsets++=sourceIndex++; | |
246 | *offsets++=sourceIndex++; | |
247 | *offsets++=sourceIndex++; | |
248 | *offsets++=sourceIndex++; | |
249 | *offsets++=sourceIndex++; | |
250 | *offsets++=sourceIndex++; | |
251 | --count; | |
252 | } | |
253 | } | |
b75a7d8f A |
254 | } |
255 | #endif | |
256 | ||
374ca955 A |
257 | /* conversion loop */ |
258 | c=0; | |
259 | while(targetCapacity>0 && (c=*source++)<=max) { | |
260 | /* convert the Unicode code point */ | |
261 | *target++=(uint8_t)c; | |
262 | --targetCapacity; | |
263 | } | |
264 | ||
265 | if(c>max) { | |
266 | cp=c; | |
267 | if(!U_IS_SURROGATE(cp)) { | |
268 | /* callback(unassigned) */ | |
269 | } else if(U_IS_SURROGATE_LEAD(cp)) { | |
b75a7d8f | 270 | getTrail: |
374ca955 A |
271 | if(source<sourceLimit) { |
272 | /* test the following code unit */ | |
273 | UChar trail=*source; | |
274 | if(U16_IS_TRAIL(trail)) { | |
275 | ++source; | |
276 | cp=U16_GET_SUPPLEMENTARY(cp, trail); | |
277 | /* this codepage does not map supplementary code points */ | |
278 | /* callback(unassigned) */ | |
b75a7d8f | 279 | } else { |
374ca955 A |
280 | /* this is an unmatched lead code unit (1st surrogate) */ |
281 | /* callback(illegal) */ | |
b75a7d8f A |
282 | } |
283 | } else { | |
374ca955 A |
284 | /* no more input */ |
285 | cnv->fromUChar32=cp; | |
286 | goto noMoreInput; | |
b75a7d8f | 287 | } |
374ca955 A |
288 | } else { |
289 | /* this is an unmatched trail code unit (2nd surrogate) */ | |
290 | /* callback(illegal) */ | |
b75a7d8f | 291 | } |
b75a7d8f | 292 | |
374ca955 A |
293 | *pErrorCode= U_IS_SURROGATE(cp) ? U_ILLEGAL_CHAR_FOUND : U_INVALID_CHAR_FOUND; |
294 | cnv->fromUChar32=cp; | |
b75a7d8f | 295 | } |
374ca955 | 296 | noMoreInput: |
b75a7d8f | 297 | |
374ca955 | 298 | /* set offsets since the start */ |
b75a7d8f | 299 | if(offsets!=NULL) { |
374ca955 | 300 | size_t count=target-oldTarget; |
b75a7d8f A |
301 | while(count>0) { |
302 | *offsets++=sourceIndex++; | |
303 | --count; | |
304 | } | |
305 | } | |
306 | ||
374ca955 A |
307 | if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) { |
308 | /* target is full */ | |
309 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
b75a7d8f A |
310 | } |
311 | ||
312 | /* write back the updated pointers */ | |
313 | pArgs->source=source; | |
314 | pArgs->target=(char *)target; | |
315 | pArgs->offsets=offsets; | |
316 | } | |
317 | ||
46f4442e A |
318 | /* Convert UTF-8 to Latin-1. Adapted from ucnv_SBCSFromUTF8(). */ |
319 | static void | |
320 | ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, | |
321 | UConverterToUnicodeArgs *pToUArgs, | |
322 | UErrorCode *pErrorCode) { | |
323 | UConverter *utf8; | |
324 | const uint8_t *source, *sourceLimit; | |
325 | uint8_t *target; | |
326 | int32_t targetCapacity; | |
327 | ||
328 | UChar32 c; | |
329 | uint8_t b, t1; | |
330 | ||
331 | /* set up the local pointers */ | |
332 | utf8=pToUArgs->converter; | |
333 | source=(uint8_t *)pToUArgs->source; | |
334 | sourceLimit=(uint8_t *)pToUArgs->sourceLimit; | |
335 | target=(uint8_t *)pFromUArgs->target; | |
336 | targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); | |
337 | ||
338 | /* get the converter state from the UTF-8 UConverter */ | |
339 | c=(UChar32)utf8->toUnicodeStatus; | |
340 | if(c!=0 && source<sourceLimit) { | |
341 | if(targetCapacity==0) { | |
342 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
343 | return; | |
344 | } else if(c>=0xc2 && c<=0xc3 && (t1=(uint8_t)(*source-0x80)) <= 0x3f) { | |
345 | ++source; | |
346 | *target++=(uint8_t)(((c&3)<<6)|t1); | |
347 | --targetCapacity; | |
348 | ||
349 | utf8->toUnicodeStatus=0; | |
350 | utf8->toULength=0; | |
351 | } else { | |
352 | /* complicated, illegal or unmappable input: fall back to the pivoting implementation */ | |
353 | *pErrorCode=U_USING_DEFAULT_WARNING; | |
354 | return; | |
355 | } | |
356 | } | |
357 | ||
358 | /* | |
359 | * Make sure that the last byte sequence before sourceLimit is complete | |
360 | * or runs into a lead byte. | |
361 | * In the conversion loop compare source with sourceLimit only once | |
362 | * per multi-byte character. | |
363 | * For Latin-1, adjust sourceLimit only for 1 trail byte because | |
364 | * the conversion loop handles at most 2-byte sequences. | |
365 | */ | |
366 | if(source<sourceLimit && U8_IS_LEAD(*(sourceLimit-1))) { | |
367 | --sourceLimit; | |
368 | } | |
369 | ||
370 | /* conversion loop */ | |
371 | while(source<sourceLimit) { | |
372 | if(targetCapacity>0) { | |
373 | b=*source++; | |
374 | if((int8_t)b>=0) { | |
375 | /* convert ASCII */ | |
376 | *target++=(uint8_t)b; | |
377 | --targetCapacity; | |
378 | } else if( /* handle U+0080..U+00FF inline */ | |
379 | b>=0xc2 && b<=0xc3 && | |
380 | (t1=(uint8_t)(*source-0x80)) <= 0x3f | |
381 | ) { | |
382 | ++source; | |
383 | *target++=(uint8_t)(((b&3)<<6)|t1); | |
384 | --targetCapacity; | |
385 | } else { | |
386 | /* complicated, illegal or unmappable input: fall back to the pivoting implementation */ | |
387 | pToUArgs->source=(char *)(source-1); | |
388 | pFromUArgs->target=(char *)target; | |
389 | *pErrorCode=U_USING_DEFAULT_WARNING; | |
390 | return; | |
391 | } | |
392 | } else { | |
393 | /* target is full */ | |
394 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
395 | break; | |
396 | } | |
397 | } | |
398 | ||
399 | /* | |
400 | * The sourceLimit may have been adjusted before the conversion loop | |
401 | * to stop before a truncated sequence. | |
402 | * If so, then collect the truncated sequence now. | |
403 | * For Latin-1, there is at most exactly one lead byte because of the | |
404 | * smaller sourceLimit adjustment logic. | |
405 | */ | |
406 | if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) { | |
407 | utf8->toUnicodeStatus=utf8->toUBytes[0]=b=*source++; | |
408 | utf8->toULength=1; | |
51004dcb | 409 | utf8->mode=U8_COUNT_TRAIL_BYTES(b)+1; |
46f4442e A |
410 | } |
411 | ||
412 | /* write back the updated pointers */ | |
413 | pToUArgs->source=(char *)source; | |
414 | pFromUArgs->target=(char *)target; | |
415 | } | |
416 | ||
b75a7d8f A |
417 | static void |
418 | _Latin1GetUnicodeSet(const UConverter *cnv, | |
73c04bcf | 419 | const USetAdder *sa, |
b75a7d8f A |
420 | UConverterUnicodeSet which, |
421 | UErrorCode *pErrorCode) { | |
374ca955 | 422 | sa->addRange(sa->set, 0, 0xff); |
b75a7d8f A |
423 | } |
424 | ||
425 | static const UConverterImpl _Latin1Impl={ | |
426 | UCNV_LATIN_1, | |
427 | ||
428 | NULL, | |
429 | NULL, | |
430 | ||
431 | NULL, | |
432 | NULL, | |
433 | NULL, | |
434 | ||
435 | _Latin1ToUnicodeWithOffsets, | |
436 | _Latin1ToUnicodeWithOffsets, | |
437 | _Latin1FromUnicodeWithOffsets, | |
438 | _Latin1FromUnicodeWithOffsets, | |
439 | _Latin1GetNextUChar, | |
440 | ||
441 | NULL, | |
442 | NULL, | |
443 | NULL, | |
444 | NULL, | |
46f4442e A |
445 | _Latin1GetUnicodeSet, |
446 | ||
447 | NULL, | |
448 | ucnv_Latin1FromUTF8 | |
b75a7d8f A |
449 | }; |
450 | ||
451 | static const UConverterStaticData _Latin1StaticData={ | |
452 | sizeof(UConverterStaticData), | |
453 | "ISO-8859-1", | |
454 | 819, UCNV_IBM, UCNV_LATIN_1, 1, 1, | |
455 | { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE, | |
456 | 0, | |
457 | 0, | |
458 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ | |
459 | }; | |
460 | ||
461 | const UConverterSharedData _Latin1Data={ | |
462 | sizeof(UConverterSharedData), ~((uint32_t) 0), | |
463 | NULL, NULL, &_Latin1StaticData, FALSE, &_Latin1Impl, | |
464 | 0 | |
465 | }; | |
466 | ||
467 | /* US-ASCII ----------------------------------------------------------------- */ | |
468 | ||
374ca955 | 469 | /* This is a table-less version of ucnv_MBCSSingleToBMPWithOffsets(). */ |
b75a7d8f A |
470 | static void |
471 | _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, | |
472 | UErrorCode *pErrorCode) { | |
374ca955 A |
473 | const uint8_t *source, *sourceLimit; |
474 | UChar *target, *oldTarget; | |
b75a7d8f A |
475 | int32_t targetCapacity, length; |
476 | int32_t *offsets; | |
477 | ||
478 | int32_t sourceIndex; | |
479 | ||
374ca955 A |
480 | uint8_t c; |
481 | ||
b75a7d8f A |
482 | /* set up the local pointers */ |
483 | source=(const uint8_t *)pArgs->source; | |
484 | sourceLimit=(const uint8_t *)pArgs->sourceLimit; | |
374ca955 | 485 | target=oldTarget=pArgs->target; |
73c04bcf | 486 | targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
b75a7d8f A |
487 | offsets=pArgs->offsets; |
488 | ||
489 | /* sourceIndex=-1 if the current character began in the previous buffer */ | |
490 | sourceIndex=0; | |
b75a7d8f A |
491 | |
492 | /* | |
493 | * since the conversion here is 1:1 UChar:uint8_t, we need only one counter | |
494 | * for the minimum of the sourceLength and targetCapacity | |
495 | */ | |
73c04bcf | 496 | length=(int32_t)(sourceLimit-source); |
b75a7d8f A |
497 | if(length<targetCapacity) { |
498 | targetCapacity=length; | |
499 | } | |
500 | ||
46f4442e A |
501 | if(targetCapacity>=8) { |
502 | /* This loop is unrolled for speed and improved pipelining. */ | |
b75a7d8f A |
503 | int32_t count, loops; |
504 | UChar oredChars; | |
505 | ||
46f4442e | 506 | loops=count=targetCapacity>>3; |
b75a7d8f | 507 | do { |
46f4442e A |
508 | oredChars=target[0]=source[0]; |
509 | oredChars|=target[1]=source[1]; | |
510 | oredChars|=target[2]=source[2]; | |
511 | oredChars|=target[3]=source[3]; | |
512 | oredChars|=target[4]=source[4]; | |
513 | oredChars|=target[5]=source[5]; | |
514 | oredChars|=target[6]=source[6]; | |
515 | oredChars|=target[7]=source[7]; | |
b75a7d8f A |
516 | |
517 | /* were all 16 entries really valid? */ | |
518 | if(oredChars>0x7f) { | |
519 | /* no, return to the first of these 16 */ | |
b75a7d8f A |
520 | break; |
521 | } | |
46f4442e A |
522 | source+=8; |
523 | target+=8; | |
b75a7d8f A |
524 | } while(--count>0); |
525 | count=loops-count; | |
46f4442e | 526 | targetCapacity-=count*8; |
b75a7d8f A |
527 | |
528 | if(offsets!=NULL) { | |
46f4442e | 529 | oldTarget+=count*8; |
b75a7d8f | 530 | while(count>0) { |
46f4442e A |
531 | offsets[0]=sourceIndex++; |
532 | offsets[1]=sourceIndex++; | |
533 | offsets[2]=sourceIndex++; | |
534 | offsets[3]=sourceIndex++; | |
535 | offsets[4]=sourceIndex++; | |
536 | offsets[5]=sourceIndex++; | |
537 | offsets[6]=sourceIndex++; | |
538 | offsets[7]=sourceIndex++; | |
539 | offsets+=8; | |
b75a7d8f A |
540 | --count; |
541 | } | |
542 | } | |
543 | } | |
b75a7d8f A |
544 | |
545 | /* conversion loop */ | |
374ca955 A |
546 | c=0; |
547 | while(targetCapacity>0 && (c=*source++)<=0x7f) { | |
548 | *target++=c; | |
549 | --targetCapacity; | |
b75a7d8f A |
550 | } |
551 | ||
374ca955 A |
552 | if(c>0x7f) { |
553 | /* callback(illegal); copy the current bytes to toUBytes[] */ | |
554 | UConverter *cnv=pArgs->converter; | |
555 | cnv->toUBytes[0]=c; | |
556 | cnv->toULength=1; | |
557 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; | |
558 | } else if(source<sourceLimit && target>=pArgs->targetLimit) { | |
b75a7d8f A |
559 | /* target is full */ |
560 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
561 | } | |
562 | ||
374ca955 | 563 | /* set offsets since the start */ |
b75a7d8f | 564 | if(offsets!=NULL) { |
374ca955 | 565 | size_t count=target-oldTarget; |
b75a7d8f A |
566 | while(count>0) { |
567 | *offsets++=sourceIndex++; | |
568 | --count; | |
569 | } | |
570 | } | |
571 | ||
572 | /* write back the updated pointers */ | |
573 | pArgs->source=(const char *)source; | |
574 | pArgs->target=target; | |
575 | pArgs->offsets=offsets; | |
576 | } | |
577 | ||
374ca955 | 578 | /* This is a table-less version of ucnv_MBCSSingleGetNextUChar(). */ |
b75a7d8f A |
579 | static UChar32 |
580 | _ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs, | |
581 | UErrorCode *pErrorCode) { | |
b75a7d8f A |
582 | const uint8_t *source; |
583 | uint8_t b; | |
584 | ||
b75a7d8f | 585 | source=(const uint8_t *)pArgs->source; |
374ca955 | 586 | if(source<(const uint8_t *)pArgs->sourceLimit) { |
b75a7d8f A |
587 | b=*source++; |
588 | pArgs->source=(const char *)source; | |
589 | if(b<=0x7f) { | |
590 | return b; | |
591 | } else { | |
b75a7d8f | 592 | UConverter *cnv=pArgs->converter; |
374ca955 A |
593 | cnv->toUBytes[0]=b; |
594 | cnv->toULength=1; | |
b75a7d8f | 595 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
374ca955 | 596 | return 0xffff; |
b75a7d8f A |
597 | } |
598 | } | |
599 | ||
374ca955 | 600 | /* no output because of empty input */ |
b75a7d8f A |
601 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
602 | return 0xffff; | |
603 | } | |
604 | ||
46f4442e A |
605 | /* "Convert" UTF-8 to US-ASCII: Validate and copy. */ |
606 | static void | |
607 | ucnv_ASCIIFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, | |
608 | UConverterToUnicodeArgs *pToUArgs, | |
609 | UErrorCode *pErrorCode) { | |
610 | const uint8_t *source, *sourceLimit; | |
611 | uint8_t *target; | |
612 | int32_t targetCapacity, length; | |
613 | ||
614 | uint8_t c; | |
615 | ||
616 | if(pToUArgs->converter->toUnicodeStatus!=0) { | |
617 | /* no handling of partial UTF-8 characters here, fall back to pivoting */ | |
618 | *pErrorCode=U_USING_DEFAULT_WARNING; | |
619 | return; | |
620 | } | |
621 | ||
622 | /* set up the local pointers */ | |
623 | source=(const uint8_t *)pToUArgs->source; | |
624 | sourceLimit=(const uint8_t *)pToUArgs->sourceLimit; | |
625 | target=(uint8_t *)pFromUArgs->target; | |
626 | targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); | |
627 | ||
628 | /* | |
629 | * since the conversion here is 1:1 uint8_t:uint8_t, we need only one counter | |
630 | * for the minimum of the sourceLength and targetCapacity | |
631 | */ | |
632 | length=(int32_t)(sourceLimit-source); | |
633 | if(length<targetCapacity) { | |
634 | targetCapacity=length; | |
635 | } | |
636 | ||
637 | /* unroll the loop with the most common case */ | |
638 | if(targetCapacity>=16) { | |
639 | int32_t count, loops; | |
640 | uint8_t oredChars; | |
641 | ||
642 | loops=count=targetCapacity>>4; | |
643 | do { | |
644 | oredChars=*target++=*source++; | |
645 | oredChars|=*target++=*source++; | |
646 | oredChars|=*target++=*source++; | |
647 | oredChars|=*target++=*source++; | |
648 | oredChars|=*target++=*source++; | |
649 | oredChars|=*target++=*source++; | |
650 | oredChars|=*target++=*source++; | |
651 | oredChars|=*target++=*source++; | |
652 | oredChars|=*target++=*source++; | |
653 | oredChars|=*target++=*source++; | |
654 | oredChars|=*target++=*source++; | |
655 | oredChars|=*target++=*source++; | |
656 | oredChars|=*target++=*source++; | |
657 | oredChars|=*target++=*source++; | |
658 | oredChars|=*target++=*source++; | |
659 | oredChars|=*target++=*source++; | |
660 | ||
661 | /* were all 16 entries really valid? */ | |
662 | if(oredChars>0x7f) { | |
663 | /* no, return to the first of these 16 */ | |
664 | source-=16; | |
665 | target-=16; | |
666 | break; | |
667 | } | |
668 | } while(--count>0); | |
669 | count=loops-count; | |
670 | targetCapacity-=16*count; | |
671 | } | |
672 | ||
673 | /* conversion loop */ | |
674 | c=0; | |
675 | while(targetCapacity>0 && (c=*source)<=0x7f) { | |
676 | ++source; | |
677 | *target++=c; | |
678 | --targetCapacity; | |
679 | } | |
680 | ||
681 | if(c>0x7f) { | |
682 | /* non-ASCII character, handle in standard converter */ | |
683 | *pErrorCode=U_USING_DEFAULT_WARNING; | |
684 | } else if(source<sourceLimit && target>=(const uint8_t *)pFromUArgs->targetLimit) { | |
685 | /* target is full */ | |
686 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
687 | } | |
688 | ||
689 | /* write back the updated pointers */ | |
690 | pToUArgs->source=(const char *)source; | |
691 | pFromUArgs->target=(char *)target; | |
692 | } | |
693 | ||
b75a7d8f A |
694 | static void |
695 | _ASCIIGetUnicodeSet(const UConverter *cnv, | |
73c04bcf | 696 | const USetAdder *sa, |
b75a7d8f A |
697 | UConverterUnicodeSet which, |
698 | UErrorCode *pErrorCode) { | |
374ca955 | 699 | sa->addRange(sa->set, 0, 0x7f); |
b75a7d8f A |
700 | } |
701 | ||
702 | static const UConverterImpl _ASCIIImpl={ | |
703 | UCNV_US_ASCII, | |
704 | ||
705 | NULL, | |
706 | NULL, | |
707 | ||
708 | NULL, | |
709 | NULL, | |
710 | NULL, | |
711 | ||
712 | _ASCIIToUnicodeWithOffsets, | |
713 | _ASCIIToUnicodeWithOffsets, | |
714 | _Latin1FromUnicodeWithOffsets, | |
715 | _Latin1FromUnicodeWithOffsets, | |
716 | _ASCIIGetNextUChar, | |
717 | ||
718 | NULL, | |
719 | NULL, | |
720 | NULL, | |
721 | NULL, | |
46f4442e A |
722 | _ASCIIGetUnicodeSet, |
723 | ||
724 | NULL, | |
725 | ucnv_ASCIIFromUTF8 | |
b75a7d8f A |
726 | }; |
727 | ||
728 | static const UConverterStaticData _ASCIIStaticData={ | |
729 | sizeof(UConverterStaticData), | |
730 | "US-ASCII", | |
731 | 367, UCNV_IBM, UCNV_US_ASCII, 1, 1, | |
732 | { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE, | |
733 | 0, | |
734 | 0, | |
735 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ | |
736 | }; | |
737 | ||
738 | const UConverterSharedData _ASCIIData={ | |
739 | sizeof(UConverterSharedData), ~((uint32_t) 0), | |
740 | NULL, NULL, &_ASCIIStaticData, FALSE, &_ASCIIImpl, | |
741 | 0 | |
742 | }; | |
374ca955 A |
743 | |
744 | #endif |