]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ********************************************************************** | |
73c04bcf | 3 | * Copyright (C) 2002-2006, International Business Machines |
b75a7d8f A |
4 | * Corporation and others. All Rights Reserved. |
5 | ********************************************************************** | |
6 | * file name: ucnv_u16.c | |
7 | * encoding: US-ASCII | |
8 | * tab size: 8 (not used) | |
9 | * indentation:4 | |
10 | * | |
11 | * created on: 2002jul01 | |
12 | * created by: Markus W. Scherer | |
13 | * | |
14 | * UTF-16 converter implementation. Used to be in ucnv_utf.c. | |
15 | */ | |
16 | ||
17 | #include "unicode/utypes.h" | |
374ca955 A |
18 | |
19 | #if !UCONFIG_NO_CONVERSION | |
20 | ||
b75a7d8f | 21 | #include "unicode/ucnv.h" |
b75a7d8f A |
22 | #include "ucnv_bld.h" |
23 | #include "ucnv_cnv.h" | |
24 | #include "cmemory.h" | |
25 | ||
73c04bcf A |
/*
 * Value of cnv->fromUnicodeStatus that makes the from-Unicode functions in
 * this file emit a byte order mark before any other output; they reset the
 * status to 0 once the mark has been written.
 */
enum { UCNV_NEED_TO_WRITE_BOM=1 };
29 | ||
374ca955 A |
30 | /* UTF-16BE ----------------------------------------------------------------- */ |
31 | ||
/*
 * _UTF16PEFromUnicodeWithOffsets names the from-Unicode function whose byte
 * order matches U_IS_BIG_ENDIAN: the BE function when it is non-zero,
 * the LE function otherwise.
 */
#if !U_IS_BIG_ENDIAN
#   define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets
#else
#   define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets
#endif
b75a7d8f | 37 | |
73c04bcf | 38 | |
b75a7d8f | 39 | static void |
374ca955 A |
40 | _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, |
41 | UErrorCode *pErrorCode) { | |
42 | UConverter *cnv; | |
43 | const UChar *source; | |
73c04bcf | 44 | char *target; |
374ca955 A |
45 | int32_t *offsets; |
46 | ||
73c04bcf | 47 | uint32_t targetCapacity, length, sourceIndex; |
374ca955 A |
48 | UChar c, trail; |
49 | char overflow[4]; | |
50 | ||
51 | source=pArgs->source; | |
73c04bcf | 52 | length=(int32_t)(pArgs->sourceLimit-source); |
374ca955 | 53 | if(length<=0) { |
b75a7d8f A |
54 | /* no input, nothing to do */ |
55 | return; | |
56 | } | |
57 | ||
73c04bcf A |
58 | cnv=pArgs->converter; |
59 | ||
60 | /* write the BOM if necessary */ | |
61 | if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { | |
62 | static const char bom[]={ (char)0xfe, (char)0xff }; | |
63 | ucnv_fromUWriteBytes(cnv, | |
64 | bom, 2, | |
65 | &pArgs->target, pArgs->targetLimit, | |
66 | &pArgs->offsets, -1, | |
67 | pErrorCode); | |
68 | cnv->fromUnicodeStatus=0; | |
69 | } | |
70 | ||
71 | target=pArgs->target; | |
72 | if(target >= pArgs->targetLimit) { | |
b75a7d8f A |
73 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
74 | return; | |
75 | } | |
76 | ||
73c04bcf | 77 | targetCapacity=(uint32_t)(pArgs->targetLimit-target); |
374ca955 A |
78 | offsets=pArgs->offsets; |
79 | sourceIndex=0; | |
80 | ||
81 | /* c!=0 indicates in several places outside the main loops that a surrogate was found */ | |
82 | ||
83 | if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) { | |
84 | /* the last buffer ended with a lead surrogate, output the surrogate pair */ | |
85 | ++source; | |
b75a7d8f | 86 | --length; |
374ca955 A |
87 | target[0]=(uint8_t)(c>>8); |
88 | target[1]=(uint8_t)c; | |
89 | target[2]=(uint8_t)(trail>>8); | |
90 | target[3]=(uint8_t)trail; | |
91 | target+=4; | |
92 | targetCapacity-=4; | |
93 | if(offsets!=NULL) { | |
94 | *offsets++=-1; | |
95 | *offsets++=-1; | |
96 | *offsets++=-1; | |
97 | *offsets++=-1; | |
b75a7d8f | 98 | } |
374ca955 A |
99 | sourceIndex=1; |
100 | cnv->fromUChar32=c=0; | |
b75a7d8f A |
101 | } |
102 | ||
374ca955 | 103 | if(c==0) { |
73c04bcf A |
104 | /* copy an even number of bytes for complete UChars */ |
105 | uint32_t count=2*length; | |
106 | if(count>targetCapacity) { | |
107 | count=targetCapacity&~1; | |
108 | } | |
109 | /* count is even */ | |
374ca955 A |
110 | targetCapacity-=count; |
111 | count>>=1; | |
112 | length-=count; | |
113 | ||
114 | if(offsets==NULL) { | |
115 | while(count>0) { | |
116 | c=*source++; | |
117 | if(U16_IS_SINGLE(c)) { | |
118 | target[0]=(uint8_t)(c>>8); | |
119 | target[1]=(uint8_t)c; | |
120 | target+=2; | |
121 | } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { | |
122 | ++source; | |
123 | --count; | |
124 | target[0]=(uint8_t)(c>>8); | |
125 | target[1]=(uint8_t)c; | |
126 | target[2]=(uint8_t)(trail>>8); | |
127 | target[3]=(uint8_t)trail; | |
128 | target+=4; | |
129 | } else { | |
130 | break; | |
131 | } | |
132 | --count; | |
133 | } | |
134 | } else { | |
135 | while(count>0) { | |
136 | c=*source++; | |
137 | if(U16_IS_SINGLE(c)) { | |
138 | target[0]=(uint8_t)(c>>8); | |
139 | target[1]=(uint8_t)c; | |
140 | target+=2; | |
141 | *offsets++=sourceIndex; | |
142 | *offsets++=sourceIndex++; | |
143 | } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { | |
144 | ++source; | |
145 | --count; | |
146 | target[0]=(uint8_t)(c>>8); | |
147 | target[1]=(uint8_t)c; | |
148 | target[2]=(uint8_t)(trail>>8); | |
149 | target[3]=(uint8_t)trail; | |
150 | target+=4; | |
151 | *offsets++=sourceIndex; | |
152 | *offsets++=sourceIndex; | |
153 | *offsets++=sourceIndex; | |
154 | *offsets++=sourceIndex; | |
155 | sourceIndex+=2; | |
156 | } else { | |
157 | break; | |
158 | } | |
b75a7d8f A |
159 | --count; |
160 | } | |
161 | } | |
b75a7d8f | 162 | |
374ca955 A |
163 | if(count==0) { |
164 | /* done with the loop for complete UChars */ | |
165 | if(length>0 && targetCapacity>0) { | |
166 | /* | |
167 | * there is more input and some target capacity - | |
168 | * it must be targetCapacity==1 because otherwise | |
169 | * the above would have copied more; | |
170 | * prepare for overflow output | |
171 | */ | |
172 | if(U16_IS_SINGLE(c=*source++)) { | |
173 | overflow[0]=(char)(c>>8); | |
174 | overflow[1]=(char)c; | |
175 | length=2; /* 2 bytes to output */ | |
176 | c=0; | |
177 | /* } else { keep c for surrogate handling, length will be set there */ | |
178 | } | |
179 | } else { | |
180 | length=0; | |
181 | c=0; | |
182 | } | |
b75a7d8f | 183 | } else { |
374ca955 A |
184 | /* keep c for surrogate handling, length will be set there */ |
185 | targetCapacity+=2*count; | |
b75a7d8f | 186 | } |
374ca955 A |
187 | } else { |
188 | length=0; /* from here on, length counts the bytes in overflow[] */ | |
b75a7d8f | 189 | } |
374ca955 A |
190 | |
191 | if(c!=0) { | |
192 | /* | |
193 | * c is a surrogate, and | |
194 | * - source or target too short | |
195 | * - or the surrogate is unmatched | |
196 | */ | |
197 | length=0; | |
198 | if(U16_IS_SURROGATE_LEAD(c)) { | |
199 | if(source<pArgs->sourceLimit) { | |
200 | if(U16_IS_TRAIL(trail=*source)) { | |
201 | /* output the surrogate pair, will overflow (see conditions comment above) */ | |
202 | ++source; | |
203 | overflow[0]=(char)(c>>8); | |
204 | overflow[1]=(char)c; | |
205 | overflow[2]=(char)(trail>>8); | |
206 | overflow[3]=(char)trail; | |
207 | length=4; /* 4 bytes to output */ | |
208 | c=0; | |
209 | } else { | |
210 | /* unmatched lead surrogate */ | |
211 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; | |
212 | } | |
213 | } else { | |
214 | /* see if the trail surrogate is in the next buffer */ | |
215 | } | |
216 | } else { | |
217 | /* unmatched trail surrogate */ | |
218 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; | |
b75a7d8f | 219 | } |
374ca955 | 220 | cnv->fromUChar32=c; |
b75a7d8f A |
221 | } |
222 | ||
374ca955 A |
223 | if(length>0) { |
224 | /* output length bytes with overflow (length>targetCapacity>0) */ | |
225 | ucnv_fromUWriteBytes(cnv, | |
226 | overflow, length, | |
227 | (char **)&target, pArgs->targetLimit, | |
228 | &offsets, sourceIndex, | |
229 | pErrorCode); | |
73c04bcf | 230 | targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); |
b75a7d8f A |
231 | } |
232 | ||
374ca955 A |
233 | if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) { |
234 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
b75a7d8f A |
235 | } |
236 | ||
237 | /* write back the updated pointers */ | |
374ca955 A |
238 | pArgs->source=source; |
239 | pArgs->target=(char *)target; | |
240 | pArgs->offsets=offsets; | |
b75a7d8f A |
241 | } |
242 | ||
b75a7d8f | 243 | static void |
374ca955 | 244 | _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
b75a7d8f | 245 | UErrorCode *pErrorCode) { |
374ca955 A |
246 | UConverter *cnv; |
247 | const uint8_t *source; | |
248 | UChar *target; | |
249 | int32_t *offsets; | |
250 | ||
73c04bcf | 251 | uint32_t targetCapacity, length, count, sourceIndex; |
374ca955 A |
252 | UChar c, trail; |
253 | ||
254 | cnv=pArgs->converter; | |
255 | source=(const uint8_t *)pArgs->source; | |
73c04bcf | 256 | length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); |
374ca955 | 257 | if(length<=0 && cnv->toUnicodeStatus==0) { |
b75a7d8f A |
258 | /* no input, nothing to do */ |
259 | return; | |
260 | } | |
261 | ||
73c04bcf A |
262 | target=pArgs->target; |
263 | if(target >= pArgs->targetLimit) { | |
374ca955 | 264 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
b75a7d8f A |
265 | return; |
266 | } | |
267 | ||
73c04bcf | 268 | targetCapacity=(uint32_t)(pArgs->targetLimit-target); |
374ca955 A |
269 | offsets=pArgs->offsets; |
270 | sourceIndex=0; | |
271 | c=0; | |
272 | ||
273 | /* complete a partial UChar or pair from the last call */ | |
274 | if(cnv->toUnicodeStatus!=0) { | |
b75a7d8f | 275 | /* |
374ca955 A |
276 | * special case: single byte from a previous buffer, |
277 | * where the byte turned out not to belong to a trail surrogate | |
278 | * and the preceding, unmatched lead surrogate was put into toUBytes[] | |
279 | * for error handling | |
b75a7d8f | 280 | */ |
374ca955 A |
281 | cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; |
282 | cnv->toULength=1; | |
283 | cnv->toUnicodeStatus=0; | |
b75a7d8f | 284 | } |
374ca955 A |
285 | if((count=cnv->toULength)!=0) { |
286 | uint8_t *p=cnv->toUBytes; | |
287 | do { | |
288 | p[count++]=*source++; | |
289 | ++sourceIndex; | |
290 | --length; | |
291 | if(count==2) { | |
292 | c=((UChar)p[0]<<8)|p[1]; | |
293 | if(U16_IS_SINGLE(c)) { | |
294 | /* output the BMP code point */ | |
295 | *target++=c; | |
296 | if(offsets!=NULL) { | |
297 | *offsets++=-1; | |
298 | } | |
299 | --targetCapacity; | |
300 | count=0; | |
301 | c=0; | |
302 | break; | |
303 | } else if(U16_IS_SURROGATE_LEAD(c)) { | |
304 | /* continue collecting bytes for the trail surrogate */ | |
305 | c=0; /* avoid unnecessary surrogate handling below */ | |
306 | } else { | |
307 | /* fall through to error handling for an unmatched trail surrogate */ | |
308 | break; | |
309 | } | |
310 | } else if(count==4) { | |
311 | c=((UChar)p[0]<<8)|p[1]; | |
312 | trail=((UChar)p[2]<<8)|p[3]; | |
313 | if(U16_IS_TRAIL(trail)) { | |
314 | /* output the surrogate pair */ | |
315 | *target++=c; | |
316 | if(targetCapacity>=2) { | |
317 | *target++=trail; | |
318 | if(offsets!=NULL) { | |
319 | *offsets++=-1; | |
320 | *offsets++=-1; | |
321 | } | |
322 | targetCapacity-=2; | |
323 | } else /* targetCapacity==1 */ { | |
324 | targetCapacity=0; | |
325 | cnv->UCharErrorBuffer[0]=trail; | |
326 | cnv->UCharErrorBufferLength=1; | |
327 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
328 | } | |
329 | count=0; | |
330 | c=0; | |
331 | break; | |
332 | } else { | |
333 | /* unmatched lead surrogate, handle here for consistent toUBytes[] */ | |
334 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; | |
335 | ||
336 | /* back out reading the code unit after it */ | |
337 | if(((const uint8_t *)pArgs->source-source)>=2) { | |
338 | source-=2; | |
339 | } else { | |
340 | /* | |
341 | * if the trail unit's first byte was in a previous buffer, then | |
342 | * we need to put it into a special place because toUBytes[] will be | |
343 | * used for the lead unit's bytes | |
344 | */ | |
345 | cnv->toUnicodeStatus=0x100|p[2]; | |
346 | --source; | |
347 | } | |
348 | cnv->toULength=2; | |
349 | ||
350 | /* write back the updated pointers */ | |
351 | pArgs->source=(const char *)source; | |
352 | pArgs->target=target; | |
353 | pArgs->offsets=offsets; | |
354 | return; | |
355 | } | |
b75a7d8f | 356 | } |
374ca955 A |
357 | } while(length>0); |
358 | cnv->toULength=(int8_t)count; | |
b75a7d8f A |
359 | } |
360 | ||
374ca955 A |
361 | /* copy an even number of bytes for complete UChars */ |
362 | count=2*targetCapacity; | |
363 | if(count>length) { | |
364 | count=length&~1; | |
365 | } | |
366 | if(c==0 && count>0) { | |
367 | length-=count; | |
368 | count>>=1; | |
369 | targetCapacity-=count; | |
370 | if(offsets==NULL) { | |
371 | do { | |
372 | c=((UChar)source[0]<<8)|source[1]; | |
373 | source+=2; | |
374 | if(U16_IS_SINGLE(c)) { | |
375 | *target++=c; | |
376 | } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && | |
377 | U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) | |
378 | ) { | |
379 | source+=2; | |
380 | --count; | |
381 | *target++=c; | |
382 | *target++=trail; | |
383 | } else { | |
384 | break; | |
385 | } | |
386 | } while(--count>0); | |
b75a7d8f | 387 | } else { |
374ca955 A |
388 | do { |
389 | c=((UChar)source[0]<<8)|source[1]; | |
390 | source+=2; | |
391 | if(U16_IS_SINGLE(c)) { | |
392 | *target++=c; | |
393 | *offsets++=sourceIndex; | |
394 | sourceIndex+=2; | |
395 | } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && | |
396 | U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) | |
397 | ) { | |
398 | source+=2; | |
399 | --count; | |
400 | *target++=c; | |
401 | *target++=trail; | |
402 | *offsets++=sourceIndex; | |
403 | *offsets++=sourceIndex; | |
404 | sourceIndex+=4; | |
405 | } else { | |
406 | break; | |
407 | } | |
408 | } while(--count>0); | |
b75a7d8f | 409 | } |
b75a7d8f | 410 | |
374ca955 A |
411 | if(count==0) { |
412 | /* done with the loop for complete UChars */ | |
413 | c=0; | |
414 | } else { | |
415 | /* keep c for surrogate handling, trail will be set there */ | |
416 | length+=2*(count-1); /* one more byte pair was consumed than count decremented */ | |
417 | targetCapacity+=count; | |
b75a7d8f A |
418 | } |
419 | } | |
420 | ||
374ca955 A |
421 | if(c!=0) { |
422 | /* | |
423 | * c is a surrogate, and | |
424 | * - source or target too short | |
425 | * - or the surrogate is unmatched | |
426 | */ | |
427 | cnv->toUBytes[0]=(uint8_t)(c>>8); | |
428 | cnv->toUBytes[1]=(uint8_t)c; | |
429 | cnv->toULength=2; | |
430 | ||
431 | if(U16_IS_SURROGATE_LEAD(c)) { | |
432 | if(length>=2) { | |
433 | if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) { | |
434 | /* output the surrogate pair, will overflow (see conditions comment above) */ | |
435 | source+=2; | |
436 | length-=2; | |
437 | *target++=c; | |
438 | if(offsets!=NULL) { | |
439 | *offsets++=sourceIndex; | |
440 | } | |
441 | cnv->UCharErrorBuffer[0]=trail; | |
442 | cnv->UCharErrorBufferLength=1; | |
443 | cnv->toULength=0; | |
444 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
445 | } else { | |
446 | /* unmatched lead surrogate */ | |
447 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; | |
448 | } | |
449 | } else { | |
450 | /* see if the trail surrogate is in the next buffer */ | |
b75a7d8f A |
451 | } |
452 | } else { | |
374ca955 A |
453 | /* unmatched trail surrogate */ |
454 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; | |
b75a7d8f | 455 | } |
b75a7d8f A |
456 | } |
457 | ||
374ca955 A |
458 | if(U_SUCCESS(*pErrorCode)) { |
459 | /* check for a remaining source byte */ | |
460 | if(length>0) { | |
461 | if(targetCapacity==0) { | |
462 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
463 | } else { | |
464 | /* it must be length==1 because otherwise the above would have copied more */ | |
465 | cnv->toUBytes[cnv->toULength++]=*source++; | |
b75a7d8f A |
466 | } |
467 | } | |
468 | } | |
469 | ||
470 | /* write back the updated pointers */ | |
374ca955 A |
471 | pArgs->source=(const char *)source; |
472 | pArgs->target=target; | |
473 | pArgs->offsets=offsets; | |
b75a7d8f A |
474 | } |
475 | ||
374ca955 A |
476 | static UChar32 |
477 | _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { | |
478 | const uint8_t *s, *sourceLimit; | |
479 | UChar32 c; | |
b75a7d8f | 480 | |
374ca955 A |
481 | s=(const uint8_t *)pArgs->source; |
482 | sourceLimit=(const uint8_t *)pArgs->sourceLimit; | |
b75a7d8f | 483 | |
374ca955 A |
484 | if(s>=sourceLimit) { |
485 | /* no input */ | |
486 | *err=U_INDEX_OUTOFBOUNDS_ERROR; | |
b75a7d8f A |
487 | return 0xffff; |
488 | } | |
489 | ||
374ca955 A |
490 | if(s+2>sourceLimit) { |
491 | /* only one byte: truncated UChar */ | |
492 | pArgs->converter->toUBytes[0]=*s++; | |
493 | pArgs->converter->toULength=1; | |
494 | pArgs->source=(const char *)s; | |
495 | *err = U_TRUNCATED_CHAR_FOUND; | |
496 | return 0xffff; | |
497 | } | |
b75a7d8f | 498 | |
374ca955 A |
499 | /* get one UChar */ |
500 | c=((UChar32)*s<<8)|s[1]; | |
501 | s+=2; | |
502 | ||
503 | /* check for a surrogate pair */ | |
504 | if(U_IS_SURROGATE(c)) { | |
505 | if(U16_IS_SURROGATE_LEAD(c)) { | |
506 | if(s+2<=sourceLimit) { | |
507 | UChar trail; | |
508 | ||
509 | /* get a second UChar and see if it is a trail surrogate */ | |
510 | trail=((UChar)*s<<8)|s[1]; | |
511 | if(U16_IS_TRAIL(trail)) { | |
512 | c=U16_GET_SUPPLEMENTARY(c, trail); | |
513 | s+=2; | |
514 | } else { | |
515 | /* unmatched lead surrogate */ | |
516 | c=-2; | |
517 | } | |
518 | } else { | |
519 | /* too few (2 or 3) bytes for a surrogate pair: truncated code point */ | |
520 | uint8_t *bytes=pArgs->converter->toUBytes; | |
521 | s-=2; | |
522 | pArgs->converter->toULength=(int8_t)(sourceLimit-s); | |
523 | do { | |
524 | *bytes++=*s++; | |
525 | } while(s<sourceLimit); | |
526 | ||
527 | c=0xffff; | |
528 | *err=U_TRUNCATED_CHAR_FOUND; | |
529 | } | |
530 | } else { | |
531 | /* unmatched trail surrogate */ | |
532 | c=-2; | |
b75a7d8f A |
533 | } |
534 | ||
374ca955 A |
535 | if(c<0) { |
536 | /* write the unmatched surrogate */ | |
537 | uint8_t *bytes=pArgs->converter->toUBytes; | |
538 | pArgs->converter->toULength=2; | |
539 | *bytes=*(s-2); | |
540 | bytes[1]=*(s-1); | |
b75a7d8f | 541 | |
374ca955 A |
542 | c=0xffff; |
543 | *err=U_ILLEGAL_CHAR_FOUND; | |
b75a7d8f A |
544 | } |
545 | } | |
546 | ||
374ca955 A |
547 | pArgs->source=(const char *)s; |
548 | return c; | |
b75a7d8f A |
549 | } |
550 | ||
551 | static const UConverterImpl _UTF16BEImpl={ | |
552 | UCNV_UTF16_BigEndian, | |
553 | ||
554 | NULL, | |
555 | NULL, | |
556 | ||
557 | NULL, | |
558 | NULL, | |
559 | NULL, | |
560 | ||
561 | _UTF16BEToUnicodeWithOffsets, | |
562 | _UTF16BEToUnicodeWithOffsets, | |
563 | _UTF16BEFromUnicodeWithOffsets, | |
564 | _UTF16BEFromUnicodeWithOffsets, | |
374ca955 | 565 | _UTF16BEGetNextUChar, |
b75a7d8f A |
566 | |
567 | NULL, | |
568 | NULL, | |
569 | NULL, | |
570 | NULL, | |
73c04bcf | 571 | ucnv_getNonSurrogateUnicodeSet |
b75a7d8f A |
572 | }; |
573 | ||
b75a7d8f A |
574 | static const UConverterStaticData _UTF16BEStaticData={ |
575 | sizeof(UConverterStaticData), | |
576 | "UTF-16BE", | |
577 | 1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2, | |
578 | { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE, | |
579 | 0, | |
580 | 0, | |
581 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ | |
582 | }; | |
583 | ||
584 | ||
585 | const UConverterSharedData _UTF16BEData={ | |
586 | sizeof(UConverterSharedData), ~((uint32_t) 0), | |
587 | NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl, | |
588 | 0 | |
589 | }; | |
590 | ||
591 | /* UTF-16LE ----------------------------------------------------------------- */ | |
592 | ||
374ca955 A |
593 | static void |
594 | _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, | |
595 | UErrorCode *pErrorCode) { | |
596 | UConverter *cnv; | |
597 | const UChar *source; | |
73c04bcf | 598 | char *target; |
374ca955 A |
599 | int32_t *offsets; |
600 | ||
73c04bcf | 601 | uint32_t targetCapacity, length, sourceIndex; |
374ca955 A |
602 | UChar c, trail; |
603 | char overflow[4]; | |
604 | ||
605 | source=pArgs->source; | |
73c04bcf | 606 | length=(int32_t)(pArgs->sourceLimit-source); |
374ca955 A |
607 | if(length<=0) { |
608 | /* no input, nothing to do */ | |
609 | return; | |
610 | } | |
611 | ||
73c04bcf A |
612 | cnv=pArgs->converter; |
613 | ||
614 | /* write the BOM if necessary */ | |
615 | if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { | |
616 | static const char bom[]={ (char)0xff, (char)0xfe }; | |
617 | ucnv_fromUWriteBytes(cnv, | |
618 | bom, 2, | |
619 | &pArgs->target, pArgs->targetLimit, | |
620 | &pArgs->offsets, -1, | |
621 | pErrorCode); | |
622 | cnv->fromUnicodeStatus=0; | |
623 | } | |
624 | ||
625 | target=pArgs->target; | |
626 | if(target >= pArgs->targetLimit) { | |
374ca955 A |
627 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
628 | return; | |
629 | } | |
630 | ||
73c04bcf | 631 | targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); |
374ca955 A |
632 | offsets=pArgs->offsets; |
633 | sourceIndex=0; | |
634 | ||
635 | /* c!=0 indicates in several places outside the main loops that a surrogate was found */ | |
636 | ||
637 | if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) { | |
638 | /* the last buffer ended with a lead surrogate, output the surrogate pair */ | |
639 | ++source; | |
640 | --length; | |
641 | target[0]=(uint8_t)c; | |
642 | target[1]=(uint8_t)(c>>8); | |
643 | target[2]=(uint8_t)trail; | |
644 | target[3]=(uint8_t)(trail>>8); | |
645 | target+=4; | |
646 | targetCapacity-=4; | |
647 | if(offsets!=NULL) { | |
648 | *offsets++=-1; | |
649 | *offsets++=-1; | |
650 | *offsets++=-1; | |
651 | *offsets++=-1; | |
b75a7d8f | 652 | } |
374ca955 A |
653 | sourceIndex=1; |
654 | cnv->fromUChar32=c=0; | |
655 | } | |
656 | ||
374ca955 | 657 | if(c==0) { |
73c04bcf A |
658 | /* copy an even number of bytes for complete UChars */ |
659 | uint32_t count=2*length; | |
660 | if(count>targetCapacity) { | |
661 | count=targetCapacity&~1; | |
662 | } | |
663 | /* count is even */ | |
374ca955 A |
664 | targetCapacity-=count; |
665 | count>>=1; | |
666 | length-=count; | |
667 | ||
668 | if(offsets==NULL) { | |
669 | while(count>0) { | |
670 | c=*source++; | |
671 | if(U16_IS_SINGLE(c)) { | |
672 | target[0]=(uint8_t)c; | |
673 | target[1]=(uint8_t)(c>>8); | |
674 | target+=2; | |
675 | } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { | |
676 | ++source; | |
677 | --count; | |
678 | target[0]=(uint8_t)c; | |
679 | target[1]=(uint8_t)(c>>8); | |
680 | target[2]=(uint8_t)trail; | |
681 | target[3]=(uint8_t)(trail>>8); | |
682 | target+=4; | |
683 | } else { | |
684 | break; | |
685 | } | |
686 | --count; | |
687 | } | |
688 | } else { | |
689 | while(count>0) { | |
690 | c=*source++; | |
691 | if(U16_IS_SINGLE(c)) { | |
692 | target[0]=(uint8_t)c; | |
693 | target[1]=(uint8_t)(c>>8); | |
694 | target+=2; | |
695 | *offsets++=sourceIndex; | |
696 | *offsets++=sourceIndex++; | |
697 | } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { | |
698 | ++source; | |
699 | --count; | |
700 | target[0]=(uint8_t)c; | |
701 | target[1]=(uint8_t)(c>>8); | |
702 | target[2]=(uint8_t)trail; | |
703 | target[3]=(uint8_t)(trail>>8); | |
704 | target+=4; | |
705 | *offsets++=sourceIndex; | |
706 | *offsets++=sourceIndex; | |
707 | *offsets++=sourceIndex; | |
708 | *offsets++=sourceIndex; | |
709 | sourceIndex+=2; | |
710 | } else { | |
711 | break; | |
712 | } | |
713 | --count; | |
714 | } | |
b75a7d8f A |
715 | } |
716 | ||
374ca955 A |
717 | if(count==0) { |
718 | /* done with the loop for complete UChars */ | |
719 | if(length>0 && targetCapacity>0) { | |
720 | /* | |
721 | * there is more input and some target capacity - | |
722 | * it must be targetCapacity==1 because otherwise | |
723 | * the above would have copied more; | |
724 | * prepare for overflow output | |
725 | */ | |
726 | if(U16_IS_SINGLE(c=*source++)) { | |
727 | overflow[0]=(char)c; | |
728 | overflow[1]=(char)(c>>8); | |
729 | length=2; /* 2 bytes to output */ | |
730 | c=0; | |
731 | /* } else { keep c for surrogate handling, length will be set there */ | |
732 | } | |
733 | } else { | |
734 | length=0; | |
735 | c=0; | |
736 | } | |
737 | } else { | |
738 | /* keep c for surrogate handling, length will be set there */ | |
739 | targetCapacity+=2*count; | |
740 | } | |
741 | } else { | |
742 | length=0; /* from here on, length counts the bytes in overflow[] */ | |
743 | } | |
744 | ||
745 | if(c!=0) { | |
746 | /* | |
747 | * c is a surrogate, and | |
748 | * - source or target too short | |
749 | * - or the surrogate is unmatched | |
750 | */ | |
751 | length=0; | |
752 | if(U16_IS_SURROGATE_LEAD(c)) { | |
753 | if(source<pArgs->sourceLimit) { | |
754 | if(U16_IS_TRAIL(trail=*source)) { | |
755 | /* output the surrogate pair, will overflow (see conditions comment above) */ | |
756 | ++source; | |
757 | overflow[0]=(char)c; | |
758 | overflow[1]=(char)(c>>8); | |
759 | overflow[2]=(char)trail; | |
760 | overflow[3]=(char)(trail>>8); | |
761 | length=4; /* 4 bytes to output */ | |
762 | c=0; | |
763 | } else { | |
764 | /* unmatched lead surrogate */ | |
765 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; | |
766 | } | |
767 | } else { | |
768 | /* see if the trail surrogate is in the next buffer */ | |
769 | } | |
770 | } else { | |
771 | /* unmatched trail surrogate */ | |
772 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; | |
773 | } | |
774 | cnv->fromUChar32=c; | |
b75a7d8f A |
775 | } |
776 | ||
374ca955 A |
777 | if(length>0) { |
778 | /* output length bytes with overflow (length>targetCapacity>0) */ | |
779 | ucnv_fromUWriteBytes(cnv, | |
780 | overflow, length, | |
73c04bcf | 781 | &target, pArgs->targetLimit, |
374ca955 A |
782 | &offsets, sourceIndex, |
783 | pErrorCode); | |
73c04bcf | 784 | targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); |
374ca955 | 785 | } |
b75a7d8f | 786 | |
374ca955 A |
787 | if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) { |
788 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
789 | } | |
790 | ||
791 | /* write back the updated pointers */ | |
792 | pArgs->source=source; | |
73c04bcf | 793 | pArgs->target=target; |
374ca955 A |
794 | pArgs->offsets=offsets; |
795 | } | |
b75a7d8f | 796 | |
374ca955 A |
797 | static void |
798 | _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, | |
799 | UErrorCode *pErrorCode) { | |
800 | UConverter *cnv; | |
801 | const uint8_t *source; | |
802 | UChar *target; | |
803 | int32_t *offsets; | |
804 | ||
73c04bcf | 805 | uint32_t targetCapacity, length, count, sourceIndex; |
374ca955 A |
806 | UChar c, trail; |
807 | ||
808 | cnv=pArgs->converter; | |
809 | source=(const uint8_t *)pArgs->source; | |
73c04bcf | 810 | length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); |
374ca955 A |
811 | if(length<=0 && cnv->toUnicodeStatus==0) { |
812 | /* no input, nothing to do */ | |
813 | return; | |
814 | } | |
815 | ||
73c04bcf A |
816 | target=pArgs->target; |
817 | if(target >= pArgs->targetLimit) { | |
374ca955 A |
818 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
819 | return; | |
820 | } | |
821 | ||
73c04bcf | 822 | targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); |
374ca955 A |
823 | offsets=pArgs->offsets; |
824 | sourceIndex=0; | |
825 | c=0; | |
826 | ||
827 | /* complete a partial UChar or pair from the last call */ | |
828 | if(cnv->toUnicodeStatus!=0) { | |
829 | /* | |
830 | * special case: single byte from a previous buffer, | |
831 | * where the byte turned out not to belong to a trail surrogate | |
832 | * and the preceding, unmatched lead surrogate was put into toUBytes[] | |
833 | * for error handling | |
834 | */ | |
835 | cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; | |
836 | cnv->toULength=1; | |
837 | cnv->toUnicodeStatus=0; | |
838 | } | |
839 | if((count=cnv->toULength)!=0) { | |
840 | uint8_t *p=cnv->toUBytes; | |
841 | do { | |
842 | p[count++]=*source++; | |
843 | ++sourceIndex; | |
844 | --length; | |
845 | if(count==2) { | |
846 | c=((UChar)p[1]<<8)|p[0]; | |
847 | if(U16_IS_SINGLE(c)) { | |
848 | /* output the BMP code point */ | |
849 | *target++=c; | |
850 | if(offsets!=NULL) { | |
851 | *offsets++=-1; | |
852 | } | |
853 | --targetCapacity; | |
854 | count=0; | |
855 | c=0; | |
856 | break; | |
857 | } else if(U16_IS_SURROGATE_LEAD(c)) { | |
858 | /* continue collecting bytes for the trail surrogate */ | |
859 | c=0; /* avoid unnecessary surrogate handling below */ | |
860 | } else { | |
861 | /* fall through to error handling for an unmatched trail surrogate */ | |
862 | break; | |
863 | } | |
864 | } else if(count==4) { | |
865 | c=((UChar)p[1]<<8)|p[0]; | |
866 | trail=((UChar)p[3]<<8)|p[2]; | |
867 | if(U16_IS_TRAIL(trail)) { | |
868 | /* output the surrogate pair */ | |
869 | *target++=c; | |
870 | if(targetCapacity>=2) { | |
871 | *target++=trail; | |
872 | if(offsets!=NULL) { | |
873 | *offsets++=-1; | |
874 | *offsets++=-1; | |
875 | } | |
876 | targetCapacity-=2; | |
877 | } else /* targetCapacity==1 */ { | |
878 | targetCapacity=0; | |
879 | cnv->UCharErrorBuffer[0]=trail; | |
880 | cnv->UCharErrorBufferLength=1; | |
881 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
882 | } | |
883 | count=0; | |
884 | c=0; | |
885 | break; | |
886 | } else { | |
887 | /* unmatched lead surrogate, handle here for consistent toUBytes[] */ | |
888 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; | |
889 | ||
890 | /* back out reading the code unit after it */ | |
891 | if(((const uint8_t *)pArgs->source-source)>=2) { | |
892 | source-=2; | |
893 | } else { | |
894 | /* | |
895 | * if the trail unit's first byte was in a previous buffer, then | |
896 | * we need to put it into a special place because toUBytes[] will be | |
897 | * used for the lead unit's bytes | |
898 | */ | |
899 | cnv->toUnicodeStatus=0x100|p[2]; | |
900 | --source; | |
901 | } | |
902 | cnv->toULength=2; | |
903 | ||
904 | /* write back the updated pointers */ | |
905 | pArgs->source=(const char *)source; | |
906 | pArgs->target=target; | |
907 | pArgs->offsets=offsets; | |
908 | return; | |
909 | } | |
910 | } | |
911 | } while(length>0); | |
912 | cnv->toULength=(int8_t)count; | |
913 | } | |
914 | ||
915 | /* copy an even number of bytes for complete UChars */ | |
916 | count=2*targetCapacity; | |
917 | if(count>length) { | |
918 | count=length&~1; | |
919 | } | |
920 | if(c==0 && count>0) { | |
921 | length-=count; | |
922 | count>>=1; | |
923 | targetCapacity-=count; | |
924 | if(offsets==NULL) { | |
925 | do { | |
926 | c=((UChar)source[1]<<8)|source[0]; | |
927 | source+=2; | |
928 | if(U16_IS_SINGLE(c)) { | |
929 | *target++=c; | |
930 | } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && | |
931 | U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) | |
932 | ) { | |
933 | source+=2; | |
934 | --count; | |
935 | *target++=c; | |
936 | *target++=trail; | |
937 | } else { | |
938 | break; | |
939 | } | |
940 | } while(--count>0); | |
941 | } else { | |
942 | do { | |
943 | c=((UChar)source[1]<<8)|source[0]; | |
944 | source+=2; | |
945 | if(U16_IS_SINGLE(c)) { | |
946 | *target++=c; | |
947 | *offsets++=sourceIndex; | |
948 | sourceIndex+=2; | |
949 | } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && | |
950 | U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) | |
951 | ) { | |
952 | source+=2; | |
953 | --count; | |
954 | *target++=c; | |
955 | *target++=trail; | |
956 | *offsets++=sourceIndex; | |
957 | *offsets++=sourceIndex; | |
958 | sourceIndex+=4; | |
959 | } else { | |
960 | break; | |
961 | } | |
962 | } while(--count>0); | |
b75a7d8f A |
963 | } |
964 | ||
374ca955 A |
965 | if(count==0) { |
966 | /* done with the loop for complete UChars */ | |
967 | c=0; | |
968 | } else { | |
969 | /* keep c for surrogate handling, trail will be set there */ | |
970 | length+=2*(count-1); /* one more byte pair was consumed than count decremented */ | |
971 | targetCapacity+=count; | |
972 | } | |
973 | } | |
b75a7d8f | 974 | |
374ca955 A |
975 | if(c!=0) { |
976 | /* | |
977 | * c is a surrogate, and | |
978 | * - source or target too short | |
979 | * - or the surrogate is unmatched | |
980 | */ | |
981 | cnv->toUBytes[0]=(uint8_t)c; | |
982 | cnv->toUBytes[1]=(uint8_t)(c>>8); | |
983 | cnv->toULength=2; | |
984 | ||
985 | if(U16_IS_SURROGATE_LEAD(c)) { | |
986 | if(length>=2) { | |
987 | if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) { | |
988 | /* output the surrogate pair, will overflow (see conditions comment above) */ | |
989 | source+=2; | |
990 | length-=2; | |
991 | *target++=c; | |
992 | if(offsets!=NULL) { | |
993 | *offsets++=sourceIndex; | |
994 | } | |
995 | cnv->UCharErrorBuffer[0]=trail; | |
996 | cnv->UCharErrorBufferLength=1; | |
997 | cnv->toULength=0; | |
998 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
999 | } else { | |
1000 | /* unmatched lead surrogate */ | |
1001 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; | |
1002 | } | |
1003 | } else { | |
1004 | /* see if the trail surrogate is in the next buffer */ | |
1005 | } | |
1006 | } else { | |
1007 | /* unmatched trail surrogate */ | |
1008 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; | |
b75a7d8f A |
1009 | } |
1010 | } | |
1011 | ||
374ca955 A |
1012 | if(U_SUCCESS(*pErrorCode)) { |
1013 | /* check for a remaining source byte */ | |
1014 | if(length>0) { | |
1015 | if(targetCapacity==0) { | |
1016 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
1017 | } else { | |
1018 | /* it must be length==1 because otherwise the above would have copied more */ | |
1019 | cnv->toUBytes[cnv->toULength++]=*source++; | |
1020 | } | |
1021 | } | |
1022 | } | |
1023 | ||
1024 | /* write back the updated pointers */ | |
1025 | pArgs->source=(const char *)source; | |
1026 | pArgs->target=target; | |
1027 | pArgs->offsets=offsets; | |
1028 | } | |
1029 | ||
1030 | static UChar32 | |
1031 | _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { | |
1032 | const uint8_t *s, *sourceLimit; | |
1033 | UChar32 c; | |
1034 | ||
1035 | s=(const uint8_t *)pArgs->source; | |
1036 | sourceLimit=(const uint8_t *)pArgs->sourceLimit; | |
1037 | ||
1038 | if(s>=sourceLimit) { | |
1039 | /* no input */ | |
1040 | *err=U_INDEX_OUTOFBOUNDS_ERROR; | |
1041 | return 0xffff; | |
1042 | } | |
1043 | ||
1044 | if(s+2>sourceLimit) { | |
1045 | /* only one byte: truncated UChar */ | |
1046 | pArgs->converter->toUBytes[0]=*s++; | |
1047 | pArgs->converter->toULength=1; | |
1048 | pArgs->source=(const char *)s; | |
1049 | *err = U_TRUNCATED_CHAR_FOUND; | |
1050 | return 0xffff; | |
1051 | } | |
1052 | ||
1053 | /* get one UChar */ | |
1054 | c=((UChar32)s[1]<<8)|*s; | |
1055 | s+=2; | |
1056 | ||
1057 | /* check for a surrogate pair */ | |
1058 | if(U_IS_SURROGATE(c)) { | |
1059 | if(U16_IS_SURROGATE_LEAD(c)) { | |
1060 | if(s+2<=sourceLimit) { | |
1061 | UChar trail; | |
1062 | ||
1063 | /* get a second UChar and see if it is a trail surrogate */ | |
1064 | trail=((UChar)s[1]<<8)|*s; | |
1065 | if(U16_IS_TRAIL(trail)) { | |
1066 | c=U16_GET_SUPPLEMENTARY(c, trail); | |
1067 | s+=2; | |
1068 | } else { | |
1069 | /* unmatched lead surrogate */ | |
1070 | c=-2; | |
1071 | } | |
1072 | } else { | |
1073 | /* too few (2 or 3) bytes for a surrogate pair: truncated code point */ | |
1074 | uint8_t *bytes=pArgs->converter->toUBytes; | |
1075 | s-=2; | |
1076 | pArgs->converter->toULength=(int8_t)(sourceLimit-s); | |
1077 | do { | |
1078 | *bytes++=*s++; | |
1079 | } while(s<sourceLimit); | |
1080 | ||
1081 | c=0xffff; | |
1082 | *err=U_TRUNCATED_CHAR_FOUND; | |
1083 | } | |
1084 | } else { | |
1085 | /* unmatched trail surrogate */ | |
1086 | c=-2; | |
1087 | } | |
1088 | ||
1089 | if(c<0) { | |
1090 | /* write the unmatched surrogate */ | |
1091 | uint8_t *bytes=pArgs->converter->toUBytes; | |
1092 | pArgs->converter->toULength=2; | |
1093 | *bytes=*(s-2); | |
1094 | bytes[1]=*(s-1); | |
1095 | ||
1096 | c=0xffff; | |
1097 | *err=U_ILLEGAL_CHAR_FOUND; | |
1098 | } | |
1099 | } | |
1100 | ||
1101 | pArgs->source=(const char *)s; | |
1102 | return c; | |
b75a7d8f A |
1103 | } |
1104 | ||
1105 | static const UConverterImpl _UTF16LEImpl={ | |
1106 | UCNV_UTF16_LittleEndian, | |
1107 | ||
1108 | NULL, | |
1109 | NULL, | |
1110 | ||
1111 | NULL, | |
1112 | NULL, | |
1113 | NULL, | |
1114 | ||
1115 | _UTF16LEToUnicodeWithOffsets, | |
1116 | _UTF16LEToUnicodeWithOffsets, | |
1117 | _UTF16LEFromUnicodeWithOffsets, | |
1118 | _UTF16LEFromUnicodeWithOffsets, | |
374ca955 | 1119 | _UTF16LEGetNextUChar, |
b75a7d8f A |
1120 | |
1121 | NULL, | |
1122 | NULL, | |
1123 | NULL, | |
1124 | NULL, | |
73c04bcf | 1125 | ucnv_getNonSurrogateUnicodeSet |
b75a7d8f A |
1126 | }; |
1127 | ||
1128 | ||
b75a7d8f A |
1129 | static const UConverterStaticData _UTF16LEStaticData={ |
1130 | sizeof(UConverterStaticData), | |
1131 | "UTF-16LE", | |
1132 | 1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2, | |
1133 | { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE, | |
1134 | 0, | |
1135 | 0, | |
1136 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ | |
1137 | }; | |
1138 | ||
1139 | ||
1140 | const UConverterSharedData _UTF16LEData={ | |
1141 | sizeof(UConverterSharedData), ~((uint32_t) 0), | |
1142 | NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl, | |
1143 | 0 | |
1144 | }; | |
1145 | ||
1146 | /* UTF-16 (Detect BOM) ------------------------------------------------------ */ | |
1147 | ||
1148 | /* | |
1149 | * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE | |
1150 | * accordingly. | |
1151 | * This is a simpler version of the UTF-32 converter, with | |
1152 | * fewer states for shorter BOMs. | |
1153 | * | |
1154 | * State values: | |
1155 | * 0 initial state | |
1156 | * 1 saw FE | |
1157 | * 2..4 - | |
1158 | * 5 saw FF | |
1159 | * 6..7 - | |
1160 | * 8 UTF-16BE mode | |
1161 | * 9 UTF-16LE mode | |
1162 | * | |
1163 | * During detection: state&3==number of matching bytes so far. | |
1164 | * | |
1165 | * On output, emit U+FEFF as the first code point. | |
1166 | */ | |
1167 | ||
1168 | static void | |
1169 | _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) { | |
1170 | if(choice<=UCNV_RESET_TO_UNICODE) { | |
1171 | /* reset toUnicode: state=0 */ | |
1172 | cnv->mode=0; | |
1173 | } | |
1174 | if(choice!=UCNV_RESET_TO_UNICODE) { | |
1175 | /* reset fromUnicode: prepare to output the UTF-16PE BOM */ | |
73c04bcf | 1176 | cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; |
b75a7d8f A |
1177 | } |
1178 | } | |
1179 | ||
1180 | static void | |
1181 | _UTF16Open(UConverter *cnv, | |
1182 | const char *name, | |
1183 | const char *locale, | |
1184 | uint32_t options, | |
1185 | UErrorCode *pErrorCode) { | |
1186 | _UTF16Reset(cnv, UCNV_RESET_BOTH); | |
1187 | } | |
1188 | ||
/*
 * The two UTF-16 byte order marks, each padded to four bytes:
 *   offset 0: FE FF (big-endian)
 *   offset 4: FF FE (little-endian)
 * The detection code indexes this with the state value (1 or 5) to get the
 * expected second BOM byte, and with state&4 to get the start of the BOM
 * that is currently being matched.
 */
static const char utf16BOM[8]={
    (char)0xfe, (char)0xff, 0, 0,
    (char)0xff, (char)0xfe, 0, 0
};
1190 | ||
1191 | static void | |
1192 | _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, | |
1193 | UErrorCode *pErrorCode) { | |
1194 | UConverter *cnv=pArgs->converter; | |
1195 | const char *source=pArgs->source; | |
1196 | const char *sourceLimit=pArgs->sourceLimit; | |
1197 | int32_t *offsets=pArgs->offsets; | |
1198 | ||
1199 | int32_t state, offsetDelta; | |
1200 | char b; | |
1201 | ||
1202 | state=cnv->mode; | |
1203 | ||
1204 | /* | |
1205 | * If we detect a BOM in this buffer, then we must add the BOM size to the | |
1206 | * offsets because the actual converter function will not see and count the BOM. | |
1207 | * offsetDelta will have the number of the BOM bytes that are in the current buffer. | |
1208 | */ | |
1209 | offsetDelta=0; | |
1210 | ||
1211 | while(source<sourceLimit && U_SUCCESS(*pErrorCode)) { | |
1212 | switch(state) { | |
1213 | case 0: | |
1214 | b=*source; | |
1215 | if(b==(char)0xfe) { | |
1216 | state=1; /* could be FE FF */ | |
1217 | } else if(b==(char)0xff) { | |
1218 | state=5; /* could be FF FE */ | |
1219 | } else { | |
1220 | state=8; /* default to UTF-16BE */ | |
1221 | continue; | |
1222 | } | |
1223 | ++source; | |
1224 | break; | |
1225 | case 1: | |
1226 | case 5: | |
1227 | if(*source==utf16BOM[state]) { | |
1228 | ++source; | |
1229 | if(state==1) { | |
1230 | state=8; /* detect UTF-16BE */ | |
73c04bcf | 1231 | offsetDelta=(int32_t)(source-pArgs->source); |
b75a7d8f A |
1232 | } else if(state==5) { |
1233 | state=9; /* detect UTF-16LE */ | |
73c04bcf | 1234 | offsetDelta=(int32_t)(source-pArgs->source); |
b75a7d8f A |
1235 | } |
1236 | } else { | |
1237 | /* switch to UTF-16BE and pass the previous bytes */ | |
1238 | if(source!=pArgs->source) { | |
1239 | /* just reset the source */ | |
1240 | source=pArgs->source; | |
1241 | } else { | |
1242 | UBool oldFlush=pArgs->flush; | |
1243 | ||
1244 | /* the first byte is from a previous buffer, replay it first */ | |
1245 | pArgs->source=utf16BOM+(state&4); /* select the correct BOM */ | |
1246 | pArgs->sourceLimit=pArgs->source+1; /* replay previous byte */ | |
1247 | pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */ | |
1248 | ||
1249 | _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); | |
1250 | ||
1251 | /* restore real pointers; pArgs->source will be set in case 8/9 */ | |
1252 | pArgs->sourceLimit=sourceLimit; | |
1253 | pArgs->flush=oldFlush; | |
1254 | } | |
1255 | state=8; | |
1256 | continue; | |
1257 | } | |
1258 | break; | |
1259 | case 8: | |
1260 | /* call UTF-16BE */ | |
1261 | pArgs->source=source; | |
1262 | _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); | |
1263 | source=pArgs->source; | |
1264 | break; | |
1265 | case 9: | |
1266 | /* call UTF-16LE */ | |
1267 | pArgs->source=source; | |
1268 | _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); | |
1269 | source=pArgs->source; | |
1270 | break; | |
1271 | default: | |
1272 | break; /* does not occur */ | |
1273 | } | |
1274 | } | |
1275 | ||
1276 | /* add BOM size to offsets - see comment at offsetDelta declaration */ | |
1277 | if(offsets!=NULL && offsetDelta!=0) { | |
1278 | int32_t *offsetsLimit=pArgs->offsets; | |
1279 | while(offsets<offsetsLimit) { | |
1280 | *offsets++ += offsetDelta; | |
1281 | } | |
1282 | } | |
1283 | ||
1284 | pArgs->source=source; | |
1285 | ||
1286 | if(source==sourceLimit && pArgs->flush) { | |
1287 | /* handle truncated input */ | |
1288 | switch(state) { | |
1289 | case 0: | |
1290 | break; /* no input at all, nothing to do */ | |
1291 | case 8: | |
1292 | _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); | |
1293 | break; | |
1294 | case 9: | |
1295 | _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); | |
1296 | break; | |
1297 | default: | |
1298 | /* handle 0<state<8: call UTF-16BE with too-short input */ | |
1299 | pArgs->source=utf16BOM+(state&4); /* select the correct BOM */ | |
1300 | pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */ | |
1301 | ||
1302 | /* no offsets: not enough for output */ | |
1303 | _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); | |
1304 | pArgs->source=source; | |
1305 | pArgs->sourceLimit=sourceLimit; | |
374ca955 | 1306 | state=8; |
b75a7d8f A |
1307 | break; |
1308 | } | |
b75a7d8f | 1309 | } |
374ca955 A |
1310 | |
1311 | cnv->mode=state; | |
b75a7d8f A |
1312 | } |
1313 | ||
1314 | static UChar32 | |
1315 | _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs, | |
1316 | UErrorCode *pErrorCode) { | |
1317 | switch(pArgs->converter->mode) { | |
1318 | case 8: | |
374ca955 | 1319 | return _UTF16BEGetNextUChar(pArgs, pErrorCode); |
b75a7d8f | 1320 | case 9: |
374ca955 | 1321 | return _UTF16LEGetNextUChar(pArgs, pErrorCode); |
b75a7d8f | 1322 | default: |
374ca955 | 1323 | return UCNV_GET_NEXT_UCHAR_USE_TO_U; |
b75a7d8f A |
1324 | } |
1325 | } | |
1326 | ||
1327 | static const UConverterImpl _UTF16Impl = { | |
1328 | UCNV_UTF16, | |
1329 | ||
1330 | NULL, | |
1331 | NULL, | |
1332 | ||
1333 | _UTF16Open, | |
1334 | NULL, | |
1335 | _UTF16Reset, | |
1336 | ||
1337 | _UTF16ToUnicodeWithOffsets, | |
1338 | _UTF16ToUnicodeWithOffsets, | |
1339 | _UTF16PEFromUnicodeWithOffsets, | |
1340 | _UTF16PEFromUnicodeWithOffsets, | |
1341 | _UTF16GetNextUChar, | |
1342 | ||
1343 | NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ | |
1344 | NULL, | |
1345 | NULL, | |
1346 | NULL, | |
73c04bcf | 1347 | ucnv_getNonSurrogateUnicodeSet |
b75a7d8f A |
1348 | }; |
1349 | ||
1350 | static const UConverterStaticData _UTF16StaticData = { | |
1351 | sizeof(UConverterStaticData), | |
1352 | "UTF-16", | |
73c04bcf | 1353 | 1204, /* CCSID for BOM sensitive UTF-16 */ |
b75a7d8f A |
1354 | UCNV_IBM, UCNV_UTF16, 2, 2, |
1355 | #if U_IS_BIG_ENDIAN | |
1356 | { 0xff, 0xfd, 0, 0 }, 2, | |
1357 | #else | |
1358 | { 0xfd, 0xff, 0, 0 }, 2, | |
1359 | #endif | |
1360 | FALSE, FALSE, | |
1361 | 0, | |
1362 | 0, | |
1363 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ | |
1364 | }; | |
1365 | ||
1366 | const UConverterSharedData _UTF16Data = { | |
1367 | sizeof(UConverterSharedData), ~((uint32_t) 0), | |
1368 | NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl, | |
1369 | 0 | |
1370 | }; | |
374ca955 A |
1371 | |
1372 | #endif |