]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ********************************************************************** | |
3 | * Copyright (C) 2002-2003, International Business Machines | |
4 | * Corporation and others. All Rights Reserved. | |
5 | ********************************************************************** | |
6 | * file name: ucnv_u16.c | |
7 | * encoding: US-ASCII | |
8 | * tab size: 8 (not used) | |
9 | * indentation:4 | |
10 | * | |
11 | * created on: 2002jul01 | |
12 | * created by: Markus W. Scherer | |
13 | * | |
14 | * UTF-16 converter implementation. Used to be in ucnv_utf.c. | |
15 | */ | |
16 | ||
17 | #include "unicode/utypes.h" | |
18 | #include "unicode/ucnv.h" | |
19 | #include "unicode/ucnv_err.h" | |
20 | #include "ucnv_bld.h" | |
21 | #include "ucnv_cnv.h" | |
22 | #include "cmemory.h" | |
23 | ||
24 | /* UTF-16 Platform Endian --------------------------------------------------- */ | |
25 | ||
26 | static void | |
27 | _UTF16PEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, | |
28 | UErrorCode *pErrorCode) { | |
29 | UConverter *cnv = pArgs->converter; | |
30 | const uint8_t *source = (const uint8_t *)pArgs->source; | |
31 | UChar *target = pArgs->target; | |
32 | int32_t *offsets = pArgs->offsets; | |
33 | int32_t targetCapacity = pArgs->targetLimit - pArgs->target; | |
34 | int32_t length = (const uint8_t *)pArgs->sourceLimit - source; | |
35 | int32_t count; | |
36 | int32_t sourceIndex = 0; | |
37 | ||
38 | if(length <= 0 && cnv->toUnicodeStatus == 0) { | |
39 | /* no input, nothing to do */ | |
40 | return; | |
41 | } | |
42 | ||
43 | if(targetCapacity <= 0) { | |
44 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
45 | return; | |
46 | } | |
47 | ||
48 | /* complete a partial UChar from the last call */ | |
49 | if(length != 0 && cnv->toUnicodeStatus != 0) { | |
50 | /* | |
51 | * copy the byte from the last call and the first one here into the target, | |
52 | * byte-wise to keep the platform endianness | |
53 | */ | |
54 | uint8_t *p = (uint8_t *)target++; | |
55 | *p++ = (uint8_t)cnv->toUnicodeStatus; | |
56 | cnv->toUnicodeStatus = 0; | |
57 | *p = *source++; | |
58 | --length; | |
59 | --targetCapacity; | |
60 | if(offsets != NULL) { | |
61 | *offsets++ = -1; | |
62 | } | |
63 | } | |
64 | ||
65 | /* copy an even number of bytes for complete UChars */ | |
66 | count = 2 * targetCapacity; | |
67 | if(count > length) { | |
68 | count = length & ~1; | |
69 | } | |
70 | if(count > 0) { | |
71 | uprv_memcpy(target, source, count); | |
72 | source += count; | |
73 | length -= count; | |
74 | count >>= 1; | |
75 | target += count; | |
76 | targetCapacity -= count; | |
77 | if(offsets != NULL) { | |
78 | while(count > 0) { | |
79 | *offsets++ = sourceIndex; | |
80 | sourceIndex += 2; | |
81 | --count; | |
82 | } | |
83 | } | |
84 | } | |
85 | ||
86 | /* check for a remaining source byte and store the status */ | |
87 | if(length >= 2) { | |
88 | /* it must be targetCapacity==0 because otherwise the above would have copied more */ | |
89 | *pErrorCode = U_BUFFER_OVERFLOW_ERROR; | |
90 | } else if(length == 1) { | |
91 | if(pArgs->flush) { | |
92 | /* a UChar remains incomplete */ | |
93 | *pErrorCode = U_TRUNCATED_CHAR_FOUND; | |
94 | } else { | |
95 | /* consume the last byte and store it, making sure that it will never set the status to 0 */ | |
96 | cnv->toUnicodeStatus = *source++ | 0x100; | |
97 | } | |
98 | } else /* length==0 */ if(cnv->toUnicodeStatus!=0 && pArgs->flush) { | |
99 | /* a UChar remains incomplete */ | |
100 | *pErrorCode = U_TRUNCATED_CHAR_FOUND; | |
101 | } | |
102 | ||
103 | /* write back the updated pointers */ | |
104 | pArgs->source = (const char *)source; | |
105 | pArgs->target = target; | |
106 | pArgs->offsets = offsets; | |
107 | } | |
108 | ||
109 | static void | |
110 | _UTF16PEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, | |
111 | UErrorCode *pErrorCode) { | |
112 | UConverter *cnv = pArgs->converter; | |
113 | const UChar *source = pArgs->source; | |
114 | uint8_t *target = (uint8_t *)pArgs->target; | |
115 | int32_t *offsets = pArgs->offsets; | |
116 | int32_t targetCapacity = pArgs->targetLimit - pArgs->target; | |
117 | int32_t length = pArgs->sourceLimit - source; | |
118 | int32_t count; | |
119 | int32_t sourceIndex = 0; | |
120 | ||
121 | if(length <= 0 && cnv->fromUnicodeStatus == 0) { | |
122 | /* no input, nothing to do */ | |
123 | return; | |
124 | } | |
125 | ||
126 | if(targetCapacity <= 0) { | |
127 | *pErrorCode = U_BUFFER_OVERFLOW_ERROR; | |
128 | return; | |
129 | } | |
130 | ||
131 | /* complete a partial UChar from the last call */ | |
132 | if(cnv->fromUnicodeStatus != 0) { | |
133 | *target++ = (uint8_t)cnv->fromUnicodeStatus; | |
134 | cnv->fromUnicodeStatus = 0; | |
135 | --targetCapacity; | |
136 | if(offsets != NULL) { | |
137 | *offsets++ = -1; | |
138 | } | |
139 | } | |
140 | ||
141 | /* copy an even number of bytes for complete UChars */ | |
142 | count = 2 * length; | |
143 | if(count > targetCapacity) { | |
144 | count = targetCapacity & ~1; | |
145 | } | |
146 | if(count>0) { | |
147 | uprv_memcpy(target, source, count); | |
148 | target += count; | |
149 | targetCapacity -= count; | |
150 | count >>= 1; | |
151 | source += count; | |
152 | length -= count; | |
153 | if(offsets != NULL) { | |
154 | while(count > 0) { | |
155 | *offsets++ = sourceIndex; | |
156 | *offsets++ = sourceIndex++; | |
157 | --count; | |
158 | } | |
159 | } | |
160 | } | |
161 | ||
162 | if(length > 0) { | |
163 | /* it must be targetCapacity<=1 because otherwise the above would have copied more */ | |
164 | *pErrorCode = U_BUFFER_OVERFLOW_ERROR; | |
165 | if(targetCapacity > 0) /* targetCapacity==1 */ { | |
166 | /* copy one byte and keep the other in the status */ | |
167 | const uint8_t *p = (const uint8_t *)source++; | |
168 | *target++ = *p++; | |
169 | cnv->fromUnicodeStatus = *p | 0x100; | |
170 | if(offsets != NULL) { | |
171 | *offsets++ = sourceIndex; | |
172 | } | |
173 | } | |
174 | } | |
175 | ||
176 | /* write back the updated pointers */ | |
177 | pArgs->source = source; | |
178 | pArgs->target = (char *)target; | |
179 | pArgs->offsets = offsets; | |
180 | } | |
181 | ||
182 | /* UTF-16 Opposite Endian --------------------------------------------------- */ | |
183 | ||
184 | /* | |
185 | * For opposite-endian UTF-16, we keep a byte pointer to the UChars | |
186 | * and copy two bytes at a time and reverse them. | |
187 | */ | |
188 | ||
189 | static void | |
190 | _UTF16OEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, | |
191 | UErrorCode *pErrorCode) { | |
192 | UConverter *cnv = pArgs->converter; | |
193 | const uint8_t *source = (const uint8_t *)pArgs->source; | |
194 | UChar *target = pArgs->target; | |
195 | uint8_t *target8 = (uint8_t *)target; /* byte pointer to the target */ | |
196 | int32_t *offsets = pArgs->offsets; | |
197 | int32_t targetCapacity = pArgs->targetLimit - pArgs->target; | |
198 | int32_t length = (const uint8_t *)pArgs->sourceLimit - source; | |
199 | int32_t count; | |
200 | int32_t sourceIndex = 0; | |
201 | ||
202 | if(length <= 0 && cnv->toUnicodeStatus == 0) { | |
203 | /* no input, nothing to do */ | |
204 | return; | |
205 | } | |
206 | ||
207 | if(targetCapacity <= 0) { | |
208 | *pErrorCode = U_BUFFER_OVERFLOW_ERROR; | |
209 | return; | |
210 | } | |
211 | ||
212 | /* complete a partial UChar from the last call */ | |
213 | if(length != 0 && cnv->toUnicodeStatus != 0) { | |
214 | /* | |
215 | * copy the byte from the last call and the first one here into the target, | |
216 | * byte-wise, reversing the platform endianness | |
217 | */ | |
218 | *target8++ = *source++; | |
219 | *target8++ = (uint8_t)cnv->toUnicodeStatus; | |
220 | cnv->toUnicodeStatus = 0; | |
221 | ++target; | |
222 | --length; | |
223 | --targetCapacity; | |
224 | if(offsets != NULL) { | |
225 | *offsets++ = -1; | |
226 | } | |
227 | } | |
228 | ||
229 | /* copy an even number of bytes for complete UChars */ | |
230 | count = 2 * targetCapacity; | |
231 | if(count > length) { | |
232 | count = length & ~1; | |
233 | } | |
234 | if(count>0) { | |
235 | length -= count; | |
236 | count >>= 1; | |
237 | targetCapacity -= count; | |
238 | if(offsets == NULL) { | |
239 | while(count > 0) { | |
240 | target8[1] = *source++; | |
241 | target8[0] = *source++; | |
242 | target8 += 2; | |
243 | --count; | |
244 | } | |
245 | } else { | |
246 | while(count>0) { | |
247 | target8[1] = *source++; | |
248 | target8[0] = *source++; | |
249 | target8 += 2; | |
250 | *offsets++ = sourceIndex; | |
251 | sourceIndex += 2; | |
252 | --count; | |
253 | } | |
254 | } | |
255 | target=(UChar *)target8; | |
256 | } | |
257 | ||
258 | /* check for a remaining source byte and store the status */ | |
259 | if(length >= 2) { | |
260 | /* it must be targetCapacity==0 because otherwise the above would have copied more */ | |
261 | *pErrorCode = U_BUFFER_OVERFLOW_ERROR; | |
262 | } else if(length == 1) { | |
263 | if(pArgs->flush) { | |
264 | /* a UChar remains incomplete */ | |
265 | *pErrorCode = U_TRUNCATED_CHAR_FOUND; | |
266 | } else { | |
267 | /* consume the last byte and store it, making sure that it will never set the status to 0 */ | |
268 | cnv->toUnicodeStatus = *source++ | 0x100; | |
269 | } | |
270 | } else /* length==0 */ if(cnv->toUnicodeStatus!=0 && pArgs->flush) { | |
271 | /* a UChar remains incomplete */ | |
272 | *pErrorCode = U_TRUNCATED_CHAR_FOUND; | |
273 | } | |
274 | ||
275 | /* write back the updated pointers */ | |
276 | pArgs->source = (const char *)source; | |
277 | pArgs->target = target; | |
278 | pArgs->offsets = offsets; | |
279 | } | |
280 | ||
281 | static void | |
282 | _UTF16OEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, | |
283 | UErrorCode *pErrorCode) { | |
284 | UConverter *cnv = pArgs->converter; | |
285 | const UChar *source = pArgs->source; | |
286 | const uint8_t *source8 = (const uint8_t *)source; /* byte pointer to the source */ | |
287 | uint8_t *target = (uint8_t *)pArgs->target; | |
288 | int32_t *offsets = pArgs->offsets; | |
289 | int32_t targetCapacity = pArgs->targetLimit - pArgs->target; | |
290 | int32_t length = pArgs->sourceLimit - source; | |
291 | int32_t count; | |
292 | int32_t sourceIndex = 0; | |
293 | ||
294 | if(length <= 0 && cnv->fromUnicodeStatus == 0) { | |
295 | /* no input, nothing to do */ | |
296 | return; | |
297 | } | |
298 | ||
299 | if(targetCapacity <= 0) { | |
300 | *pErrorCode = U_BUFFER_OVERFLOW_ERROR; | |
301 | return; | |
302 | } | |
303 | ||
304 | /* complete a partial UChar from the last call */ | |
305 | if(cnv->fromUnicodeStatus != 0) { | |
306 | *target++ = (uint8_t)cnv->fromUnicodeStatus; | |
307 | cnv->fromUnicodeStatus = 0; | |
308 | --targetCapacity; | |
309 | if(offsets != NULL) { | |
310 | *offsets++ = -1; | |
311 | } | |
312 | } | |
313 | ||
314 | /* copy an even number of bytes for complete UChars */ | |
315 | count = 2 * length; | |
316 | if(count > targetCapacity) { | |
317 | count = targetCapacity & ~1; | |
318 | } | |
319 | if(count > 0) { | |
320 | targetCapacity -= count; | |
321 | count >>= 1; | |
322 | length -= count; | |
323 | if(offsets == NULL) { | |
324 | while(count > 0) { | |
325 | target[1] = *source8++; | |
326 | target[0] = *source8++; | |
327 | target += 2; | |
328 | --count; | |
329 | } | |
330 | } else { | |
331 | while(count>0) { | |
332 | target[1] = *source8++; | |
333 | target[0] = *source8++; | |
334 | target += 2; | |
335 | *offsets++ = sourceIndex; | |
336 | *offsets++ = sourceIndex++; | |
337 | --count; | |
338 | } | |
339 | } | |
340 | source=(const UChar *)source8; | |
341 | } | |
342 | ||
343 | if(length > 0) { | |
344 | /* it must be targetCapacity<=1 because otherwise the above would have copied more */ | |
345 | *pErrorCode = U_BUFFER_OVERFLOW_ERROR; | |
346 | if(targetCapacity > 0) /* targetCapacity==1 */ { | |
347 | /* copy one byte and keep the other in the status */ | |
348 | cnv->fromUnicodeStatus = *source8++ | 0x100; | |
349 | *target++ = *source8; | |
350 | ++source; | |
351 | if(offsets != NULL) { | |
352 | *offsets++ = sourceIndex; | |
353 | } | |
354 | } | |
355 | } | |
356 | ||
357 | /* write back the updated pointers */ | |
358 | pArgs->source = source; | |
359 | pArgs->target = (char *)target; | |
360 | pArgs->offsets = offsets; | |
361 | } | |
362 | ||
363 | /* UTF-16BE ----------------------------------------------------------------- */ | |
364 | ||
/*
 * Map the fixed-endianness entry points onto the platform-endian (PE) and
 * opposite-endian (OE) implementations above.
 */
#if !U_IS_BIG_ENDIAN
    /* little-endian platform: LE is the platform order, BE is the opposite */
#   define _UTF16LEToUnicodeWithOffsets     _UTF16PEToUnicodeWithOffsets
#   define _UTF16LEFromUnicodeWithOffsets   _UTF16PEFromUnicodeWithOffsets
#   define _UTF16BEToUnicodeWithOffsets     _UTF16OEToUnicodeWithOffsets
#   define _UTF16BEFromUnicodeWithOffsets   _UTF16OEFromUnicodeWithOffsets
#else
    /* big-endian platform: BE is the platform order, LE is the opposite */
#   define _UTF16BEToUnicodeWithOffsets     _UTF16PEToUnicodeWithOffsets
#   define _UTF16BEFromUnicodeWithOffsets   _UTF16PEFromUnicodeWithOffsets
#   define _UTF16LEToUnicodeWithOffsets     _UTF16OEToUnicodeWithOffsets
#   define _UTF16LEFromUnicodeWithOffsets   _UTF16OEFromUnicodeWithOffsets
#endif
376 | ||
377 | static UChar32 T_UConverter_getNextUChar_UTF16_BE(UConverterToUnicodeArgs* args, | |
378 | UErrorCode* err) | |
379 | { | |
380 | UChar32 myUChar; | |
381 | uint16_t first; | |
382 | /*Checks boundaries and set appropriate error codes*/ | |
383 | if (args->source+2 > args->sourceLimit) | |
384 | { | |
385 | if (args->source >= args->sourceLimit) | |
386 | { | |
387 | /*Either caller has reached the end of the byte stream*/ | |
388 | *err = U_INDEX_OUTOFBOUNDS_ERROR; | |
389 | } | |
390 | else | |
391 | { | |
392 | /* a character was cut in half*/ | |
393 | *err = U_TRUNCATED_CHAR_FOUND; | |
394 | } | |
395 | return 0xffff; | |
396 | } | |
397 | ||
398 | /*Gets the corresponding codepoint*/ | |
399 | first = (uint16_t)(((uint16_t)(*(args->source)) << 8) |((uint8_t)*((args->source)+1))); | |
400 | myUChar = first; | |
401 | args->source += 2; | |
402 | ||
403 | if(UTF_IS_FIRST_SURROGATE(first)) { | |
404 | uint16_t second; | |
405 | ||
406 | if (args->source+2 > args->sourceLimit) { | |
407 | *err = U_TRUNCATED_CHAR_FOUND; | |
408 | return 0xffff; | |
409 | } | |
410 | ||
411 | /* get the second surrogate and assemble the code point */ | |
412 | second = (uint16_t)(((uint16_t)(*(args->source)) << 8) |((uint8_t)*(args->source+1))); | |
413 | ||
414 | /* ignore unmatched surrogates and just deliver the first one in such a case */ | |
415 | if(UTF_IS_SECOND_SURROGATE(second)) { | |
416 | /* matched pair, get pair value */ | |
417 | myUChar = UTF16_GET_PAIR_VALUE(first, second); | |
418 | args->source += 2; | |
419 | } | |
420 | } | |
421 | ||
422 | return myUChar; | |
423 | } | |
424 | ||
425 | static const UConverterImpl _UTF16BEImpl={ | |
426 | UCNV_UTF16_BigEndian, | |
427 | ||
428 | NULL, | |
429 | NULL, | |
430 | ||
431 | NULL, | |
432 | NULL, | |
433 | NULL, | |
434 | ||
435 | _UTF16BEToUnicodeWithOffsets, | |
436 | _UTF16BEToUnicodeWithOffsets, | |
437 | _UTF16BEFromUnicodeWithOffsets, | |
438 | _UTF16BEFromUnicodeWithOffsets, | |
439 | T_UConverter_getNextUChar_UTF16_BE, | |
440 | ||
441 | NULL, | |
442 | NULL, | |
443 | NULL, | |
444 | NULL, | |
445 | ucnv_getCompleteUnicodeSet | |
446 | }; | |
447 | ||
448 | /* The 1200 CCSID refers to any version of Unicode with any endianess of UTF-16 */ | |
449 | static const UConverterStaticData _UTF16BEStaticData={ | |
450 | sizeof(UConverterStaticData), | |
451 | "UTF-16BE", | |
452 | 1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2, | |
453 | { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE, | |
454 | 0, | |
455 | 0, | |
456 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ | |
457 | }; | |
458 | ||
459 | ||
460 | const UConverterSharedData _UTF16BEData={ | |
461 | sizeof(UConverterSharedData), ~((uint32_t) 0), | |
462 | NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl, | |
463 | 0 | |
464 | }; | |
465 | ||
466 | /* UTF-16LE ----------------------------------------------------------------- */ | |
467 | ||
468 | static UChar32 T_UConverter_getNextUChar_UTF16_LE(UConverterToUnicodeArgs* args, | |
469 | UErrorCode* err) | |
470 | { | |
471 | UChar32 myUChar; | |
472 | uint16_t first; | |
473 | /*Checks boundaries and set appropriate error codes*/ | |
474 | if (args->source+2 > args->sourceLimit) | |
475 | { | |
476 | if (args->source >= args->sourceLimit) | |
477 | { | |
478 | /*Either caller has reached the end of the byte stream*/ | |
479 | *err = U_INDEX_OUTOFBOUNDS_ERROR; | |
480 | } | |
481 | else | |
482 | { | |
483 | /* a character was cut in half*/ | |
484 | *err = U_TRUNCATED_CHAR_FOUND; | |
485 | } | |
486 | ||
487 | return 0xffff; | |
488 | } | |
489 | ||
490 | /*Gets the corresponding codepoint*/ | |
491 | first = (uint16_t)(((uint16_t)*((args->source)+1) << 8) | ((uint8_t)(*(args->source)))); | |
492 | myUChar=first; | |
493 | /*updates the source*/ | |
494 | args->source += 2; | |
495 | ||
496 | if (UTF_IS_FIRST_SURROGATE(first)) | |
497 | { | |
498 | uint16_t second; | |
499 | ||
500 | if (args->source+2 > args->sourceLimit) | |
501 | { | |
502 | *err = U_TRUNCATED_CHAR_FOUND; | |
503 | return 0xffff; | |
504 | } | |
505 | ||
506 | /* get the second surrogate and assemble the code point */ | |
507 | second = (uint16_t)(((uint16_t)*(args->source+1) << 8) |((uint8_t)(*(args->source)))); | |
508 | ||
509 | /* ignore unmatched surrogates and just deliver the first one in such a case */ | |
510 | if(UTF_IS_SECOND_SURROGATE(second)) | |
511 | { | |
512 | /* matched pair, get pair value */ | |
513 | myUChar = UTF16_GET_PAIR_VALUE(first, second); | |
514 | args->source += 2; | |
515 | } | |
516 | } | |
517 | ||
518 | return myUChar; | |
519 | } | |
520 | ||
521 | static const UConverterImpl _UTF16LEImpl={ | |
522 | UCNV_UTF16_LittleEndian, | |
523 | ||
524 | NULL, | |
525 | NULL, | |
526 | ||
527 | NULL, | |
528 | NULL, | |
529 | NULL, | |
530 | ||
531 | _UTF16LEToUnicodeWithOffsets, | |
532 | _UTF16LEToUnicodeWithOffsets, | |
533 | _UTF16LEFromUnicodeWithOffsets, | |
534 | _UTF16LEFromUnicodeWithOffsets, | |
535 | T_UConverter_getNextUChar_UTF16_LE, | |
536 | ||
537 | NULL, | |
538 | NULL, | |
539 | NULL, | |
540 | NULL, | |
541 | ucnv_getCompleteUnicodeSet | |
542 | }; | |
543 | ||
544 | ||
545 | /* The 1200 CCSID refers to any version of Unicode with any endianess of UTF-16 */ | |
546 | static const UConverterStaticData _UTF16LEStaticData={ | |
547 | sizeof(UConverterStaticData), | |
548 | "UTF-16LE", | |
549 | 1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2, | |
550 | { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE, | |
551 | 0, | |
552 | 0, | |
553 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ | |
554 | }; | |
555 | ||
556 | ||
557 | const UConverterSharedData _UTF16LEData={ | |
558 | sizeof(UConverterSharedData), ~((uint32_t) 0), | |
559 | NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl, | |
560 | 0 | |
561 | }; | |
562 | ||
563 | /* UTF-16 (Detect BOM) ------------------------------------------------------ */ | |
564 | ||
565 | /* | |
566 | * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE | |
567 | * accordingly. | |
568 | * This is a simpler version of the UTF-32 converter below, with | |
569 | * fewer states for shorter BOMs. | |
570 | * | |
571 | * State values: | |
572 | * 0 initial state | |
573 | * 1 saw FE | |
574 | * 2..4 - | |
575 | * 5 saw FF | |
576 | * 6..7 - | |
577 | * 8 UTF-16BE mode | |
578 | * 9 UTF-16LE mode | |
579 | * | |
580 | * During detection: state&3==number of matching bytes so far. | |
581 | * | |
582 | * On output, emit U+FEFF as the first code point. | |
583 | */ | |
584 | ||
585 | static void | |
586 | _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) { | |
587 | if(choice<=UCNV_RESET_TO_UNICODE) { | |
588 | /* reset toUnicode: state=0 */ | |
589 | cnv->mode=0; | |
590 | } | |
591 | if(choice!=UCNV_RESET_TO_UNICODE) { | |
592 | /* reset fromUnicode: prepare to output the UTF-16PE BOM */ | |
593 | cnv->charErrorBufferLength=2; | |
594 | #if U_IS_BIG_ENDIAN | |
595 | cnv->charErrorBuffer[0]=0xfe; | |
596 | cnv->charErrorBuffer[1]=0xff; | |
597 | #else | |
598 | cnv->charErrorBuffer[0]=0xff; | |
599 | cnv->charErrorBuffer[1]=0xfe; | |
600 | #endif | |
601 | } | |
602 | } | |
603 | ||
604 | static void | |
605 | _UTF16Open(UConverter *cnv, | |
606 | const char *name, | |
607 | const char *locale, | |
608 | uint32_t options, | |
609 | UErrorCode *pErrorCode) { | |
610 | _UTF16Reset(cnv, UCNV_RESET_BOTH); | |
611 | } | |
612 | ||
/*
 * The two UTF-16 BOM byte sequences, each in its own 4-byte slot:
 * index 0 holds FE FF, index 4 holds FF FE.
 */
static const char utf16BOM[8]={
    (char)0xfe, (char)0xff, 0, 0,
    (char)0xff, (char)0xfe, 0, 0
};
614 | ||
615 | static void | |
616 | _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, | |
617 | UErrorCode *pErrorCode) { | |
618 | UConverter *cnv=pArgs->converter; | |
619 | const char *source=pArgs->source; | |
620 | const char *sourceLimit=pArgs->sourceLimit; | |
621 | int32_t *offsets=pArgs->offsets; | |
622 | ||
623 | int32_t state, offsetDelta; | |
624 | char b; | |
625 | ||
626 | state=cnv->mode; | |
627 | ||
628 | /* | |
629 | * If we detect a BOM in this buffer, then we must add the BOM size to the | |
630 | * offsets because the actual converter function will not see and count the BOM. | |
631 | * offsetDelta will have the number of the BOM bytes that are in the current buffer. | |
632 | */ | |
633 | offsetDelta=0; | |
634 | ||
635 | while(source<sourceLimit && U_SUCCESS(*pErrorCode)) { | |
636 | switch(state) { | |
637 | case 0: | |
638 | b=*source; | |
639 | if(b==(char)0xfe) { | |
640 | state=1; /* could be FE FF */ | |
641 | } else if(b==(char)0xff) { | |
642 | state=5; /* could be FF FE */ | |
643 | } else { | |
644 | state=8; /* default to UTF-16BE */ | |
645 | continue; | |
646 | } | |
647 | ++source; | |
648 | break; | |
649 | case 1: | |
650 | case 5: | |
651 | if(*source==utf16BOM[state]) { | |
652 | ++source; | |
653 | if(state==1) { | |
654 | state=8; /* detect UTF-16BE */ | |
655 | offsetDelta=source-pArgs->source; | |
656 | } else if(state==5) { | |
657 | state=9; /* detect UTF-16LE */ | |
658 | offsetDelta=source-pArgs->source; | |
659 | } | |
660 | } else { | |
661 | /* switch to UTF-16BE and pass the previous bytes */ | |
662 | if(source!=pArgs->source) { | |
663 | /* just reset the source */ | |
664 | source=pArgs->source; | |
665 | } else { | |
666 | UBool oldFlush=pArgs->flush; | |
667 | ||
668 | /* the first byte is from a previous buffer, replay it first */ | |
669 | pArgs->source=utf16BOM+(state&4); /* select the correct BOM */ | |
670 | pArgs->sourceLimit=pArgs->source+1; /* replay previous byte */ | |
671 | pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */ | |
672 | ||
673 | _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); | |
674 | ||
675 | /* restore real pointers; pArgs->source will be set in case 8/9 */ | |
676 | pArgs->sourceLimit=sourceLimit; | |
677 | pArgs->flush=oldFlush; | |
678 | } | |
679 | state=8; | |
680 | continue; | |
681 | } | |
682 | break; | |
683 | case 8: | |
684 | /* call UTF-16BE */ | |
685 | pArgs->source=source; | |
686 | _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); | |
687 | source=pArgs->source; | |
688 | break; | |
689 | case 9: | |
690 | /* call UTF-16LE */ | |
691 | pArgs->source=source; | |
692 | _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); | |
693 | source=pArgs->source; | |
694 | break; | |
695 | default: | |
696 | break; /* does not occur */ | |
697 | } | |
698 | } | |
699 | ||
700 | /* add BOM size to offsets - see comment at offsetDelta declaration */ | |
701 | if(offsets!=NULL && offsetDelta!=0) { | |
702 | int32_t *offsetsLimit=pArgs->offsets; | |
703 | while(offsets<offsetsLimit) { | |
704 | *offsets++ += offsetDelta; | |
705 | } | |
706 | } | |
707 | ||
708 | pArgs->source=source; | |
709 | ||
710 | if(source==sourceLimit && pArgs->flush) { | |
711 | /* handle truncated input */ | |
712 | switch(state) { | |
713 | case 0: | |
714 | break; /* no input at all, nothing to do */ | |
715 | case 8: | |
716 | _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); | |
717 | break; | |
718 | case 9: | |
719 | _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); | |
720 | break; | |
721 | default: | |
722 | /* handle 0<state<8: call UTF-16BE with too-short input */ | |
723 | pArgs->source=utf16BOM+(state&4); /* select the correct BOM */ | |
724 | pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */ | |
725 | ||
726 | /* no offsets: not enough for output */ | |
727 | _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); | |
728 | pArgs->source=source; | |
729 | pArgs->sourceLimit=sourceLimit; | |
730 | break; | |
731 | } | |
732 | cnv->mode=0; /* reset */ | |
733 | } else { | |
734 | cnv->mode=state; | |
735 | } | |
736 | } | |
737 | ||
738 | static UChar32 | |
739 | _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs, | |
740 | UErrorCode *pErrorCode) { | |
741 | switch(pArgs->converter->mode) { | |
742 | case 8: | |
743 | return T_UConverter_getNextUChar_UTF16_BE(pArgs, pErrorCode); | |
744 | case 9: | |
745 | return T_UConverter_getNextUChar_UTF16_LE(pArgs, pErrorCode); | |
746 | default: | |
747 | return ucnv_getNextUCharFromToUImpl(pArgs, _UTF16ToUnicodeWithOffsets, TRUE, pErrorCode); | |
748 | } | |
749 | } | |
750 | ||
751 | static const UConverterImpl _UTF16Impl = { | |
752 | UCNV_UTF16, | |
753 | ||
754 | NULL, | |
755 | NULL, | |
756 | ||
757 | _UTF16Open, | |
758 | NULL, | |
759 | _UTF16Reset, | |
760 | ||
761 | _UTF16ToUnicodeWithOffsets, | |
762 | _UTF16ToUnicodeWithOffsets, | |
763 | _UTF16PEFromUnicodeWithOffsets, | |
764 | _UTF16PEFromUnicodeWithOffsets, | |
765 | _UTF16GetNextUChar, | |
766 | ||
767 | NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ | |
768 | NULL, | |
769 | NULL, | |
770 | NULL, | |
771 | ucnv_getCompleteUnicodeSet | |
772 | }; | |
773 | ||
774 | static const UConverterStaticData _UTF16StaticData = { | |
775 | sizeof(UConverterStaticData), | |
776 | "UTF-16", | |
777 | 0, /* ### TODO review correctness of all Unicode CCSIDs */ | |
778 | UCNV_IBM, UCNV_UTF16, 2, 2, | |
779 | #if U_IS_BIG_ENDIAN | |
780 | { 0xff, 0xfd, 0, 0 }, 2, | |
781 | #else | |
782 | { 0xfd, 0xff, 0, 0 }, 2, | |
783 | #endif | |
784 | FALSE, FALSE, | |
785 | 0, | |
786 | 0, | |
787 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ | |
788 | }; | |
789 | ||
790 | const UConverterSharedData _UTF16Data = { | |
791 | sizeof(UConverterSharedData), ~((uint32_t) 0), | |
792 | NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl, | |
793 | 0 | |
794 | }; |