]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ********************************************************************** | |
46f4442e | 3 | * Copyright (C) 2002-2007, International Business Machines |
b75a7d8f A |
4 | * Corporation and others. All Rights Reserved. |
5 | ********************************************************************** | |
6 | * file name: ucnv_u8.c | |
7 | * encoding: US-ASCII | |
8 | * tab size: 8 (not used) | |
9 | * indentation:4 | |
10 | * | |
11 | * created on: 2002jul01 | |
12 | * created by: Markus W. Scherer | |
13 | * | |
14 | * UTF-8 converter implementation. Used to be in ucnv_utf.c. | |
15 | * | |
16 | * Also, CESU-8 implementation, see UTR 26. | |
17 | * The CESU-8 converter uses all the same functions as the | |
18 | * UTF-8 converter, with a branch for converting supplementary code points. | |
19 | */ | |
20 | ||
21 | #include "unicode/utypes.h" | |
374ca955 A |
22 | |
23 | #if !UCONFIG_NO_CONVERSION | |
24 | ||
b75a7d8f | 25 | #include "unicode/ucnv.h" |
b75a7d8f A |
26 | #include "ucnv_bld.h" |
27 | #include "ucnv_cnv.h" | |
28 | #include "cmemory.h" | |
29 | ||
30 | /* Prototypes --------------------------------------------------------------- */ | |
31 | ||
32 | /* Keep these here to make finicky compilers happy */ | |
33 | ||
374ca955 | 34 | U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args, |
b75a7d8f | 35 | UErrorCode *err); |
374ca955 | 36 | U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args, |
b75a7d8f | 37 | UErrorCode *err); |
b75a7d8f A |
38 | |
39 | ||
40 | /* UTF-8 -------------------------------------------------------------------- */ | |
41 | ||
42 | /* UTF-8 Conversion DATA | |
46f4442e | 43 | * for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9 |
b75a7d8f A |
44 | */ |
45 | /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/ | |
/* Upper bounds of the code point ranges tested by the converters. */
#define MAXIMUM_UCS2            0x0000FFFF
#define MAXIMUM_UTF             0x0010FFFF
#define MAXIMUM_UCS4            0x7FFFFFFF

/* Constants for splitting a supplementary code point into two surrogates. */
#define HALF_SHIFT              10
#define HALF_BASE               0x0010000
#define HALF_MASK               0x3FF

/* UTF-16 surrogate code unit ranges. */
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* HALF_BASE - SURROGATE_LOW_START, i.e. 0x10000 - 0xDC00 = 9216 */
#define SURROGATE_LOW_BASE      (HALF_BASE - SURROGATE_LOW_START)
59 | ||
/*
 * Indexed by UTF-8 sequence length (0..6): the value to subtract from a
 * code point accumulated as ((lead << 6) + trail) << 6 ... + trail in order
 * to strip the marker bits of the lead and trail bytes.
 */
static const uint32_t offsetsFromUTF8[7] = {
    0x00000000U,    /* 0: unused */
    0x00000000U,    /* 1 byte    */
    0x00003080U,    /* 2 bytes   */
    0x000E2080U,    /* 3 bytes   */
    0x03C82080U,    /* 4 bytes   */
    0xFA082080U,    /* 5 bytes   */
    0x82082080U     /* 6 bytes   */
};
64 | ||
65 | /* END OF UTF-8 Conversion DATA */ | |
66 | ||
/*
 * Indexed by the first byte of a sequence: total number of bytes in the
 * sequence that this byte introduces, or 0 if the byte cannot start a
 * sequence (0x80..0xBF, 0xFE and 0xFF).
 */
static const int8_t bytesFromUTF8[256] = {
    /* 0x00..0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: not a lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two-byte lead */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three-byte lead */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xF7: four, 0xF8..0xFB: five, 0xFC..0xFD: six, 0xFE..0xFF: none */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
77 | ||
/*
 * Since Unicode 3.0.1 a UTF-8 sequence of N bytes must encode a code point
 * that is at least utf8_minChar32[N] (shortest-form requirement).
 * Sequences longer than 4 bytes are illegal in UTF-8; their entries hold a
 * value that no accepted code point can reach, so the minimum test always
 * fails for them.
 */
static const uint32_t utf8_minChar32[7] = {
    0,              /* 0: unused */
    0,              /* 1 byte    */
    0x80,           /* 2 bytes   */
    0x800,          /* 3 bytes   */
    0x10000,        /* 4 bytes   */
    0xffffffff,     /* 5 bytes: always rejected */
    0xffffffff      /* 6 bytes: always rejected */
};
86 | ||
374ca955 | 87 | static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args, |
b75a7d8f A |
88 | UErrorCode * err) |
89 | { | |
46f4442e | 90 | UConverter *cnv = args->converter; |
b75a7d8f A |
91 | const unsigned char *mySource = (unsigned char *) args->source; |
92 | UChar *myTarget = args->target; | |
93 | const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; | |
94 | const UChar *targetLimit = args->targetLimit; | |
46f4442e A |
95 | unsigned char *toUBytes = cnv->toUBytes; |
96 | UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data); | |
b75a7d8f A |
97 | uint32_t ch, ch2 = 0; |
98 | int32_t i, inBytes; | |
99 | ||
100 | /* Restore size of current sequence */ | |
46f4442e | 101 | if (cnv->toUnicodeStatus && myTarget < targetLimit) |
b75a7d8f | 102 | { |
46f4442e A |
103 | inBytes = cnv->mode; /* restore # of bytes to consume */ |
104 | i = cnv->toULength; /* restore # of bytes consumed */ | |
105 | cnv->toULength = 0; | |
b75a7d8f | 106 | |
46f4442e A |
107 | ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/ |
108 | cnv->toUnicodeStatus = 0; | |
b75a7d8f A |
109 | goto morebytes; |
110 | } | |
111 | ||
112 | ||
113 | while (mySource < sourceLimit && myTarget < targetLimit) | |
114 | { | |
115 | ch = *(mySource++); | |
116 | if (ch < 0x80) /* Simple case */ | |
117 | { | |
118 | *(myTarget++) = (UChar) ch; | |
119 | } | |
120 | else | |
121 | { | |
122 | /* store the first char */ | |
123 | toUBytes[0] = (char)ch; | |
124 | inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */ | |
125 | i = 1; | |
126 | ||
127 | morebytes: | |
128 | while (i < inBytes) | |
129 | { | |
130 | if (mySource < sourceLimit) | |
131 | { | |
132 | toUBytes[i] = (char) (ch2 = *mySource); | |
133 | if (!UTF8_IS_TRAIL(ch2)) | |
134 | { | |
135 | break; /* i < inBytes */ | |
136 | } | |
137 | ch = (ch << 6) + ch2; | |
138 | ++mySource; | |
139 | i++; | |
140 | } | |
141 | else | |
142 | { | |
374ca955 | 143 | /* stores a partially calculated target*/ |
46f4442e A |
144 | cnv->toUnicodeStatus = ch; |
145 | cnv->mode = inBytes; | |
146 | cnv->toULength = (int8_t) i; | |
b75a7d8f A |
147 | goto donefornow; |
148 | } | |
149 | } | |
150 | ||
151 | /* Remove the accumulated high bits */ | |
152 | ch -= offsetsFromUTF8[inBytes]; | |
153 | ||
154 | /* | |
155 | * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: | |
156 | * - use only trail bytes after a lead byte (checked above) | |
157 | * - use the right number of trail bytes for a given lead byte | |
158 | * - encode a code point <= U+10ffff | |
159 | * - use the fewest possible number of bytes for their code points | |
160 | * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) | |
161 | * | |
162 | * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. | |
163 | * There are no irregular sequences any more. | |
164 | * In CESU-8, only surrogates, not supplementary code points, are encoded directly. | |
165 | */ | |
166 | if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] && | |
167 | (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch))) | |
168 | { | |
169 | /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ | |
170 | if (ch <= MAXIMUM_UCS2) | |
171 | { | |
172 | /* fits in 16 bits */ | |
173 | *(myTarget++) = (UChar) ch; | |
174 | } | |
175 | else | |
176 | { | |
177 | /* write out the surrogates */ | |
178 | ch -= HALF_BASE; | |
179 | *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START); | |
180 | ch = (ch & HALF_MASK) + SURROGATE_LOW_START; | |
181 | if (myTarget < targetLimit) | |
182 | { | |
183 | *(myTarget++) = (UChar)ch; | |
184 | } | |
185 | else | |
186 | { | |
187 | /* Put in overflow buffer (not handled here) */ | |
46f4442e A |
188 | cnv->UCharErrorBuffer[0] = (UChar) ch; |
189 | cnv->UCharErrorBufferLength = 1; | |
b75a7d8f A |
190 | *err = U_BUFFER_OVERFLOW_ERROR; |
191 | break; | |
192 | } | |
193 | } | |
194 | } | |
195 | else | |
196 | { | |
46f4442e | 197 | cnv->toULength = (int8_t)i; |
374ca955 A |
198 | *err = U_ILLEGAL_CHAR_FOUND; |
199 | break; | |
b75a7d8f A |
200 | } |
201 | } | |
202 | } | |
203 | ||
204 | donefornow: | |
205 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) | |
206 | { | |
207 | /* End of target buffer */ | |
208 | *err = U_BUFFER_OVERFLOW_ERROR; | |
209 | } | |
210 | ||
211 | args->target = myTarget; | |
212 | args->source = (const char *) mySource; | |
213 | } | |
214 | ||
374ca955 | 215 | static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args, |
b75a7d8f A |
216 | UErrorCode * err) |
217 | { | |
46f4442e | 218 | UConverter *cnv = args->converter; |
b75a7d8f A |
219 | const unsigned char *mySource = (unsigned char *) args->source; |
220 | UChar *myTarget = args->target; | |
221 | int32_t *myOffsets = args->offsets; | |
222 | int32_t offsetNum = 0; | |
223 | const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; | |
224 | const UChar *targetLimit = args->targetLimit; | |
46f4442e A |
225 | unsigned char *toUBytes = cnv->toUBytes; |
226 | UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data); | |
b75a7d8f A |
227 | uint32_t ch, ch2 = 0; |
228 | int32_t i, inBytes; | |
229 | ||
230 | /* Restore size of current sequence */ | |
46f4442e | 231 | if (cnv->toUnicodeStatus && myTarget < targetLimit) |
b75a7d8f | 232 | { |
46f4442e A |
233 | inBytes = cnv->mode; /* restore # of bytes to consume */ |
234 | i = cnv->toULength; /* restore # of bytes consumed */ | |
235 | cnv->toULength = 0; | |
b75a7d8f | 236 | |
46f4442e A |
237 | ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/ |
238 | cnv->toUnicodeStatus = 0; | |
b75a7d8f A |
239 | goto morebytes; |
240 | } | |
241 | ||
242 | while (mySource < sourceLimit && myTarget < targetLimit) | |
243 | { | |
244 | ch = *(mySource++); | |
245 | if (ch < 0x80) /* Simple case */ | |
246 | { | |
247 | *(myTarget++) = (UChar) ch; | |
248 | *(myOffsets++) = offsetNum++; | |
249 | } | |
250 | else | |
251 | { | |
252 | toUBytes[0] = (char)ch; | |
253 | inBytes = bytesFromUTF8[ch]; | |
254 | i = 1; | |
255 | ||
256 | morebytes: | |
257 | while (i < inBytes) | |
258 | { | |
259 | if (mySource < sourceLimit) | |
260 | { | |
261 | toUBytes[i] = (char) (ch2 = *mySource); | |
262 | if (!UTF8_IS_TRAIL(ch2)) | |
263 | { | |
264 | break; /* i < inBytes */ | |
265 | } | |
266 | ch = (ch << 6) + ch2; | |
267 | ++mySource; | |
268 | i++; | |
269 | } | |
270 | else | |
271 | { | |
46f4442e A |
272 | cnv->toUnicodeStatus = ch; |
273 | cnv->mode = inBytes; | |
274 | cnv->toULength = (int8_t)i; | |
b75a7d8f A |
275 | goto donefornow; |
276 | } | |
277 | } | |
278 | ||
279 | /* Remove the accumulated high bits */ | |
280 | ch -= offsetsFromUTF8[inBytes]; | |
281 | ||
282 | /* | |
283 | * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: | |
284 | * - use only trail bytes after a lead byte (checked above) | |
285 | * - use the right number of trail bytes for a given lead byte | |
286 | * - encode a code point <= U+10ffff | |
287 | * - use the fewest possible number of bytes for their code points | |
288 | * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) | |
289 | * | |
290 | * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. | |
291 | * There are no irregular sequences any more. | |
292 | * In CESU-8, only surrogates, not supplementary code points, are encoded directly. | |
293 | */ | |
294 | if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] && | |
295 | (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch))) | |
296 | { | |
297 | /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ | |
298 | if (ch <= MAXIMUM_UCS2) | |
299 | { | |
300 | /* fits in 16 bits */ | |
301 | *(myTarget++) = (UChar) ch; | |
302 | *(myOffsets++) = offsetNum; | |
303 | } | |
304 | else | |
305 | { | |
306 | /* write out the surrogates */ | |
307 | ch -= HALF_BASE; | |
308 | *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START); | |
309 | *(myOffsets++) = offsetNum; | |
310 | ch = (ch & HALF_MASK) + SURROGATE_LOW_START; | |
311 | if (myTarget < targetLimit) | |
312 | { | |
313 | *(myTarget++) = (UChar)ch; | |
314 | *(myOffsets++) = offsetNum; | |
315 | } | |
316 | else | |
317 | { | |
46f4442e A |
318 | cnv->UCharErrorBuffer[0] = (UChar) ch; |
319 | cnv->UCharErrorBufferLength = 1; | |
b75a7d8f A |
320 | *err = U_BUFFER_OVERFLOW_ERROR; |
321 | } | |
322 | } | |
323 | offsetNum += i; | |
324 | } | |
325 | else | |
326 | { | |
46f4442e | 327 | cnv->toULength = (int8_t)i; |
374ca955 A |
328 | *err = U_ILLEGAL_CHAR_FOUND; |
329 | break; | |
b75a7d8f A |
330 | } |
331 | } | |
332 | } | |
333 | ||
334 | donefornow: | |
335 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) | |
336 | { /* End of target buffer */ | |
337 | *err = U_BUFFER_OVERFLOW_ERROR; | |
338 | } | |
339 | ||
340 | args->target = myTarget; | |
341 | args->source = (const char *) mySource; | |
342 | args->offsets = myOffsets; | |
343 | } | |
344 | ||
374ca955 | 345 | U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args, |
b75a7d8f A |
346 | UErrorCode * err) |
347 | { | |
348 | UConverter *cnv = args->converter; | |
349 | const UChar *mySource = args->source; | |
b75a7d8f | 350 | const UChar *sourceLimit = args->sourceLimit; |
46f4442e A |
351 | uint8_t *myTarget = (uint8_t *) args->target; |
352 | const uint8_t *targetLimit = (uint8_t *) args->targetLimit; | |
353 | uint8_t *tempPtr; | |
73c04bcf | 354 | UChar32 ch; |
46f4442e A |
355 | uint8_t tempBuf[4]; |
356 | int32_t indexToWrite; | |
357 | UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data); | |
b75a7d8f | 358 | |
374ca955 | 359 | if (cnv->fromUChar32 && myTarget < targetLimit) |
b75a7d8f | 360 | { |
374ca955 A |
361 | ch = cnv->fromUChar32; |
362 | cnv->fromUChar32 = 0; | |
b75a7d8f A |
363 | goto lowsurrogate; |
364 | } | |
365 | ||
366 | while (mySource < sourceLimit && myTarget < targetLimit) | |
367 | { | |
368 | ch = *(mySource++); | |
369 | ||
370 | if (ch < 0x80) /* Single byte */ | |
371 | { | |
46f4442e | 372 | *(myTarget++) = (uint8_t) ch; |
b75a7d8f A |
373 | } |
374 | else if (ch < 0x800) /* Double byte */ | |
375 | { | |
46f4442e | 376 | *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); |
b75a7d8f A |
377 | if (myTarget < targetLimit) |
378 | { | |
46f4442e | 379 | *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); |
b75a7d8f A |
380 | } |
381 | else | |
382 | { | |
46f4442e | 383 | cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); |
b75a7d8f A |
384 | cnv->charErrorBufferLength = 1; |
385 | *err = U_BUFFER_OVERFLOW_ERROR; | |
386 | } | |
387 | } | |
46f4442e A |
388 | else { |
389 | /* Check for surrogates */ | |
390 | if(UTF_IS_SURROGATE(ch) && isNotCESU8) { | |
b75a7d8f | 391 | lowsurrogate: |
46f4442e A |
392 | if (mySource < sourceLimit) { |
393 | /* test both code units */ | |
394 | if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_SECOND_SURROGATE(*mySource)) { | |
395 | /* convert and consume this supplementary code point */ | |
396 | ch=UTF16_GET_PAIR_VALUE(ch, *mySource); | |
397 | ++mySource; | |
398 | /* exit this condition tree */ | |
399 | } | |
400 | else { | |
401 | /* this is an unpaired trail or lead code unit */ | |
402 | /* callback(illegal) */ | |
374ca955 | 403 | cnv->fromUChar32 = ch; |
46f4442e | 404 | *err = U_ILLEGAL_CHAR_FOUND; |
b75a7d8f A |
405 | break; |
406 | } | |
46f4442e A |
407 | } |
408 | else { | |
409 | /* no more input */ | |
374ca955 | 410 | cnv->fromUChar32 = ch; |
374ca955 | 411 | break; |
b75a7d8f A |
412 | } |
413 | } | |
414 | ||
46f4442e A |
415 | /* Do we write the buffer directly for speed, |
416 | or do we have to be careful about target buffer space? */ | |
417 | tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); | |
418 | ||
419 | if (ch <= MAXIMUM_UCS2) { | |
b75a7d8f | 420 | indexToWrite = 2; |
46f4442e | 421 | tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); |
b75a7d8f | 422 | } |
46f4442e | 423 | else { |
b75a7d8f | 424 | indexToWrite = 3; |
46f4442e A |
425 | tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); |
426 | tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); | |
b75a7d8f | 427 | } |
46f4442e A |
428 | tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); |
429 | tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); | |
b75a7d8f | 430 | |
46f4442e A |
431 | if (tempPtr == myTarget) { |
432 | /* There was enough space to write the codepoint directly. */ | |
433 | myTarget += (indexToWrite + 1); | |
434 | } | |
435 | else { | |
436 | /* We might run out of room soon. Write it slowly. */ | |
437 | for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { | |
438 | if (myTarget < targetLimit) { | |
439 | *(myTarget++) = *tempPtr; | |
440 | } | |
441 | else { | |
442 | cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; | |
443 | *err = U_BUFFER_OVERFLOW_ERROR; | |
444 | } | |
b75a7d8f A |
445 | } |
446 | } | |
447 | } | |
448 | } | |
449 | ||
450 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) | |
451 | { | |
452 | *err = U_BUFFER_OVERFLOW_ERROR; | |
453 | } | |
b75a7d8f A |
454 | |
455 | args->target = (char *) myTarget; | |
456 | args->source = mySource; | |
457 | } | |
458 | ||
374ca955 | 459 | U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, |
b75a7d8f A |
460 | UErrorCode * err) |
461 | { | |
462 | UConverter *cnv = args->converter; | |
463 | const UChar *mySource = args->source; | |
b75a7d8f A |
464 | int32_t *myOffsets = args->offsets; |
465 | const UChar *sourceLimit = args->sourceLimit; | |
46f4442e A |
466 | uint8_t *myTarget = (uint8_t *) args->target; |
467 | const uint8_t *targetLimit = (uint8_t *) args->targetLimit; | |
468 | uint8_t *tempPtr; | |
73c04bcf | 469 | UChar32 ch; |
b75a7d8f | 470 | int32_t offsetNum, nextSourceIndex; |
46f4442e A |
471 | int32_t indexToWrite; |
472 | uint8_t tempBuf[4]; | |
473 | UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data); | |
b75a7d8f | 474 | |
374ca955 | 475 | if (cnv->fromUChar32 && myTarget < targetLimit) |
b75a7d8f | 476 | { |
374ca955 A |
477 | ch = cnv->fromUChar32; |
478 | cnv->fromUChar32 = 0; | |
b75a7d8f A |
479 | offsetNum = -1; |
480 | nextSourceIndex = 0; | |
481 | goto lowsurrogate; | |
482 | } else { | |
483 | offsetNum = 0; | |
484 | } | |
485 | ||
486 | while (mySource < sourceLimit && myTarget < targetLimit) | |
487 | { | |
488 | ch = *(mySource++); | |
489 | ||
490 | if (ch < 0x80) /* Single byte */ | |
491 | { | |
492 | *(myOffsets++) = offsetNum++; | |
493 | *(myTarget++) = (char) ch; | |
494 | } | |
495 | else if (ch < 0x800) /* Double byte */ | |
496 | { | |
497 | *(myOffsets++) = offsetNum; | |
46f4442e | 498 | *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); |
b75a7d8f A |
499 | if (myTarget < targetLimit) |
500 | { | |
501 | *(myOffsets++) = offsetNum++; | |
46f4442e | 502 | *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); |
b75a7d8f A |
503 | } |
504 | else | |
505 | { | |
46f4442e | 506 | cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); |
b75a7d8f A |
507 | cnv->charErrorBufferLength = 1; |
508 | *err = U_BUFFER_OVERFLOW_ERROR; | |
509 | } | |
510 | } | |
511 | else | |
512 | /* Check for surrogates */ | |
513 | { | |
514 | nextSourceIndex = offsetNum + 1; | |
515 | ||
46f4442e | 516 | if(UTF_IS_SURROGATE(ch) && isNotCESU8) { |
b75a7d8f | 517 | lowsurrogate: |
46f4442e A |
518 | if (mySource < sourceLimit) { |
519 | /* test both code units */ | |
520 | if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_SECOND_SURROGATE(*mySource)) { | |
521 | /* convert and consume this supplementary code point */ | |
522 | ch=UTF16_GET_PAIR_VALUE(ch, *mySource); | |
523 | ++mySource; | |
524 | ++nextSourceIndex; | |
525 | /* exit this condition tree */ | |
526 | } | |
527 | else { | |
528 | /* this is an unpaired trail or lead code unit */ | |
529 | /* callback(illegal) */ | |
374ca955 | 530 | cnv->fromUChar32 = ch; |
46f4442e | 531 | *err = U_ILLEGAL_CHAR_FOUND; |
b75a7d8f A |
532 | break; |
533 | } | |
46f4442e A |
534 | } |
535 | else { | |
536 | /* no more input */ | |
374ca955 | 537 | cnv->fromUChar32 = ch; |
374ca955 | 538 | break; |
b75a7d8f A |
539 | } |
540 | } | |
541 | ||
46f4442e A |
542 | /* Do we write the buffer directly for speed, |
543 | or do we have to be careful about target buffer space? */ | |
544 | tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); | |
545 | ||
546 | if (ch <= MAXIMUM_UCS2) { | |
b75a7d8f | 547 | indexToWrite = 2; |
46f4442e | 548 | tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); |
b75a7d8f | 549 | } |
46f4442e | 550 | else { |
b75a7d8f | 551 | indexToWrite = 3; |
46f4442e A |
552 | tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); |
553 | tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); | |
b75a7d8f | 554 | } |
46f4442e A |
555 | tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); |
556 | tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); | |
557 | ||
558 | if (tempPtr == myTarget) { | |
559 | /* There was enough space to write the codepoint directly. */ | |
560 | myTarget += (indexToWrite + 1); | |
561 | myOffsets[0] = offsetNum; | |
562 | myOffsets[1] = offsetNum; | |
563 | myOffsets[2] = offsetNum; | |
564 | if (indexToWrite >= 3) { | |
565 | myOffsets[3] = offsetNum; | |
b75a7d8f | 566 | } |
46f4442e A |
567 | myOffsets += (indexToWrite + 1); |
568 | } | |
569 | else { | |
570 | /* We might run out of room soon. Write it slowly. */ | |
571 | for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { | |
572 | if (myTarget < targetLimit) | |
573 | { | |
574 | *(myOffsets++) = offsetNum; | |
575 | *(myTarget++) = *tempPtr; | |
576 | } | |
577 | else | |
578 | { | |
579 | cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; | |
580 | *err = U_BUFFER_OVERFLOW_ERROR; | |
581 | } | |
b75a7d8f A |
582 | } |
583 | } | |
584 | offsetNum = nextSourceIndex; | |
585 | } | |
586 | } | |
587 | ||
588 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) | |
589 | { | |
590 | *err = U_BUFFER_OVERFLOW_ERROR; | |
591 | } | |
b75a7d8f A |
592 | |
593 | args->target = (char *) myTarget; | |
594 | args->source = mySource; | |
595 | args->offsets = myOffsets; | |
596 | } | |
597 | ||
374ca955 | 598 | static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args, |
b75a7d8f | 599 | UErrorCode *err) { |
374ca955 A |
600 | UConverter *cnv; |
601 | const uint8_t *sourceInitial; | |
b75a7d8f | 602 | const uint8_t *source; |
b75a7d8f A |
603 | uint16_t extraBytesToWrite; |
604 | uint8_t myByte; | |
605 | UChar32 ch; | |
374ca955 | 606 | int8_t i, isLegalSequence; |
b75a7d8f | 607 | |
374ca955 A |
608 | /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */ |
609 | ||
610 | cnv = args->converter; | |
611 | sourceInitial = source = (const uint8_t *)args->source; | |
612 | if (source >= (const uint8_t *)args->sourceLimit) | |
b75a7d8f | 613 | { |
374ca955 A |
614 | /* no input */ |
615 | *err = U_INDEX_OUTOFBOUNDS_ERROR; | |
616 | return 0xffff; | |
617 | } | |
b75a7d8f | 618 | |
374ca955 A |
619 | myByte = (uint8_t)*(source++); |
620 | if (myByte < 0x80) | |
621 | { | |
622 | args->source = (const char *)source; | |
623 | return (UChar32)myByte; | |
624 | } | |
b75a7d8f | 625 | |
374ca955 A |
626 | extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte]; |
627 | if (extraBytesToWrite == 0) { | |
628 | cnv->toUBytes[0] = myByte; | |
629 | cnv->toULength = 1; | |
630 | *err = U_ILLEGAL_CHAR_FOUND; | |
b75a7d8f | 631 | args->source = (const char *)source; |
374ca955 A |
632 | return 0xffff; |
633 | } | |
b75a7d8f | 634 | |
374ca955 A |
635 | /*The byte sequence is longer than the buffer area passed*/ |
636 | if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit) | |
637 | { | |
638 | /* check if all of the remaining bytes are trail bytes */ | |
639 | cnv->toUBytes[0] = myByte; | |
640 | i = 1; | |
641 | *err = U_TRUNCATED_CHAR_FOUND; | |
642 | while(source < (const uint8_t *)args->sourceLimit) { | |
643 | if(U8_IS_TRAIL(myByte = *source)) { | |
644 | cnv->toUBytes[i++] = myByte; | |
645 | ++source; | |
b75a7d8f | 646 | } else { |
374ca955 A |
647 | /* error even before we run out of input */ |
648 | *err = U_ILLEGAL_CHAR_FOUND; | |
649 | break; | |
b75a7d8f A |
650 | } |
651 | } | |
374ca955 A |
652 | cnv->toULength = i; |
653 | args->source = (const char *)source; | |
654 | return 0xffff; | |
655 | } | |
b75a7d8f | 656 | |
374ca955 A |
657 | isLegalSequence = 1; |
658 | ch = myByte << 6; | |
659 | switch(extraBytesToWrite) | |
660 | { | |
661 | /* note: code falls through cases! (sic)*/ | |
662 | case 6: | |
663 | ch += (myByte = *source); | |
664 | ch <<= 6; | |
665 | if (!UTF8_IS_TRAIL(myByte)) | |
666 | { | |
667 | isLegalSequence = 0; | |
668 | break; | |
b75a7d8f | 669 | } |
374ca955 A |
670 | ++source; |
671 | case 5: | |
672 | ch += (myByte = *source); | |
673 | ch <<= 6; | |
674 | if (!UTF8_IS_TRAIL(myByte)) | |
675 | { | |
676 | isLegalSequence = 0; | |
677 | break; | |
678 | } | |
679 | ++source; | |
680 | case 4: | |
681 | ch += (myByte = *source); | |
682 | ch <<= 6; | |
683 | if (!UTF8_IS_TRAIL(myByte)) | |
684 | { | |
685 | isLegalSequence = 0; | |
686 | break; | |
687 | } | |
688 | ++source; | |
689 | case 3: | |
690 | ch += (myByte = *source); | |
691 | ch <<= 6; | |
692 | if (!UTF8_IS_TRAIL(myByte)) | |
693 | { | |
694 | isLegalSequence = 0; | |
695 | break; | |
696 | } | |
697 | ++source; | |
698 | case 2: | |
699 | ch += (myByte = *source); | |
700 | if (!UTF8_IS_TRAIL(myByte)) | |
701 | { | |
702 | isLegalSequence = 0; | |
703 | break; | |
704 | } | |
705 | ++source; | |
706 | }; | |
707 | ch -= offsetsFromUTF8[extraBytesToWrite]; | |
708 | args->source = (const char *)source; | |
709 | ||
710 | /* | |
711 | * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: | |
712 | * - use only trail bytes after a lead byte (checked above) | |
713 | * - use the right number of trail bytes for a given lead byte | |
714 | * - encode a code point <= U+10ffff | |
715 | * - use the fewest possible number of bytes for their code points | |
716 | * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) | |
717 | * | |
718 | * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. | |
719 | * There are no irregular sequences any more. | |
720 | */ | |
721 | if (isLegalSequence && | |
722 | (uint32_t)ch <= MAXIMUM_UTF && | |
723 | (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] && | |
724 | !U_IS_SURROGATE(ch) | |
725 | ) { | |
726 | return ch; /* return the code point */ | |
b75a7d8f A |
727 | } |
728 | ||
374ca955 A |
729 | for(i = 0; sourceInitial < source; ++i) { |
730 | cnv->toUBytes[i] = *sourceInitial++; | |
731 | } | |
732 | cnv->toULength = i; | |
733 | *err = U_ILLEGAL_CHAR_FOUND; | |
b75a7d8f A |
734 | return 0xffff; |
735 | } | |
736 | ||
46f4442e A |
737 | /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */ |
738 | ||
739 | /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */ | |
740 | static const UChar32 | |
741 | utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 }; | |
742 | ||
743 | /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */ | |
744 | static const UChar32 | |
745 | utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 }; | |
746 | ||
747 | /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */ | |
748 | static void | |
749 | ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, | |
750 | UConverterToUnicodeArgs *pToUArgs, | |
751 | UErrorCode *pErrorCode) { | |
752 | UConverter *utf8, *cnv; | |
753 | const uint8_t *source, *sourceLimit; | |
754 | uint8_t *target; | |
755 | int32_t targetCapacity; | |
756 | int32_t count; | |
757 | ||
758 | int8_t oldToULength, toULength, toULimit; | |
759 | ||
760 | UChar32 c; | |
761 | uint8_t b, t1, t2; | |
762 | ||
763 | /* set up the local pointers */ | |
764 | utf8=pToUArgs->converter; | |
765 | cnv=pFromUArgs->converter; | |
766 | source=(uint8_t *)pToUArgs->source; | |
767 | sourceLimit=(uint8_t *)pToUArgs->sourceLimit; | |
768 | target=(uint8_t *)pFromUArgs->target; | |
769 | targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); | |
770 | ||
771 | /* get the converter state from the UTF-8 UConverter */ | |
772 | c=(UChar32)utf8->toUnicodeStatus; | |
773 | if(c!=0) { | |
774 | toULength=oldToULength=utf8->toULength; | |
775 | toULimit=(int8_t)utf8->mode; | |
776 | } else { | |
777 | toULength=oldToULength=toULimit=0; | |
778 | } | |
779 | ||
780 | count=(int32_t)(sourceLimit-source)+oldToULength; | |
781 | if(count<toULimit) { | |
782 | /* | |
783 | * Not enough input to complete the partial character. | |
784 | * Jump to moreBytes below - it will not output to target. | |
785 | */ | |
786 | } else if(targetCapacity<toULimit) { | |
787 | /* | |
788 | * Not enough target capacity to output the partial character. | |
789 | * Let the standard converter handle this. | |
790 | */ | |
791 | *pErrorCode=U_USING_DEFAULT_WARNING; | |
792 | return; | |
793 | } else { | |
794 | /* | |
795 | * Use a single counter for source and target, counting the minimum of | |
796 | * the source length and the target capacity. | |
797 | * As a result, the source length is checked only once per multi-byte | |
798 | * character instead of twice. | |
799 | * | |
800 | * Make sure that the last byte sequence is complete, or else | |
801 | * stop just before it. | |
802 | * (The longest legal byte sequence has 3 trail bytes.) | |
803 | * Count oldToULength (number of source bytes from a previous buffer) | |
804 | * into the source length but reduce the source index by toULimit | |
805 | * while going back over trail bytes in order to not go back into | |
806 | * the bytes that will be read for finishing a partial | |
807 | * sequence from the previous buffer. | |
808 | * Let the standard converter handle edge cases. | |
809 | */ | |
810 | int32_t i; | |
811 | ||
812 | if(count>targetCapacity) { | |
813 | count=targetCapacity; | |
814 | } | |
815 | ||
816 | i=0; | |
817 | while(i<3 && i<(count-toULimit)) { | |
818 | b=source[count-oldToULength-i-1]; | |
819 | if(U8_IS_TRAIL(b)) { | |
820 | ++i; | |
821 | } else { | |
822 | if(i<utf8_countTrailBytes[b]) { | |
823 | /* stop converting before the lead byte if there are not enough trail bytes for it */ | |
824 | count-=i+1; | |
825 | } | |
826 | break; | |
827 | } | |
828 | } | |
829 | } | |
830 | ||
831 | if(c!=0) { | |
832 | utf8->toUnicodeStatus=0; | |
833 | utf8->toULength=0; | |
834 | goto moreBytes; | |
835 | /* See note in ucnv_SBCSFromUTF8() about this goto. */ | |
836 | } | |
837 | ||
838 | /* conversion loop */ | |
839 | while(count>0) { | |
840 | b=*source++; | |
841 | if((int8_t)b>=0) { | |
842 | /* convert ASCII */ | |
843 | *target++=b; | |
844 | --count; | |
845 | continue; | |
846 | } else { | |
847 | if(b>0xe0) { | |
848 | if( /* handle U+1000..U+D7FF inline */ | |
849 | (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) || | |
850 | (b==0xed && (t1 <= 0x9f))) && | |
851 | (t2=source[1]) >= 0x80 && t2 <= 0xbf | |
852 | ) { | |
853 | source+=2; | |
854 | *target++=b; | |
855 | *target++=t1; | |
856 | *target++=t2; | |
857 | count-=3; | |
858 | continue; | |
859 | } | |
860 | } else if(b<0xe0) { | |
861 | if( /* handle U+0080..U+07FF inline */ | |
862 | b>=0xc2 && | |
863 | (t1=*source) >= 0x80 && t1 <= 0xbf | |
864 | ) { | |
865 | ++source; | |
866 | *target++=b; | |
867 | *target++=t1; | |
868 | count-=2; | |
869 | continue; | |
870 | } | |
871 | } else if(b==0xe0) { | |
872 | if( /* handle U+0800..U+0FFF inline */ | |
873 | (t1=source[0]) >= 0xa0 && t1 <= 0xbf && | |
874 | (t2=source[1]) >= 0x80 && t2 <= 0xbf | |
875 | ) { | |
876 | source+=2; | |
877 | *target++=b; | |
878 | *target++=t1; | |
879 | *target++=t2; | |
880 | count-=3; | |
881 | continue; | |
882 | } | |
883 | } | |
884 | ||
885 | /* handle "complicated" and error cases, and continuing partial characters */ | |
886 | oldToULength=0; | |
887 | toULength=1; | |
888 | toULimit=utf8_countTrailBytes[b]+1; | |
889 | c=b; | |
890 | moreBytes: | |
891 | while(toULength<toULimit) { | |
892 | if(source<sourceLimit) { | |
893 | b=*source; | |
894 | if(U8_IS_TRAIL(b)) { | |
895 | ++source; | |
896 | ++toULength; | |
897 | c=(c<<6)+b; | |
898 | } else { | |
899 | break; /* sequence too short, stop with toULength<toULimit */ | |
900 | } | |
901 | } else { | |
902 | /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ | |
903 | source-=(toULength-oldToULength); | |
904 | while(oldToULength<toULength) { | |
905 | utf8->toUBytes[oldToULength++]=*source++; | |
906 | } | |
907 | utf8->toUnicodeStatus=c; | |
908 | utf8->toULength=toULength; | |
909 | utf8->mode=toULimit; | |
910 | pToUArgs->source=(char *)source; | |
911 | pFromUArgs->target=(char *)target; | |
912 | return; | |
913 | } | |
914 | } | |
915 | ||
916 | if( toULength==toULimit && /* consumed all trail bytes */ | |
917 | (toULength==3 || toULength==2) && /* BMP */ | |
918 | (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && | |
919 | (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ | |
920 | ) { | |
921 | /* legal byte sequence for BMP code point */ | |
922 | } else if( | |
923 | toULength==toULimit && toULength==4 && | |
924 | (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) | |
925 | ) { | |
926 | /* legal byte sequence for supplementary code point */ | |
927 | } else { | |
928 | /* error handling: illegal UTF-8 byte sequence */ | |
929 | source-=(toULength-oldToULength); | |
930 | while(oldToULength<toULength) { | |
931 | utf8->toUBytes[oldToULength++]=*source++; | |
932 | } | |
933 | utf8->toULength=toULength; | |
934 | pToUArgs->source=(char *)source; | |
935 | pFromUArgs->target=(char *)target; | |
936 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; | |
937 | return; | |
938 | } | |
939 | ||
940 | /* copy the legal byte sequence to the target */ | |
941 | { | |
942 | int8_t i; | |
943 | ||
944 | for(i=0; i<oldToULength; ++i) { | |
945 | *target++=utf8->toUBytes[i]; | |
946 | } | |
947 | source-=(toULength-oldToULength); | |
948 | for(; i<toULength; ++i) { | |
949 | *target++=*source++; | |
950 | } | |
951 | count-=toULength; | |
952 | } | |
953 | } | |
954 | } | |
955 | ||
956 | if(U_SUCCESS(*pErrorCode) && source<sourceLimit) { | |
957 | if(target==(const uint8_t *)pFromUArgs->targetLimit) { | |
958 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
959 | } else { | |
960 | b=*source; | |
961 | toULimit=utf8_countTrailBytes[b]+1; | |
962 | if(toULimit>(sourceLimit-source)) { | |
963 | /* collect a truncated byte sequence */ | |
964 | toULength=0; | |
965 | c=b; | |
966 | for(;;) { | |
967 | utf8->toUBytes[toULength++]=b; | |
968 | if(++source==sourceLimit) { | |
969 | /* partial byte sequence at end of source */ | |
970 | utf8->toUnicodeStatus=c; | |
971 | utf8->toULength=toULength; | |
972 | utf8->mode=toULimit; | |
973 | break; | |
974 | } else if(!U8_IS_TRAIL(b=*source)) { | |
975 | /* lead byte in trail byte position */ | |
976 | utf8->toULength=toULength; | |
977 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; | |
978 | break; | |
979 | } | |
980 | c=(c<<6)+b; | |
981 | } | |
982 | } else { | |
983 | /* partial-sequence target overflow: fall back to the pivoting implementation */ | |
984 | *pErrorCode=U_USING_DEFAULT_WARNING; | |
985 | } | |
986 | } | |
987 | } | |
988 | ||
989 | /* write back the updated pointers */ | |
990 | pToUArgs->source=(char *)source; | |
991 | pFromUArgs->target=(char *)target; | |
992 | } | |
993 | ||
b75a7d8f A |
994 | /* UTF-8 converter data ----------------------------------------------------- */ |
995 | ||
996 | static const UConverterImpl _UTF8Impl={ /* Function table for the UTF-8 converter, referenced by _UTF8Data. Positional initializer: entry order must match the UConverterImpl declaration, which is outside this file. */ | |
997 | UCNV_UTF8, /* converter type constant */ | |
998 | ||
999 | NULL, /* two slots with no function supplied (presumably load/unload; confirm against UConverterImpl) */ | |
1000 | NULL, | |
1001 | ||
1002 | NULL, /* three slots with no function supplied (presumably open/close/reset; confirm against UConverterImpl) */ | |
1003 | NULL, | |
1004 | NULL, | |
1005 | ||
374ca955 A |
1006 | ucnv_toUnicode_UTF8, /* conversion entry points: to Unicode, then from Unicode; judging by the names, each is followed by its offsets-tracking variant */ |
1007 | ucnv_toUnicode_UTF8_OFFSETS_LOGIC, | |
1008 | ucnv_fromUnicode_UTF8, | |
1009 | ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, | |
1010 | ucnv_getNextUChar_UTF8, /* UTF-8-specific getNextUChar; _CESU8Impl leaves this slot NULL */ | |
b75a7d8f A |
1011 | |
1012 | NULL, /* four slots with no function supplied (slot names are not visible in this file) */ | |
1013 | NULL, | |
1014 | NULL, | |
1015 | NULL, | |
46f4442e A |
1016 | ucnv_getNonSurrogateUnicodeSet, /* Unicode-set callback; _CESU8Impl uses ucnv_getCompleteUnicodeSet in this slot instead */ |
1017 | ||
1018 | ucnv_UTF8FromUTF8, /* the same UTF-8-to-UTF-8 routine fills the last two slots (presumably toUTF8 and fromUTF8; confirm) */ | |
1019 | ucnv_UTF8FromUTF8 | |
b75a7d8f A |
1020 | }; |
1021 | ||
1022 | /* CCSID 1208 refers to UTF-8 for any version of Unicode */ | |
1023 | static const UConverterStaticData _UTF8StaticData={ /* Constant descriptive data for the UTF-8 converter, referenced by _UTF8Data. Positional initializer; field names are declared outside this file. */ | |
1024 | sizeof(UConverterStaticData), /* size of this structure */ | |
1025 | "UTF-8", /* converter name */ | |
374ca955 A |
1026 | 1208, UCNV_IBM, UCNV_UTF8, /* CCSID 1208, IBM platform constant, UTF-8 converter type constant */ |
1027 | 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ | |
b75a7d8f A |
1028 | { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, /* EF BF BD is the UTF-8 encoding of U+FFFD; presumably the substitution sequence and its length (3) -- confirm against UConverterStaticData */ |
1029 | 0, | |
1030 | 0, | |
1031 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ | |
1032 | }; | |
1033 | ||
1034 | ||
1035 | const UConverterSharedData _UTF8Data={ /* Shared-data record for the UTF-8 converter; not static, so it has external linkage. Ties together _UTF8StaticData and _UTF8Impl. */ | |
1036 | sizeof(UConverterSharedData), ~((uint32_t) 0), /* structure size, then an all-ones 32-bit value (presumably a reference counter that must never reach zero -- confirm) */ | |
1037 | NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl, /* pointers to the static data and function table defined in this file; the NULL and FALSE slots are not named here */ | |
1038 | 0 | |
1039 | }; | |
1040 | ||
1041 | /* CESU-8 converter data ---------------------------------------------------- */ | |
1042 | ||
374ca955 A |
1043 | static const UConverterImpl _CESU8Impl={ /* Function table for the CESU-8 converter, referenced by _CESU8Data. It reuses the UTF-8 conversion functions. Positional initializer with fewer entries than _UTF8Impl, so the remaining trailing members are implicitly zero-initialized. */ |
1044 | UCNV_CESU8, /* converter type constant */ | |
1045 | ||
1046 | NULL, /* two slots with no function supplied (presumably load/unload; confirm against UConverterImpl) */ | |
1047 | NULL, | |
1048 | ||
1049 | NULL, /* three slots with no function supplied (presumably open/close/reset; confirm against UConverterImpl) */ | |
1050 | NULL, | |
1051 | NULL, | |
1052 | ||
1053 | ucnv_toUnicode_UTF8, /* same four to/from-Unicode functions as in _UTF8Impl */ | |
1054 | ucnv_toUnicode_UTF8_OFFSETS_LOGIC, | |
1055 | ucnv_fromUnicode_UTF8, | |
1056 | ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, | |
1057 | NULL, /* no converter-specific function here, unlike _UTF8Impl, which supplies ucnv_getNextUChar_UTF8 */ | |
1058 | ||
1059 | NULL, /* four slots with no function supplied (slot names are not visible in this file) */ | |
1060 | NULL, | |
1061 | NULL, | |
1062 | NULL, | |
1063 | ucnv_getCompleteUnicodeSet /* Unicode-set callback; _UTF8Impl uses ucnv_getNonSurrogateUnicodeSet in this slot instead */ | |
1064 | }; | |
1065 | ||
b75a7d8f A |
1066 | static const UConverterStaticData _CESU8StaticData={ /* Constant descriptive data for the CESU-8 converter, referenced by _CESU8Data. Positional initializer; field names are declared outside this file. */ |
1067 | sizeof(UConverterStaticData), /* size of this structure */ | |
1068 | "CESU-8", /* converter name */ | |
73c04bcf A |
1069 | 9400, /* CCSID for CESU-8 */ |
1070 | UCNV_UNKNOWN, UCNV_CESU8, 1, 3, /* platform constant, converter type constant, then the same 1 and 3 values that _UTF8StaticData uses */ | |
b75a7d8f A |
1071 | { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, /* EF BF BD is the UTF-8 encoding of U+FFFD; presumably the substitution sequence and its length (3) -- confirm against UConverterStaticData */ |
1072 | 0, | |
1073 | 0, | |
1074 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ | |
1075 | }; | |
1076 | ||
1077 | ||
1078 | const UConverterSharedData _CESU8Data={ /* Shared-data record for the CESU-8 converter; not static, so it has external linkage. Ties together _CESU8StaticData and _CESU8Impl. */ | |
1079 | sizeof(UConverterSharedData), ~((uint32_t) 0), /* structure size, then an all-ones 32-bit value (presumably a reference counter that must never reach zero -- confirm) */ | |
374ca955 | 1080 | NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl, /* pointers to the static data and function table defined in this file; the NULL and FALSE slots are not named here */ |
b75a7d8f A |
1081 | 0 |
1082 | }; | |
374ca955 A |
1083 | |
1084 | #endif |