]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f A |
3 | /* |
4 | ********************************************************************** | |
b331163b | 5 | * Copyright (C) 2000-2015, International Business Machines |
b75a7d8f A |
6 | * Corporation and others. All Rights Reserved. |
7 | ********************************************************************** | |
8 | * file name: ucnvhz.c | |
f3c0d7a5 | 9 | * encoding: UTF-8 |
b75a7d8f A |
10 | * tab size: 8 (not used) |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2000oct16 | |
14 | * created by: Ram Viswanadha | |
15 | * 10/31/2000 Ram Implemented offsets logic function | |
16 | * | |
17 | */ | |
18 | ||
19 | #include "unicode/utypes.h" | |
20 | ||
b331163b | 21 | #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION |
b75a7d8f A |
22 | |
23 | #include "cmemory.h" | |
b75a7d8f A |
24 | #include "unicode/ucnv.h" |
25 | #include "unicode/ucnv_cb.h" | |
26 | #include "unicode/uset.h" | |
4388f060 | 27 | #include "unicode/utf16.h" |
b75a7d8f A |
28 | #include "ucnv_bld.h" |
29 | #include "ucnv_cnv.h" | |
729e4ab9 | 30 | #include "ucnv_imp.h" |
b75a7d8f A |
31 | |
/* Individual bytes used in the HZ escape sequences. */
#define UCNV_TILDE 0x7E /* ~ */
#define UCNV_OPEN_BRACE 0x7B /* { */
#define UCNV_CLOSE_BRACE 0x7D /* } */
/* Two-byte escape sequences written by the fromUnicode code in this file. */
#define SB_ESCAPE "\x7E\x7D" /* "~}": written before switching to single-byte output */
#define DB_ESCAPE "\x7E\x7B" /* "~{": written before switching to double-byte output */
#define TILDE_ESCAPE "\x7E\x7E" /* "~~": written for a literal tilde */
#define ESC_LEN 2 /* number of bytes in each of the escape sequences above */
39 | ||
40 | ||
/*
 * Appends the next `len` bytes of `strToAppend` to the conversion output.
 *
 * Bytes that still fit (targetIndex < targetLength) are stored in
 * args->target[targetIndex]; if args->offsets is not NULL, sourceIndex-1 is
 * recorded for each such byte. Bytes that do not fit are stored in the
 * converter's charErrorBuffer and *err is set to U_BUFFER_OVERFLOW_ERROR.
 *
 * Side effects on the arguments: targetIndex is advanced for every byte
 * written to the target, strToAppend is advanced past the consumed bytes,
 * and len is counted down (it ends at -1).
 *
 * NOTE: the macro writes offsets through a variable named `offsets` that must
 * exist in the caller's scope (it is not a macro parameter) and advances it.
 *
 * The body is wrapped in do { ... } while (0) so that the invocation followed
 * by a semicolon is a single statement, also inside an unbraced if/else.
 */
#define CONCAT_ESCAPE_MACRO(args, targetIndex, targetLength, strToAppend, err, len, sourceIndex) \
    do { \
        while ((len)-- > 0) { \
            if ((targetIndex) < (targetLength)) { \
                (args)->target[(targetIndex)] = (unsigned char) *(strToAppend); \
                if ((args)->offsets != NULL) { \
                    *(offsets++) = (sourceIndex) - 1; \
                } \
                (targetIndex)++; \
            } else { \
                (args)->converter->charErrorBuffer[(int)(args)->converter->charErrorBufferLength++] = \
                    (unsigned char) *(strToAppend); \
                *(err) = U_BUFFER_OVERFLOW_ERROR; \
            } \
            (strToAppend)++; \
        } \
    } while (0)
57 | ||
58 | ||
/* Per-converter HZ state; an instance is stored in UConverter.extraInfo by _HZOpen(). */
typedef struct{
    UConverter* gbConverter;    /* "GBK" sub-converter opened in _HZOpen(); its shared data supplies the double-byte mappings */
    int32_t targetIndex;        /* reset to 0 by _HZReset(); not otherwise used in this file */
    int32_t sourceIndex;        /* reset to 0 by _HZReset(); not otherwise used in this file */
    UBool isEscapeAppended;     /* fromUnicode: set to TRUE once a "~{" or "~}" escape has been written */
    UBool isStateDBCS;          /* toUnicode: TRUE after "~{" was read, FALSE after "~}" */
    UBool isTargetUCharDBCS;    /* fromUnicode: whether the most recently converted character was double-byte */
    UBool isEmptySegment;       /* toUnicode: TRUE right after "~{" or "~}" until the segment is found to contain something */
}UConverterDataHZ;
68 | ||
69 | ||
f3c0d7a5 A |
70 | U_CDECL_BEGIN |
71 | static void U_CALLCONV | |
729e4ab9 A |
72 | _HZOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ |
73 | UConverter *gbConverter; | |
74 | if(pArgs->onlyTestIsLoadable) { | |
75 | ucnv_canCreateConverter("GBK", errorCode); /* errorCode carries result */ | |
76 | return; | |
77 | } | |
78 | gbConverter = ucnv_open("GBK", errorCode); | |
79 | if(U_FAILURE(*errorCode)) { | |
80 | return; | |
81 | } | |
b75a7d8f A |
82 | cnv->toUnicodeStatus = 0; |
83 | cnv->fromUnicodeStatus= 0; | |
84 | cnv->mode=0; | |
374ca955 | 85 | cnv->fromUChar32=0x0000; |
4388f060 | 86 | cnv->extraInfo = uprv_calloc(1, sizeof(UConverterDataHZ)); |
b75a7d8f | 87 | if(cnv->extraInfo != NULL){ |
729e4ab9 | 88 | ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = gbConverter; |
b75a7d8f | 89 | } |
b75a7d8f | 90 | else { |
729e4ab9 | 91 | ucnv_close(gbConverter); |
b75a7d8f A |
92 | *errorCode = U_MEMORY_ALLOCATION_ERROR; |
93 | return; | |
94 | } | |
95 | } | |
96 | ||
f3c0d7a5 | 97 | static void U_CALLCONV |
b75a7d8f A |
98 | _HZClose(UConverter *cnv){ |
99 | if(cnv->extraInfo != NULL) { | |
100 | ucnv_close (((UConverterDataHZ *) (cnv->extraInfo))->gbConverter); | |
101 | if(!cnv->isExtraLocal) { | |
102 | uprv_free(cnv->extraInfo); | |
103 | } | |
104 | cnv->extraInfo = NULL; | |
105 | } | |
106 | } | |
107 | ||
f3c0d7a5 | 108 | static void U_CALLCONV |
b75a7d8f A |
109 | _HZReset(UConverter *cnv, UConverterResetChoice choice){ |
110 | if(choice<=UCNV_RESET_TO_UNICODE) { | |
111 | cnv->toUnicodeStatus = 0; | |
112 | cnv->mode=0; | |
113 | if(cnv->extraInfo != NULL){ | |
114 | ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE; | |
d5d484b0 | 115 | ((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE; |
b75a7d8f A |
116 | } |
117 | } | |
118 | if(choice!=UCNV_RESET_TO_UNICODE) { | |
119 | cnv->fromUnicodeStatus= 0; | |
374ca955 | 120 | cnv->fromUChar32=0x0000; |
b75a7d8f A |
121 | if(cnv->extraInfo != NULL){ |
122 | ((UConverterDataHZ*)cnv->extraInfo)->isEscapeAppended = FALSE; | |
123 | ((UConverterDataHZ*)cnv->extraInfo)->targetIndex = 0; | |
124 | ((UConverterDataHZ*)cnv->extraInfo)->sourceIndex = 0; | |
125 | ((UConverterDataHZ*)cnv->extraInfo)->isTargetUCharDBCS = FALSE; | |
126 | } | |
127 | } | |
128 | } | |
129 | ||
130 | /**************************************HZ Encoding************************************************* | |
131 | * Rules for HZ encoding | |
132 | * | |
133 | * In ASCII mode, a byte is interpreted as an ASCII character, unless a | |
134 | * '~' is encountered. The character '~' is an escape character. By | |
135 | * convention, it must be immediately followed ONLY by '~', '{' or '\n' | |
136 | * (<LF>), with the following special meaning. | |
137 | ||
138 | * 1. The escape sequence '~~' is interpreted as a '~'. | |
139 | * 2. The escape-to-GB sequence '~{' switches the mode from ASCII to GB. | |
140 | * 3. The escape sequence '~\n' is a line-continuation marker to be | |
141 | * consumed with no output produced. | |
142 | * In GB mode, characters are interpreted two bytes at a time as (pure) | |
143 | * GB codes until the escape-from-GB code '~}' is read. This code | |
144 | * switches the mode from GB back to ASCII. (Note that the escape- | |
145 | * from-GB code '~}' ($7E7D) is outside the defined GB range.) | |
146 | * | |
147 | * Source: RFC 1842 | |
46f4442e A |
148 | * |
149 | * Note that the formal syntax in RFC 1842 is invalid. I assume that the | |
150 | * intended definition of single-byte-segment is as follows (pedberg): | |
151 | * single-byte-segment = single-byte-seq 1*single-byte-char | |
b75a7d8f A |
152 | */ |
153 | ||
154 | ||
/*
 * Converts HZ bytes from args->source to UChars in args->target, optionally
 * recording source offsets in args->offsets.
 *
 * State carried across calls:
 *  - args->converter->mode == UCNV_TILDE: a '~' was read and its second byte is pending.
 *  - args->converter->toUnicodeStatus != 0: a double-byte lead byte is pending
 *    (stored as byte | 0x100).
 *  - myData->isStateDBCS: currently between "~{" and "~}".
 *  - myData->isEmptySegment: the last thing seen was "~{" or "~}".
 *
 * On an error, *err is set, the offending bytes are stored in
 * converter->toUBytes/toULength, and args->source/args->target are updated
 * before returning. U_BUFFER_OVERFLOW_ERROR is set when input remains but the
 * target is full.
 */
static void U_CALLCONV
UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                      UErrorCode* err){
    char tempBuf[2];
    const char *mySource = ( char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    UChar32 targetUniChar = 0x0000;
    int32_t mySourceChar = 0x0000;
    UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);
    tempBuf[0]=0;
    tempBuf[1]=0;

    /* Calling code already handles this situation. */
    /*if ((args->converter == NULL) || (args->targetLimit < args->target) || (mySourceLimit < args->source)){
        *err = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }*/

    while(mySource< mySourceLimit){

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            if(args->converter->mode == UCNV_TILDE) {
                /* second byte after ~ */
                args->converter->mode=0;
                switch(mySourceChar) {
                case 0x0A:
                    /* no output for ~\n (line-continuation marker) */
                    continue;
                case UCNV_TILDE:
                    /* "~~": output one '~'; its offset is that of the first '~' */
                    if(args->offsets) {
                        args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2);
                    }
                    *(myTarget++)=(UChar)mySourceChar;
                    myData->isEmptySegment = FALSE;
                    continue;
                case UCNV_OPEN_BRACE:
                case UCNV_CLOSE_BRACE:
                    /* "~{" enters double-byte state, "~}" leaves it */
                    myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE);
                    if (myData->isEmptySegment) {
                        /* two mode switches in a row with nothing in between: report the escape as irregular */
                        myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        args->converter->toUBytes[0] = UCNV_TILDE;
                        args->converter->toUBytes[1] = mySourceChar;
                        args->converter->toULength = 2;
                        args->target = myTarget;
                        args->source = mySource;
                        return;
                    }
                    myData->isEmptySegment = TRUE;
                    continue;
                default:
                    /* if the first byte is equal to TILDE and the trail byte
                     * is not a valid byte then it is an error condition
                     */
                    /*
                     * Ticket 5691: consistent illegal sequences:
                     * - We include at least the first byte in the illegal sequence.
                     * - If any of the non-initial bytes could be the start of a character,
                     *   we stop the illegal sequence before the first one of those.
                     */
                    myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    args->converter->toUBytes[0] = UCNV_TILDE;
                    if( myData->isStateDBCS ?
                            (0x21 <= mySourceChar && mySourceChar <= 0x7e) :
                            mySourceChar <= 0x7f
                    ) {
                        /* The current byte could be the start of a character: Back it out. */
                        args->converter->toULength = 1;
                        --mySource;
                    } else {
                        /* Include the current byte in the illegal sequence. */
                        args->converter->toUBytes[1] = mySourceChar;
                        args->converter->toULength = 2;
                    }
                    args->target = myTarget;
                    args->source = mySource;
                    return;
                }
            } else if(myData->isStateDBCS) {
                if(args->converter->toUnicodeStatus == 0x00){
                    /* lead byte */
                    if(mySourceChar == UCNV_TILDE) {
                        args->converter->mode = UCNV_TILDE;
                    } else {
                        /* add another bit to distinguish a 0 byte from not having seen a lead byte */
                        args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100);
                        myData->isEmptySegment = FALSE; /* the segment has something, either valid or will produce a different error, so reset this */
                    }
                    continue;
                }
                else{
                    /* trail byte */
                    int leadIsOk, trailIsOk;
                    uint32_t leadByte = args->converter->toUnicodeStatus & 0xff;
                    targetUniChar = 0xffff;
                    /*
                     * Ticket 5691: consistent illegal sequences:
                     * - We include at least the first byte in the illegal sequence.
                     * - If any of the non-initial bytes could be the start of a character,
                     *   we stop the illegal sequence before the first one of those.
                     *
                     * In HZ DBCS, if the second byte is in the 21..7e range,
                     * we report only the first byte as the illegal sequence.
                     * Otherwise we convert or report the pair of bytes.
                     */
                    leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21);
                    trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                    if (leadIsOk && trailIsOk) {
                        /* add 0x80 to both bytes before the lookup in the GBK sub-converter's table */
                        tempBuf[0] = (char) (leadByte+0x80) ;
                        tempBuf[1] = (char) (mySourceChar+0x80);
                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
                            tempBuf, 2, args->converter->useFallback);
                        mySourceChar= (leadByte << 8) | mySourceChar;
                    } else if (trailIsOk) {
                        /* report a single illegal byte and continue with the following DBCS starter byte */
                        --mySource;
                        mySourceChar = (int32_t)leadByte;
                    } else {
                        /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                        /* add another bit so that the code below writes 2 bytes in case of error */
                        mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
                    }
                    args->converter->toUnicodeStatus =0x00;
                }
            }
            else{
                /* single-byte (ASCII) state */
                if(mySourceChar == UCNV_TILDE) {
                    args->converter->mode = UCNV_TILDE;
                    continue;
                } else if(mySourceChar <= 0x7f) {
                    targetUniChar = (UChar)mySourceChar;  /* ASCII */
                    myData->isEmptySegment = FALSE; /* the segment has something valid */
                } else {
                    targetUniChar = 0xffff;
                    myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
                }
            }
            if(targetUniChar < 0xfffe){
                /* a real character: record the offset of its first byte and write it */
                if(args->offsets) {
                    args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 1-(myData->isStateDBCS));
                }

                *(myTarget++)=(UChar)targetUniChar;
            }
            else /* targetUniChar>=0xfffe */ {
                if(targetUniChar == 0xfffe){
                    *err = U_INVALID_CHAR_FOUND;
                }
                else{
                    *err = U_ILLEGAL_CHAR_FOUND;
                }
                /* mySourceChar > 0xff means that two bytes are to be reported */
                if(mySourceChar > 0xff){
                    args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8);
                    args->converter->toUBytes[1] = (uint8_t)mySourceChar;
                    args->converter->toULength=2;
                }
                else{
                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                    args->converter->toULength=1;
                }
                break;
            }
        }
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /* hand the advanced positions back to the caller */
    args->target = myTarget;
    args->source = mySource;
}
333 | ||
334 | ||
f3c0d7a5 | 335 | static void U_CALLCONV |
b75a7d8f A |
336 | UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, |
337 | UErrorCode * err){ | |
338 | const UChar *mySource = args->source; | |
374ca955 | 339 | char *myTarget = args->target; |
b75a7d8f A |
340 | int32_t* offsets = args->offsets; |
341 | int32_t mySourceIndex = 0; | |
342 | int32_t myTargetIndex = 0; | |
374ca955 | 343 | int32_t targetLength = (int32_t)(args->targetLimit - myTarget); |
b75a7d8f | 344 | int32_t mySourceLength = (int32_t)(args->sourceLimit - args->source); |
b75a7d8f | 345 | uint32_t targetUniChar = 0x0000; |
73c04bcf | 346 | UChar32 mySourceChar = 0x0000; |
b75a7d8f A |
347 | UConverterDataHZ *myConverterData=(UConverterDataHZ*)args->converter->extraInfo; |
348 | UBool isTargetUCharDBCS = (UBool) myConverterData->isTargetUCharDBCS; | |
b331163b | 349 | UBool oldIsTargetUCharDBCS; |
b75a7d8f A |
350 | int len =0; |
351 | const char* escSeq=NULL; | |
352 | ||
46f4442e A |
353 | /* Calling code already handles this situation. */ |
354 | /*if ((args->converter == NULL) || (args->targetLimit < myTarget) || (args->sourceLimit < args->source)){ | |
b75a7d8f A |
355 | *err = U_ILLEGAL_ARGUMENT_ERROR; |
356 | return; | |
46f4442e | 357 | }*/ |
374ca955 | 358 | if(args->converter->fromUChar32!=0 && myTargetIndex < targetLength) { |
b75a7d8f A |
359 | goto getTrail; |
360 | } | |
361 | /*writing the char to the output stream */ | |
362 | while (mySourceIndex < mySourceLength){ | |
363 | targetUniChar = missingCharMarker; | |
364 | if (myTargetIndex < targetLength){ | |
365 | ||
73c04bcf | 366 | mySourceChar = (UChar) mySource[mySourceIndex++]; |
b75a7d8f A |
367 | |
368 | ||
369 | oldIsTargetUCharDBCS = isTargetUCharDBCS; | |
370 | if(mySourceChar ==UCNV_TILDE){ | |
371 | /*concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex);*/ | |
372 | len = ESC_LEN; | |
373 | escSeq = TILDE_ESCAPE; | |
374 | CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex); | |
375 | continue; | |
46f4442e | 376 | } else if(mySourceChar <= 0x7f) { |
46f4442e A |
377 | targetUniChar = mySourceChar; |
378 | } else { | |
b331163b | 379 | int32_t length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData, |
b75a7d8f | 380 | mySourceChar,&targetUniChar,args->converter->useFallback); |
46f4442e A |
381 | /* we can only use lead bytes 21..7D and trail bytes 21..7E */ |
382 | if( length == 2 && | |
383 | (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) && | |
384 | (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1) | |
385 | ) { | |
386 | targetUniChar -= 0x8080; | |
387 | } else { | |
388 | targetUniChar = missingCharMarker; | |
389 | } | |
b75a7d8f A |
390 | } |
391 | if (targetUniChar != missingCharMarker){ | |
392 | myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF); | |
393 | if(oldIsTargetUCharDBCS != isTargetUCharDBCS || !myConverterData->isEscapeAppended ){ | |
394 | /*Shifting from a double byte to single byte mode*/ | |
395 | if(!isTargetUCharDBCS){ | |
396 | len =ESC_LEN; | |
397 | escSeq = SB_ESCAPE; | |
398 | CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex); | |
73c04bcf | 399 | myConverterData->isEscapeAppended = TRUE; |
b75a7d8f A |
400 | } |
401 | else{ /* Shifting from a single byte to double byte mode*/ | |
402 | len =ESC_LEN; | |
403 | escSeq = DB_ESCAPE; | |
404 | CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex); | |
73c04bcf | 405 | myConverterData->isEscapeAppended = TRUE; |
b75a7d8f A |
406 | |
407 | } | |
408 | } | |
409 | ||
410 | if(isTargetUCharDBCS){ | |
411 | if( myTargetIndex <targetLength){ | |
46f4442e | 412 | myTarget[myTargetIndex++] =(char) (targetUniChar >> 8); |
b75a7d8f A |
413 | if(offsets){ |
414 | *(offsets++) = mySourceIndex-1; | |
415 | } | |
416 | if(myTargetIndex < targetLength){ | |
46f4442e | 417 | myTarget[myTargetIndex++] =(char) targetUniChar; |
b75a7d8f A |
418 | if(offsets){ |
419 | *(offsets++) = mySourceIndex-1; | |
420 | } | |
421 | }else{ | |
46f4442e | 422 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar; |
b75a7d8f A |
423 | *err = U_BUFFER_OVERFLOW_ERROR; |
424 | } | |
425 | }else{ | |
46f4442e A |
426 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) (targetUniChar >> 8); |
427 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar; | |
b75a7d8f A |
428 | *err = U_BUFFER_OVERFLOW_ERROR; |
429 | } | |
430 | ||
431 | }else{ | |
432 | if( myTargetIndex <targetLength){ | |
374ca955 | 433 | myTarget[myTargetIndex++] = (char) (targetUniChar ); |
b75a7d8f A |
434 | if(offsets){ |
435 | *(offsets++) = mySourceIndex-1; | |
436 | } | |
437 | ||
438 | }else{ | |
439 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar; | |
440 | *err = U_BUFFER_OVERFLOW_ERROR; | |
441 | } | |
442 | } | |
443 | ||
444 | } | |
445 | else{ | |
374ca955 | 446 | /* oops.. the code point is unassigned */ |
b75a7d8f A |
447 | /*Handle surrogates */ |
448 | /*check if the char is a First surrogate*/ | |
4388f060 A |
449 | if(U16_IS_SURROGATE(mySourceChar)) { |
450 | if(U16_IS_SURROGATE_LEAD(mySourceChar)) { | |
374ca955 | 451 | args->converter->fromUChar32=mySourceChar; |
b75a7d8f A |
452 | getTrail: |
453 | /*look ahead to find the trail surrogate*/ | |
454 | if(mySourceIndex < mySourceLength) { | |
455 | /* test the following code unit */ | |
456 | UChar trail=(UChar) args->source[mySourceIndex]; | |
4388f060 | 457 | if(U16_IS_TRAIL(trail)) { |
b75a7d8f | 458 | ++mySourceIndex; |
4388f060 | 459 | mySourceChar=U16_GET_SUPPLEMENTARY(args->converter->fromUChar32, trail); |
374ca955 | 460 | args->converter->fromUChar32=0x00; |
b75a7d8f A |
461 | /* there are no surrogates in GB2312*/ |
462 | *err = U_INVALID_CHAR_FOUND; | |
b75a7d8f A |
463 | /* exit this condition tree */ |
464 | } else { | |
465 | /* this is an unmatched lead code unit (1st surrogate) */ | |
466 | /* callback(illegal) */ | |
b75a7d8f A |
467 | *err=U_ILLEGAL_CHAR_FOUND; |
468 | } | |
469 | } else { | |
470 | /* no more input */ | |
471 | *err = U_ZERO_ERROR; | |
b75a7d8f A |
472 | } |
473 | } else { | |
474 | /* this is an unmatched trail code unit (2nd surrogate) */ | |
475 | /* callback(illegal) */ | |
b75a7d8f A |
476 | *err=U_ILLEGAL_CHAR_FOUND; |
477 | } | |
374ca955 A |
478 | } else { |
479 | /* callback(unassigned) for a BMP code point */ | |
480 | *err = U_INVALID_CHAR_FOUND; | |
b75a7d8f A |
481 | } |
482 | ||
374ca955 A |
483 | args->converter->fromUChar32=mySourceChar; |
484 | break; | |
b75a7d8f A |
485 | } |
486 | } | |
487 | else{ | |
488 | *err = U_BUFFER_OVERFLOW_ERROR; | |
489 | break; | |
490 | } | |
491 | targetUniChar=missingCharMarker; | |
492 | } | |
b75a7d8f A |
493 | |
494 | args->target += myTargetIndex; | |
495 | args->source += mySourceIndex; | |
496 | myConverterData->isTargetUCharDBCS = isTargetUCharDBCS; | |
497 | } | |
498 | ||
f3c0d7a5 | 499 | static void U_CALLCONV |
b75a7d8f A |
500 | _HZ_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) { |
501 | UConverter *cnv = args->converter; | |
502 | UConverterDataHZ *convData=(UConverterDataHZ *) cnv->extraInfo; | |
503 | char *p; | |
504 | char buffer[4]; | |
505 | p = buffer; | |
506 | ||
507 | if( convData->isTargetUCharDBCS){ | |
508 | *p++= UCNV_TILDE; | |
509 | *p++= UCNV_CLOSE_BRACE; | |
510 | convData->isTargetUCharDBCS=FALSE; | |
511 | } | |
73c04bcf | 512 | *p++= (char)cnv->subChars[0]; |
b75a7d8f A |
513 | |
514 | ucnv_cbFromUWriteBytes(args, | |
515 | buffer, (int32_t)(p - buffer), | |
516 | offsetIndex, err); | |
517 | } | |
518 | ||
73c04bcf A |
519 | /* |
520 | * Structure for cloning an HZ converter into a single memory block. | |
521 | * ucnv_safeClone() of the HZ converter will align the entire cloneHZStruct, | |
522 | * and then ucnv_safeClone() of the sub-converter may additionally align | |
523 | * subCnv inside the cloneHZStruct, for which we need the deadSpace after | |
524 | * subCnv. This is because UAlignedMemory may be larger than the actually | |
525 | * necessary alignment size for the platform. | |
526 | * The other cloneHZStruct fields will not be moved around, | |
527 | * and are aligned properly with cloneHZStruct's alignment. | |
528 | */ | |
/* Memory layout that _HZ_SafeClone() imposes on the caller-provided clone buffer. */
struct cloneHZStruct
{
    UConverter cnv;            /* the cloned HZ converter itself; its address is what _HZ_SafeClone() returns */
    UConverter subCnv;         /* storage handed to ucnv_safeClone() for the cloned GBK sub-converter */
    UAlignedMemory deadSpace;  /* padding after subCnv; its size is included in the size passed for the sub-converter clone */
    UConverterDataHZ mydata;   /* copy of the HZ state; the clone's extraInfo points here */
};
536 | ||
537 | ||
f3c0d7a5 | 538 | static UConverter * U_CALLCONV |
b75a7d8f A |
539 | _HZ_SafeClone(const UConverter *cnv, |
540 | void *stackBuffer, | |
541 | int32_t *pBufferSize, | |
542 | UErrorCode *status) | |
543 | { | |
374ca955 A |
544 | struct cloneHZStruct * localClone; |
545 | int32_t size, bufferSizeNeeded = sizeof(struct cloneHZStruct); | |
b75a7d8f A |
546 | |
547 | if (U_FAILURE(*status)){ | |
548 | return 0; | |
549 | } | |
550 | ||
551 | if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */ | |
552 | *pBufferSize = bufferSizeNeeded; | |
553 | return 0; | |
554 | } | |
555 | ||
374ca955 | 556 | localClone = (struct cloneHZStruct *)stackBuffer; |
73c04bcf | 557 | /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ |
b75a7d8f A |
558 | |
559 | uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataHZ)); | |
560 | localClone->cnv.extraInfo = &localClone->mydata; | |
561 | localClone->cnv.isExtraLocal = TRUE; | |
562 | ||
563 | /* deep-clone the sub-converter */ | |
73c04bcf | 564 | size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */ |
b75a7d8f A |
565 | ((UConverterDataHZ*)localClone->cnv.extraInfo)->gbConverter = |
566 | ucnv_safeClone(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, &localClone->subCnv, &size, status); | |
567 | ||
568 | return &localClone->cnv; | |
569 | } | |
570 | ||
f3c0d7a5 | 571 | static void U_CALLCONV |
b75a7d8f | 572 | _HZ_GetUnicodeSet(const UConverter *cnv, |
73c04bcf | 573 | const USetAdder *sa, |
b75a7d8f A |
574 | UConverterUnicodeSet which, |
575 | UErrorCode *pErrorCode) { | |
46f4442e A |
576 | /* HZ converts all of ASCII */ |
577 | sa->addRange(sa->set, 0, 0x7f); | |
b75a7d8f A |
578 | |
579 | /* add all of the code points that the sub-converter handles */ | |
46f4442e A |
580 | ucnv_MBCSGetFilteredUnicodeSetForUnicode( |
581 | ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, | |
582 | sa, which, UCNV_SET_FILTER_HZ, | |
583 | pErrorCode); | |
b75a7d8f | 584 | } |
f3c0d7a5 | 585 | U_CDECL_END |
b75a7d8f A |
/*
 * Function table for the HZ converter.
 * NOTE(review): UConverterImpl is declared elsewhere; the slot descriptions
 * below are inferred from the names of the functions placed in them and
 * should be confirmed against that declaration.
 */
static const UConverterImpl _HZImpl={

    UCNV_HZ,    /* converter type */

    NULL,
    NULL,

    _HZOpen,
    _HZClose,
    _HZReset,

    /* the same function is registered twice for each conversion direction */
    UConverter_toUnicode_HZ_OFFSETS_LOGIC,
    UConverter_toUnicode_HZ_OFFSETS_LOGIC,
    UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
    UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
    NULL,

    NULL,
    NULL,
    _HZ_WriteSub,       /* writes the substitution byte, see _HZ_WriteSub() */
    _HZ_SafeClone,      /* clones converter and sub-converter into one buffer */
    _HZ_GetUnicodeSet,  /* reports the supported code points */
    NULL,
    NULL
};
611 | ||
/*
 * Static description of the HZ converter.
 * NOTE(review): UConverterStaticData is declared elsewhere; apart from the
 * self-explanatory entries, the meaning of the positional values below should
 * be confirmed against that declaration.
 */
static const UConverterStaticData _HZStaticData={
    sizeof(UConverterStaticData),   /* size of this structure */
    "HZ",                           /* converter name */
    0,
    UCNV_IBM,
    UCNV_HZ,                        /* same type constant as in _HZImpl */
    1,
    4,
    { 0x1a, 0, 0, 0 },  /* presumably the substitution bytes; _HZ_WriteSub() writes subChars[0] - TODO confirm */
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */

};
2ca993e8 A |
629 | |
/* Shared-data object for HZ (external linkage), built from the static data and the function table above. */
const UConverterSharedData _HZData=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_HZStaticData, &_HZImpl);
b75a7d8f | 632 | |
b331163b | 633 | #endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION */ |