]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucnvhz.c
ICU-6.2.22.tar.gz
[apple/icu.git] / icuSources / common / ucnvhz.c
1 /*
2 **********************************************************************
3 * Copyright (C) 2000-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnvhz.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2000oct16
12 * created by: Ram Viswanadha
13 * 10/31/2000 Ram Implemented offsets logic function
14 *
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
20
21 #include "cmemory.h"
22 #include "unicode/ucnv.h"
23 #include "unicode/ucnv_cb.h"
24 #include "unicode/uset.h"
25 #include "ucnv_bld.h"
26 #include "ucnv_cnv.h"
27
/*
 * Byte values and two-byte escape sequences used by the HZ encoding.
 * Values are spelled in hex rather than as character literals so they do
 * not depend on the compiler's execution character set.
 */
#define UCNV_TILDE          0x7E    /* ~ : introduces every HZ escape      */
#define UCNV_OPEN_BRACE     0x7B    /* { : second byte of the "~{" escape  */
#define UCNV_CLOSE_BRACE    0x7D    /* } : second byte of the "~}" escape  */

#define SB_ESCAPE           "\x7E\x7D"  /* "~}" : switch to single-byte mode */
#define DB_ESCAPE           "\x7E\x7B"  /* "~{" : switch to double-byte mode */
#define TILDE_ESCAPE        "\x7E\x7E"  /* "~~" : a literal tilde            */

/* Number of bytes in each of the escape sequences above. */
#define ESC_LEN             2
35
36
/*
 * Appends `len` bytes of `strToAppend` to the output described by `args`.
 *
 * Bytes that fit (targetIndex < targetLength) are stored in args->target and
 * targetIndex is advanced; when args->offsets is non-NULL the value
 * sourceIndex-1 is recorded for each such byte.  Bytes that do not fit are
 * appended to args->converter->charErrorBuffer and *err is set to
 * U_BUFFER_OVERFLOW_ERROR.
 *
 * Side effects on the arguments: `len` is counted down (it ends at -1),
 * `strToAppend` and `targetIndex` are advanced.  All three must therefore be
 * modifiable lvalues.
 *
 * NOTE: the macro also advances a variable named `offsets` that must be in
 * scope at the point of use; it is not one of the macro parameters.
 *
 * The body is wrapped in do { } while(0) so that an invocation followed by
 * ';' behaves as a single statement (safe inside an unbraced if/else).
 */
#define CONCAT_ESCAPE_MACRO( args, targetIndex,targetLength,strToAppend, err, len,sourceIndex) \
    do { \
        while((len)-- > 0){ \
            if((targetIndex) < (targetLength)){ \
                (args)->target[(targetIndex)] = (unsigned char) *(strToAppend); \
                if((args)->offsets!=NULL){ \
                    *(offsets++) = (sourceIndex)-1; \
                } \
                (targetIndex)++; \
            } \
            else{ \
                (args)->converter->charErrorBuffer[(int)(args)->converter->charErrorBufferLength++] = (unsigned char) *(strToAppend); \
                *(err) =U_BUFFER_OVERFLOW_ERROR; \
            } \
            (strToAppend)++; \
        } \
    } while(0)
53
54
55 typedef struct{
56 int32_t targetIndex;
57 int32_t sourceIndex;
58 UBool isEscapeAppended;
59 UConverter* gbConverter;
60 UBool isStateDBCS;
61 UBool isTargetUCharDBCS;
62 }UConverterDataHZ;
63
64
65
66 static void
67 _HZOpen(UConverter *cnv, const char *name,const char *locale,uint32_t options, UErrorCode *errorCode){
68 cnv->toUnicodeStatus = 0;
69 cnv->fromUnicodeStatus= 0;
70 cnv->mode=0;
71 cnv->fromUChar32=0x0000;
72 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataHZ));
73 if(cnv->extraInfo != NULL){
74 ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode);
75 ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
76 ((UConverterDataHZ*)cnv->extraInfo)->isEscapeAppended = FALSE;
77 ((UConverterDataHZ*)cnv->extraInfo)->targetIndex = 0;
78 ((UConverterDataHZ*)cnv->extraInfo)->sourceIndex = 0;
79 ((UConverterDataHZ*)cnv->extraInfo)->isTargetUCharDBCS = FALSE;
80 }
81 /* test for NULL */
82 else {
83 *errorCode = U_MEMORY_ALLOCATION_ERROR;
84 return;
85 }
86 }
87
88 static void
89 _HZClose(UConverter *cnv){
90 if(cnv->extraInfo != NULL) {
91 ucnv_close (((UConverterDataHZ *) (cnv->extraInfo))->gbConverter);
92 if(!cnv->isExtraLocal) {
93 uprv_free(cnv->extraInfo);
94 }
95 cnv->extraInfo = NULL;
96 }
97 }
98
99 static void
100 _HZReset(UConverter *cnv, UConverterResetChoice choice){
101 if(choice<=UCNV_RESET_TO_UNICODE) {
102 cnv->toUnicodeStatus = 0;
103 cnv->mode=0;
104 if(cnv->extraInfo != NULL){
105 ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
106 }
107 }
108 if(choice!=UCNV_RESET_TO_UNICODE) {
109 cnv->fromUnicodeStatus= 0;
110 cnv->fromUChar32=0x0000;
111 if(cnv->extraInfo != NULL){
112 ((UConverterDataHZ*)cnv->extraInfo)->isEscapeAppended = FALSE;
113 ((UConverterDataHZ*)cnv->extraInfo)->targetIndex = 0;
114 ((UConverterDataHZ*)cnv->extraInfo)->sourceIndex = 0;
115 ((UConverterDataHZ*)cnv->extraInfo)->isTargetUCharDBCS = FALSE;
116 }
117 }
118 }
119
120 /**************************************HZ Encoding*************************************************
121 * Rules for HZ encoding
122 *
123 * In ASCII mode, a byte is interpreted as an ASCII character, unless a
124 * '~' is encountered. The character '~' is an escape character. By
125 * convention, it must be immediately followed ONLY by '~', '{' or '\n'
126 * (<LF>), with the following special meaning.
127
128 * 1. The escape sequence '~~' is interpreted as a '~'.
129 * 2. The escape-to-GB sequence '~{' switches the mode from ASCII to GB.
130 * 3. The escape sequence '~\n' is a line-continuation marker to be
131 * consumed with no output produced.
132 * In GB mode, characters are interpreted two bytes at a time as (pure)
133 * GB codes until the escape-from-GB code '~}' is read. This code
134 * switches the mode from GB back to ASCII. (Note that the escape-
135 * from-GB code '~}' ($7E7D) is outside the defined GB range.)
136 *
137 * Source: RFC 1842
138 */
139
140
141 static void
142 UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
143 UErrorCode* err){
144 char tempBuf[2];
145 const char *mySource = ( char *) args->source;
146 UChar *myTarget = args->target;
147 const char *mySourceLimit = args->sourceLimit;
148 UChar32 targetUniChar = 0x0000;
149 UChar mySourceChar = 0x0000;
150 UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);
151
152 if ((args->converter == NULL) || (args->targetLimit < args->target) || (mySourceLimit < args->source)){
153 *err = U_ILLEGAL_ARGUMENT_ERROR;
154 return;
155 }
156
157 while(mySource< mySourceLimit){
158
159 if(myTarget < args->targetLimit){
160
161 mySourceChar= (unsigned char) *mySource++;
162
163 switch(mySourceChar){
164 case 0x0A:
165 if(args->converter->mode ==UCNV_TILDE){
166 args->converter->mode=0;
167
168 }
169 *(myTarget++)=(UChar)mySourceChar;
170 continue;
171
172 case UCNV_TILDE:
173 if(args->converter->mode ==UCNV_TILDE){
174 *(myTarget++)=(UChar)mySourceChar;
175 args->converter->mode=0;
176 continue;
177
178 }
179 else if(args->converter->toUnicodeStatus !=0){
180 args->converter->mode=0;
181 break;
182 }
183 else{
184 args->converter->mode = UCNV_TILDE;
185 continue;
186 }
187
188
189 case UCNV_OPEN_BRACE:
190 if(args->converter->mode == UCNV_TILDE){
191 args->converter->mode=0;
192 myData->isStateDBCS = TRUE;
193 continue;
194 }
195 else{
196 break;
197 }
198
199
200 case UCNV_CLOSE_BRACE:
201 if(args->converter->mode == UCNV_TILDE){
202 args->converter->mode=0;
203 myData->isStateDBCS = FALSE;
204 continue;
205 }
206 else{
207 break;
208 }
209
210 default:
211 /* if the first byte is equal to TILDE and the trail byte
212 * is not a valid byte then it is an error condition
213 */
214 if(args->converter->mode == UCNV_TILDE){
215 args->converter->mode=0;
216 mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
217 goto SAVE_STATE;
218 }
219
220 break;
221
222 }
223
224 if(myData->isStateDBCS){
225 if(args->converter->toUnicodeStatus == 0x00){
226 args->converter->toUnicodeStatus = (UChar) mySourceChar;
227 continue;
228 }
229 else{
230 tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ;
231 tempBuf[1] = (char) (mySourceChar+0x80);
232 mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
233 args->converter->toUnicodeStatus =0x00;
234 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
235 tempBuf, 2, args->converter->useFallback);
236 }
237 }
238 else{
239 if(args->converter->fromUnicodeStatus == 0x00){
240 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
241 mySource - 1, 1, args->converter->useFallback);
242 }
243 else{
244 goto SAVE_STATE;
245 }
246
247 }
248 if(targetUniChar < 0xfffe){
249 if(args->offsets) {
250 args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 1-(myData->isStateDBCS));
251 }
252
253 *(myTarget++)=(UChar)targetUniChar;
254 }
255 else if(targetUniChar>=0xfffe){
256 SAVE_STATE:
257 if(targetUniChar == 0xfffe){
258 *err = U_INVALID_CHAR_FOUND;
259 }
260 else{
261 *err = U_ILLEGAL_CHAR_FOUND;
262 }
263 if(myData->isStateDBCS){
264 args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80);
265 args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80);
266 args->converter->toULength=2;
267 }
268 else{
269 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
270 args->converter->toULength=1;
271 }
272 break;
273 }
274 }
275 else{
276 *err =U_BUFFER_OVERFLOW_ERROR;
277 break;
278 }
279 }
280
281 args->target = myTarget;
282 args->source = mySource;
283 }
284
285
286 static void
287 UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
288 UErrorCode * err){
289 const UChar *mySource = args->source;
290 char *myTarget = args->target;
291 int32_t* offsets = args->offsets;
292 int32_t mySourceIndex = 0;
293 int32_t myTargetIndex = 0;
294 int32_t targetLength = (int32_t)(args->targetLimit - myTarget);
295 int32_t mySourceLength = (int32_t)(args->sourceLimit - args->source);
296 int32_t length=0;
297 uint32_t targetUniChar = 0x0000;
298 UChar32 mySourceChar = 0x0000,c=0x0000;
299 UConverterDataHZ *myConverterData=(UConverterDataHZ*)args->converter->extraInfo;
300 UBool isTargetUCharDBCS = (UBool) myConverterData->isTargetUCharDBCS;
301 UBool oldIsTargetUCharDBCS = isTargetUCharDBCS;
302 UBool isEscapeAppended =FALSE;
303 int len =0;
304 const char* escSeq=NULL;
305
306 if ((args->converter == NULL) || (args->targetLimit < myTarget) || (args->sourceLimit < args->source)){
307 *err = U_ILLEGAL_ARGUMENT_ERROR;
308 return;
309 }
310 if(args->converter->fromUChar32!=0 && myTargetIndex < targetLength) {
311 goto getTrail;
312 }
313 /*writing the char to the output stream */
314 while (mySourceIndex < mySourceLength){
315 targetUniChar = missingCharMarker;
316 if (myTargetIndex < targetLength){
317
318 c=mySourceChar = (UChar) mySource[mySourceIndex++];
319
320
321 oldIsTargetUCharDBCS = isTargetUCharDBCS;
322 if(mySourceChar ==UCNV_TILDE){
323 /*concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex);*/
324 len = ESC_LEN;
325 escSeq = TILDE_ESCAPE;
326 CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
327 continue;
328 }
329 else{
330 length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData,
331 mySourceChar,&targetUniChar,args->converter->useFallback);
332
333 }
334 /* only DBCS or SBCS characters are expected*/
335 /* DB haracters with high bit set to 1 are expected */
336 if(length > 2 || length==0 ||(((targetUniChar & 0x8080) != 0x8080)&& length==2)){
337 targetUniChar= missingCharMarker;
338 }
339 if (targetUniChar != missingCharMarker){
340 myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF);
341 if(oldIsTargetUCharDBCS != isTargetUCharDBCS || !myConverterData->isEscapeAppended ){
342 /*Shifting from a double byte to single byte mode*/
343 if(!isTargetUCharDBCS){
344 len =ESC_LEN;
345 escSeq = SB_ESCAPE;
346 CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
347 myConverterData->isEscapeAppended =isEscapeAppended =TRUE;
348 }
349 else{ /* Shifting from a single byte to double byte mode*/
350 len =ESC_LEN;
351 escSeq = DB_ESCAPE;
352 CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
353 myConverterData->isEscapeAppended =isEscapeAppended =TRUE;
354
355 }
356 }
357
358 if(isTargetUCharDBCS){
359 if( myTargetIndex <targetLength){
360 myTarget[myTargetIndex++] =(char) ((targetUniChar >> 8) -0x80);
361 if(offsets){
362 *(offsets++) = mySourceIndex-1;
363 }
364 if(myTargetIndex < targetLength){
365 myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80);
366 if(offsets){
367 *(offsets++) = mySourceIndex-1;
368 }
369 }else{
370 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
371 *err = U_BUFFER_OVERFLOW_ERROR;
372 }
373 }else{
374 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80);
375 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
376 *err = U_BUFFER_OVERFLOW_ERROR;
377 }
378
379 }else{
380 if( myTargetIndex <targetLength){
381 myTarget[myTargetIndex++] = (char) (targetUniChar );
382 if(offsets){
383 *(offsets++) = mySourceIndex-1;
384 }
385
386 }else{
387 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
388 *err = U_BUFFER_OVERFLOW_ERROR;
389 }
390 }
391
392 }
393 else{
394 /* oops.. the code point is unassigned */
395 /*Handle surrogates */
396 /*check if the char is a First surrogate*/
397 if(UTF_IS_SURROGATE(mySourceChar)) {
398 if(UTF_IS_SURROGATE_FIRST(mySourceChar)) {
399 args->converter->fromUChar32=mySourceChar;
400 getTrail:
401 /*look ahead to find the trail surrogate*/
402 if(mySourceIndex < mySourceLength) {
403 /* test the following code unit */
404 UChar trail=(UChar) args->source[mySourceIndex];
405 if(UTF_IS_SECOND_SURROGATE(trail)) {
406 ++mySourceIndex;
407 mySourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUChar32, trail);
408 args->converter->fromUChar32=0x00;
409 /* there are no surrogates in GB2312*/
410 *err = U_INVALID_CHAR_FOUND;
411 /* exit this condition tree */
412 } else {
413 /* this is an unmatched lead code unit (1st surrogate) */
414 /* callback(illegal) */
415 *err=U_ILLEGAL_CHAR_FOUND;
416 }
417 } else {
418 /* no more input */
419 *err = U_ZERO_ERROR;
420 }
421 } else {
422 /* this is an unmatched trail code unit (2nd surrogate) */
423 /* callback(illegal) */
424 *err=U_ILLEGAL_CHAR_FOUND;
425 }
426 } else {
427 /* callback(unassigned) for a BMP code point */
428 *err = U_INVALID_CHAR_FOUND;
429 }
430
431 args->converter->fromUChar32=mySourceChar;
432 break;
433 }
434 }
435 else{
436 *err = U_BUFFER_OVERFLOW_ERROR;
437 break;
438 }
439 targetUniChar=missingCharMarker;
440 }
441
442 args->target += myTargetIndex;
443 args->source += mySourceIndex;
444 myConverterData->isTargetUCharDBCS = isTargetUCharDBCS;
445 }
446
447 static void
448 _HZ_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
449 UConverter *cnv = args->converter;
450 UConverterDataHZ *convData=(UConverterDataHZ *) cnv->extraInfo;
451 char *p;
452 char buffer[4];
453 p = buffer;
454
455 if( convData->isTargetUCharDBCS){
456 *p++= UCNV_TILDE;
457 *p++= UCNV_CLOSE_BRACE;
458 convData->isTargetUCharDBCS=FALSE;
459 }
460 *p++= cnv->subChar[0];
461
462 ucnv_cbFromUWriteBytes(args,
463 buffer, (int32_t)(p - buffer),
464 offsetIndex, err);
465 }
466
467 /* structure for SafeClone calculations */
468 struct cloneHZStruct
469 {
470 UConverter cnv;
471 UAlignedMemory deadSpace1;
472 UConverter subCnv;
473 UAlignedMemory deadSpace2;
474 UConverterDataHZ mydata;
475 };
476
477
478 static UConverter *
479 _HZ_SafeClone(const UConverter *cnv,
480 void *stackBuffer,
481 int32_t *pBufferSize,
482 UErrorCode *status)
483 {
484 struct cloneHZStruct * localClone;
485 int32_t size, bufferSizeNeeded = sizeof(struct cloneHZStruct);
486
487 if (U_FAILURE(*status)){
488 return 0;
489 }
490
491 if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
492 *pBufferSize = bufferSizeNeeded;
493 return 0;
494 }
495
496 localClone = (struct cloneHZStruct *)stackBuffer;
497 uprv_memcpy(&localClone->cnv, cnv, sizeof(UConverter));
498
499 uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataHZ));
500 localClone->cnv.extraInfo = &localClone->mydata;
501 localClone->cnv.isExtraLocal = TRUE;
502
503 /* deep-clone the sub-converter */
504 size = (int32_t)sizeof(UConverter);
505 ((UConverterDataHZ*)localClone->cnv.extraInfo)->gbConverter =
506 ucnv_safeClone(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, &localClone->subCnv, &size, status);
507
508 return &localClone->cnv;
509 }
510
511 static void
512 _HZ_GetUnicodeSet(const UConverter *cnv,
513 USetAdder *sa,
514 UConverterUnicodeSet which,
515 UErrorCode *pErrorCode) {
516 /* the tilde '~' is hardcoded in the converter */
517 sa->add(sa->set, 0x7e);
518
519 /* add all of the code points that the sub-converter handles */
520 ((UConverterDataHZ*)cnv->extraInfo)->
521 gbConverter->sharedData->impl->
522 getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,
523 sa, which, pErrorCode);
524 }
525
526 static const UConverterImpl _HZImpl={
527
528 UCNV_HZ,
529
530 NULL,
531 NULL,
532
533 _HZOpen,
534 _HZClose,
535 _HZReset,
536
537 UConverter_toUnicode_HZ_OFFSETS_LOGIC,
538 UConverter_toUnicode_HZ_OFFSETS_LOGIC,
539 UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
540 UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
541 NULL,
542
543 NULL,
544 NULL,
545 _HZ_WriteSub,
546 _HZ_SafeClone,
547 _HZ_GetUnicodeSet
548 };
549
550 static const UConverterStaticData _HZStaticData={
551 sizeof(UConverterStaticData),
552 "HZ",
553 0,
554 UCNV_IBM,
555 UCNV_HZ,
556 1,
557 4,
558 { 0x1a, 0, 0, 0 },
559 1,
560 FALSE,
561 FALSE,
562 0,
563 0,
564 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */
565
566 };
567
568
569 const UConverterSharedData _HZData={
570 sizeof(UConverterSharedData),
571 ~((uint32_t) 0),
572 NULL,
573 NULL,
574 &_HZStaticData,
575 FALSE,
576 &_HZImpl,
577 0
578 };
579
580 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */