2 **********************************************************************
3 * Copyright (C) 2000-2003, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 * tab size: 8 (not used)
11 * created on: 2000oct16
12 * created by: Ram Viswanadha
13 * 10/31/2000 Ram Implemented offsets logic function
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_LEGACY_CONVERSION
22 #include "unicode/ucnv_err.h"
23 #include "unicode/ucnv.h"
24 #include "unicode/ucnv_cb.h"
25 #include "unicode/uset.h"
// Byte values of the three characters the converter code in this file
// compares against: '~' (0x7E), '{' (0x7B) and '}' (0x7D).
29 #define UCNV_TILDE 0x7E /* ~ */
30 #define UCNV_OPEN_BRACE 0x7B /* { */
31 #define UCNV_CLOSE_BRACE 0x7D /* } */
// Two-byte escape strings: SB_ESCAPE is "~}", DB_ESCAPE is "~{" and
// TILDE_ESCAPE is "~~" (an escaped literal tilde).
32 #define SB_ESCAPE "\x7E\x7D"
33 #define DB_ESCAPE "\x7E\x7B"
34 #define TILDE_ESCAPE "\x7E\x7E"
// CONCAT_ESCAPE_MACRO(args, targetIndex, targetLength, strToAppend, err, len, sourceIndex)
// Visible behaviour: while targetIndex < targetLength the byte *strToAppend is
// stored at args->target[targetIndex] and, when args->offsets is non-NULL,
// sourceIndex-1 is recorded through the `offsets` pointer. A later visible
// line stores the byte in the converter's charErrorBuffer and sets *err to
// U_BUFFER_OVERFLOW_ERROR.
// `offsets` is not a macro parameter: it must be a variable in scope at the
// expansion site.
// NOTE(review): several lines of the macro body are absent from this listing
// (the embedded numbers jump 38->40, 43->48 and 49->59), so the loop over
// `len`, the index increments and the else-branch structure are not visible
// here - confirm against the complete source before changing anything.
38 #define CONCAT_ESCAPE_MACRO( args, targetIndex,targetLength,strToAppend, err, len,sourceIndex){ \
40 if(targetIndex < targetLength){ \
41 args->target[targetIndex] = (unsigned char) *strToAppend; \
42 if(args->offsets!=NULL){ \
43 *(offsets++) = sourceIndex-1; \
48 args->converter->charErrorBuffer[(int)args->converter->charErrorBufferLength++] = (unsigned char) *strToAppend; \
49 *err =U_BUFFER_OVERFLOW_ERROR; \
// Member declarations of the per-converter HZ state. The enclosing
// declaration header is not visible in this listing; these names match the
// fields accessed through (UConverterDataHZ *)cnv->extraInfo further down.
// Other fields used below (isStateDBCS, targetIndex, sourceIndex) are
// declared on lines missing from this listing.
//   isEscapeAppended  - set to TRUE by the from-Unicode code after it has
//                       emitted a mode escape.
//   gbConverter       - sub-converter opened with ucnv_open("ibm-1386", ...).
//   isTargetUCharDBCS - from-Unicode side: TRUE when the last mapped result
//                       was greater than 0x00FF (double-byte).
59 UBool isEscapeAppended
;
60 UConverter
* gbConverter
;
62 UBool isTargetUCharDBCS
;
// _HZOpen: initialise a converter object for HZ.
//   cnv       - converter being opened; its status fields are cleared and
//               cnv->extraInfo receives a freshly allocated UConverterDataHZ.
//   name, locale, options - not referenced by any line visible here.
//   errorCode - passed to ucnv_open for the sub-converter; also set to
//               U_MEMORY_ALLOCATION_ERROR on the last visible line.
// NOTE(review): the return type, the else/closing-brace lines and original
// line 71 are missing from this listing. The U_MEMORY_ALLOCATION_ERROR
// assignment is presumably the branch taken when uprv_malloc returns NULL -
// confirm against the full source.
68 _HZOpen(UConverter
*cnv
, const char *name
,const char *locale
,uint32_t options
, UErrorCode
*errorCode
){
// Clear both direction status words and any pending lead surrogate.
69 cnv
->toUnicodeStatus
= 0;
70 cnv
->fromUnicodeStatus
= 0;
72 cnv
->fromUSurrogateLead
=0x0000;
// Heap-allocate the HZ-specific state block.
73 cnv
->extraInfo
= uprv_malloc (sizeof (UConverterDataHZ
));
74 if(cnv
->extraInfo
!= NULL
){
// Open the "ibm-1386" sub-converter used for the double-byte lookups.
// The visible lines do not test the returned pointer; any failure is only
// reported through errorCode.
75 ((UConverterDataHZ
*)cnv
->extraInfo
)->gbConverter
= ucnv_open("ibm-1386",errorCode
);
// Start with every state flag and index cleared.
76 ((UConverterDataHZ
*)cnv
->extraInfo
)->isStateDBCS
= FALSE
;
77 ((UConverterDataHZ
*)cnv
->extraInfo
)->isEscapeAppended
= FALSE
;
78 ((UConverterDataHZ
*)cnv
->extraInfo
)->targetIndex
= 0;
79 ((UConverterDataHZ
*)cnv
->extraInfo
)->sourceIndex
= 0;
80 ((UConverterDataHZ
*)cnv
->extraInfo
)->isTargetUCharDBCS
= FALSE
;
84 *errorCode
= U_MEMORY_ALLOCATION_ERROR
;
// _HZClose: release the HZ-specific state attached to cnv.
// When cnv->extraInfo is non-NULL the sub-converter is closed with
// ucnv_close, the state block is freed with uprv_free unless
// cnv->isExtraLocal is set, and cnv->extraInfo is set to NULL.
// isExtraLocal is set to TRUE by _HZ_SafeClone when extraInfo points into the
// caller-supplied clone buffer, which must not be passed to uprv_free.
// NOTE(review): return type and closing braces are missing from this listing,
// so the exact nesting of the final NULL assignment cannot be confirmed here.
90 _HZClose(UConverter
*cnv
){
91 if(cnv
->extraInfo
!= NULL
) {
92 ucnv_close (((UConverterDataHZ
*) (cnv
->extraInfo
))->gbConverter
);
93 if(!cnv
->isExtraLocal
) {
94 uprv_free(cnv
->extraInfo
);
96 cnv
->extraInfo
= NULL
;
// _HZReset: reset converter state for one or both directions.
//   cnv    - converter to reset.
//   choice - compared numerically against UCNV_RESET_TO_UNICODE:
//            choice <= UCNV_RESET_TO_UNICODE resets the to-Unicode state,
//            choice != UCNV_RESET_TO_UNICODE resets the from-Unicode state.
//            A value below UCNV_RESET_TO_UNICODE therefore resets both
//            (presumably the "reset both" enumerator - confirm in the enum).
// NOTE(review): original line 104 and the closing braces are missing from
// this listing; only the assignments visible here are documented.
101 _HZReset(UConverter
*cnv
, UConverterResetChoice choice
){
// To-Unicode side: drop any pending byte and leave double-byte mode.
102 if(choice
<=UCNV_RESET_TO_UNICODE
) {
103 cnv
->toUnicodeStatus
= 0;
105 if(cnv
->extraInfo
!= NULL
){
106 ((UConverterDataHZ
*)cnv
->extraInfo
)->isStateDBCS
= FALSE
;
// From-Unicode side: clear status, pending surrogate and escape bookkeeping.
109 if(choice
!=UCNV_RESET_TO_UNICODE
) {
110 cnv
->fromUnicodeStatus
= 0;
111 cnv
->fromUSurrogateLead
=0x0000;
112 if(cnv
->extraInfo
!= NULL
){
113 ((UConverterDataHZ
*)cnv
->extraInfo
)->isEscapeAppended
= FALSE
;
114 ((UConverterDataHZ
*)cnv
->extraInfo
)->targetIndex
= 0;
115 ((UConverterDataHZ
*)cnv
->extraInfo
)->sourceIndex
= 0;
116 ((UConverterDataHZ
*)cnv
->extraInfo
)->isTargetUCharDBCS
= FALSE
;
121 /**************************************HZ Encoding*************************************************
122 * Rules for HZ encoding
124 * In ASCII mode, a byte is interpreted as an ASCII character, unless a
125 * '~' is encountered. The character '~' is an escape character. By
126 * convention, it must be immediately followed ONLY by '~', '{' or '\n'
127 * (<LF>), with the following special meaning.
129 * 1. The escape sequence '~~' is interpreted as a '~'.
130 * 2. The escape-to-GB sequence '~{' switches the mode from ASCII to GB.
131 * 3. The escape sequence '~\n' is a line-continuation marker to be
132 * consumed with no output produced.
133 * In GB mode, characters are interpreted two bytes at a time as (pure)
134 * GB codes until the escape-from-GB code '~}' is read. This code
135 * switches the mode from GB back to ASCII. (Note that the escape-
136 * from-GB code '~}' ($7E7D) is outside the defined GB range.)
// UConverter_toUnicode_HZ_OFFSETS_LOGIC: convert HZ bytes in
// [args->source, args->sourceLimit) to UChars written at args->target.
//
// State used across bytes and calls (all visible below):
//   args->converter->mode            - holds UCNV_TILDE after a '~' was read.
//   myData->isStateDBCS              - TRUE after "~{", FALSE after "~}".
//   args->converter->toUnicodeStatus - first byte of a pending two-byte pair.
//
// A two-byte pair has 0x80 added to each byte and is looked up in the
// sub-converter's shared data with _MBCSSimpleGetNextUChar. Results below
// 0xfffe are written to the target; 0xfffe is reported as UCNV_UNASSIGNED /
// U_INVALID_CHAR_FOUND, anything above as UCNV_ILLEGAL / U_ILLEGAL_CHAR_FOUND,
// and the converter's error callback is invoked.
// Other errors set through *err: U_ILLEGAL_ARGUMENT_ERROR (bad arguments),
// U_BUFFER_OVERFLOW_ERROR, and U_TRUNCATED_CHAR_FOUND when flushing with a
// pending first byte.
//
// NOTE(review): this listing omits many original lines (return type, the err
// parameter, declarations of tempBuf and pBuf, case labels, continue/goto
// statements, else lines and closing braces). Comments below describe only
// what the visible lines do; branch structure is hedged where it depends on
// missing lines.
143 UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
147 const char *mySource
= ( char *) args
->source
;
148 UChar
*myTarget
= args
->target
;
149 char *tempLimit
= &tempBuf
[3];
150 const char *mySourceLimit
= args
->sourceLimit
;
151 UChar32 targetUniChar
= 0x0000;
152 UChar mySourceChar
= 0x0000;
// NOTE(review): args->converter is dereferenced in this initializer before
// the args->converter == NULL test that follows, so that test cannot protect
// this access.
153 UConverterDataHZ
* myData
=(UConverterDataHZ
*)(args
->converter
->extraInfo
);
// Argument sanity check: NULL converter or inverted buffer limits.
155 if ((args
->converter
== NULL
) || (args
->targetLimit
< args
->target
) || (args
->sourceLimit
< args
->source
)){
156 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
// Main loop: one source byte per iteration while there is room in the target.
160 while(mySource
< args
->sourceLimit
){
162 if(myTarget
< args
->targetLimit
){
// Read the next byte as an unsigned value.
164 mySourceChar
= (unsigned char) *mySource
++;
166 switch(mySourceChar
){
// First case (label not visible in this listing): clears a pending tilde and
// copies the byte to the target unchanged.
168 if(args
->converter
->mode
==UCNV_TILDE
){
169 args
->converter
->mode
=0;
172 *(myTarget
++)=(UChar
)mySourceChar
;
// Second case (label not visible; presumably the tilde case): with a tilde
// already pending the byte is written out and the pending flag cleared; with
// a pending first byte in toUnicodeStatus the mode is cleared; the remaining
// visible assignment records the tilde in mode.
176 if(args
->converter
->mode
==UCNV_TILDE
){
177 *(myTarget
++)=(UChar
)mySourceChar
;
178 args
->converter
->mode
=0;
182 else if(args
->converter
->toUnicodeStatus
!=0){
183 args
->converter
->mode
=0;
187 args
->converter
->mode
= UCNV_TILDE
;
// '{' after a pending tilde: switch to double-byte state.
192 case UCNV_OPEN_BRACE
:
193 if(args
->converter
->mode
== UCNV_TILDE
){
194 args
->converter
->mode
=0;
195 myData
->isStateDBCS
= TRUE
;
// '}' after a pending tilde: switch back to single-byte state.
203 case UCNV_CLOSE_BRACE
:
204 if(args
->converter
->mode
== UCNV_TILDE
){
205 args
->converter
->mode
=0;
206 myData
->isStateDBCS
= FALSE
;
214 /* if the first byte is equal to TILDE and the trail byte
215 * is not a valid byte then it is an error condition
217 if(args
->converter
->mode
== UCNV_TILDE
){
218 args
->converter
->mode
=0;
// Combine (tilde + 0x80) as high byte with (byte + 0x80) as low byte.
219 mySourceChar
= (UChar
)(((UCNV_TILDE
+0x80) << 8) | ((mySourceChar
& 0x00ff)+0x80));
// Double-byte state: the first byte of a pair is parked in toUnicodeStatus;
// on the second byte both get 0x80 added and the pair is looked up.
227 if(myData
->isStateDBCS
){
228 if(args
->converter
->toUnicodeStatus
== 0x00){
229 args
->converter
->toUnicodeStatus
= (UChar
) mySourceChar
;
233 tempBuf
[0] = (char) (args
->converter
->toUnicodeStatus
+0x80) ;
234 tempBuf
[1] = (char) (mySourceChar
+0x80);
235 mySourceChar
= (UChar
)(((args
->converter
->toUnicodeStatus
+0x80) << 8) | ((mySourceChar
& 0x00ff)+0x80));
236 args
->converter
->toUnicodeStatus
=0x00;
238 tempLimit
= &tempBuf
[2]+1;
239 targetUniChar
= _MBCSSimpleGetNextUChar(myData
->gbConverter
->sharedData
,
240 &pBuf
,tempLimit
,args
->converter
->useFallback
);
// Single-byte path: look the one byte up through the same sub-converter.
// NOTE(review): this tests fromUnicodeStatus inside the to-Unicode function;
// every other status test here uses toUnicodeStatus - confirm the intent.
244 if(args
->converter
->fromUnicodeStatus
== 0x00){
245 tempBuf
[0] = (char) mySourceChar
;
247 tempLimit
= &tempBuf
[1];
248 targetUniChar
= _MBCSSimpleGetNextUChar(myData
->gbConverter
->sharedData
,
249 &pBuf
,tempLimit
,args
->converter
->useFallback
);
// Successful mapping: record the source offset (one byte further back when in
// double-byte state) and emit the UChar.
256 if(targetUniChar
< 0xfffe){
258 args
->offsets
[myTarget
- args
->target
]=(int32_t)(mySource
- args
->source
- 1-(myData
->isStateDBCS
));
261 *(myTarget
++)=(UChar
)targetUniChar
;
// Failed mapping: 0xfffe is treated as unassigned, larger values as illegal.
263 else if(targetUniChar
>=0xfffe){
266 const char *saveSource
= args
->source
;
267 UChar
*saveTarget
= args
->target
;
268 int32_t *saveOffsets
= args
->offsets
;
270 UConverterCallbackReason reason
;
271 int32_t currentOffset
;
272 int32_t saveIndex
= (int32_t)(myTarget
- args
->target
);
274 args
->converter
->invalidCharLength
=0;
276 if(targetUniChar
== 0xfffe){
277 reason
= UCNV_UNASSIGNED
;
278 *err
= U_INVALID_CHAR_FOUND
;
281 reason
= UCNV_ILLEGAL
;
282 *err
= U_ILLEGAL_CHAR_FOUND
;
// Store the offending bytes for the callback, with the 0x80 bias removed
// again in the double-byte case.
284 if(myData
->isStateDBCS
){
286 args
->converter
->invalidCharBuffer
[args
->converter
->invalidCharLength
++] = (char)(tempBuf
[0]-0x80);
287 args
->converter
->invalidCharBuffer
[args
->converter
->invalidCharLength
++] = (char)(tempBuf
[1]-0x80);
288 currentOffset
= (int32_t)(mySource
- args
->source
-2);
292 args
->converter
->invalidCharBuffer
[args
->converter
->invalidCharLength
++] = (char)mySourceChar
;
293 currentOffset
= (int32_t)(mySource
- args
->source
-1);
// Point args at the current positions so the callback sees and can advance
// them; myTarget temporarily holds the original target start.
295 args
->offsets
= args
->offsets
?args
->offsets
+(myTarget
- args
->target
):0;
296 args
->target
= myTarget
;
297 args
->source
= mySource
;
298 myTarget
= saveTarget
;
299 args
->converter
->fromCharErrorBehaviour (
300 args
->converter
->toUContext
,
302 args
->converter
->invalidCharBuffer
,
303 args
->converter
->invalidCharLength
,
// Shift the offsets of whatever the callback wrote by the offset of the
// failing input.
308 args
->offsets
= saveOffsets
;
310 for (;saveIndex
< (args
->target
- myTarget
);saveIndex
++) {
311 args
->offsets
[saveIndex
] += currentOffset
;
// Restore args to the caller's view and continue from the callback's
// target position.
314 args
->source
= saveSource
;
315 myTarget
= args
->target
;
316 args
->target
= saveTarget
;
317 args
->offsets
= saveOffsets
;
// Presumably the branch taken when the target is full (the else line is
// missing from this listing).
324 *err
=U_BUFFER_OVERFLOW_ERROR
;
// Flushing at end of input with a pending first byte means truncated input.
328 if((args
->flush
==TRUE
)
329 && (mySource
== mySourceLimit
)
330 && ( args
->converter
->toUnicodeStatus
!=0x00)){
331 *err
= U_TRUNCATED_CHAR_FOUND
;
332 args
->converter
->toUnicodeStatus
= 0x00;
334 /* Reset the state of converter if we consumed
335 * the source and flush is true
337 if( (mySource
== mySourceLimit
) && args
->flush
){
338 _HZReset(args
->converter
, UCNV_RESET_TO_UNICODE
);
// Publish the final positions back to the caller.
341 args
->target
= myTarget
;
342 args
->source
= mySource
;
// UConverter_fromUnicode_HZ_OFFSETS_LOGIC: convert UChars in
// [args->source, args->sourceLimit) to HZ bytes written at args->target.
//
// Per input unit (visible below):
//   - '~' is emitted as TILDE_ESCAPE through CONCAT_ESCAPE_MACRO.
//   - other units are mapped with _MBCSFromUChar32 on the sub-converter;
//     results longer than 2 bytes, of length 0, or 2-byte results without
//     both high bits (0x8080) are treated as unmappable.
//   - when the single/double-byte mode changes, or no escape has been
//     appended yet, an escape sequence is emitted first.
//   - double-byte results are written with 0x80 subtracted from each byte.
//   - bytes that do not fit go to converter->charErrorBuffer and *err becomes
//     U_BUFFER_OVERFLOW_ERROR.
//   - unmappable input sets U_INVALID_CHAR_FOUND, unpaired surrogates set
//     U_ILLEGAL_CHAR_FOUND, and the from-Unicode error callback is invoked.
//
// NOTE(review): this listing omits many original lines (return type, the err
// parameter, declarations of len, length and saveIndex, the escSeq
// assignments before two of the macro calls, labels, else lines and closing
// braces). Comments describe only what the visible lines do.
347 UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs
* args
,
349 const UChar
*mySource
= args
->source
;
350 unsigned char *myTarget
= (unsigned char *) args
->target
;
351 int32_t* offsets
= args
->offsets
;
352 int32_t mySourceIndex
= 0;
353 int32_t myTargetIndex
= 0;
354 int32_t targetLength
= (int32_t)(args
->targetLimit
- args
->target
);
355 int32_t mySourceLength
= (int32_t)(args
->sourceLimit
- args
->source
);
357 uint32_t targetUniChar
= 0x0000;
358 UChar32 mySourceChar
= 0x0000,c
=0x0000;
// NOTE(review): args->converter is dereferenced in this initializer before
// the args->converter == NULL test further down.
359 UConverterDataHZ
*myConverterData
=(UConverterDataHZ
*)args
->converter
->extraInfo
;
// Local copy of the persisted single/double-byte mode, written back on exit.
360 UBool isTargetUCharDBCS
= (UBool
) myConverterData
->isTargetUCharDBCS
;
361 UBool oldIsTargetUCharDBCS
= isTargetUCharDBCS
;
362 UConverterCallbackReason reason
;
363 UBool isEscapeAppended
=FALSE
;
365 const char* escSeq
=NULL
;
// Argument sanity check: NULL converter or inverted buffer limits.
367 if ((args
->converter
== NULL
) || (args
->targetLimit
< args
->target
) || (args
->sourceLimit
< args
->source
)){
368 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
// A lead surrogate left over from a previous call; the body of this branch
// is not visible in this listing.
371 if(args
->converter
->fromUSurrogateLead
!=0 && myTargetIndex
< targetLength
) {
374 /*writing the char to the output stream */
375 while (mySourceIndex
< mySourceLength
){
376 targetUniChar
= missingCharMarker
;
377 if (myTargetIndex
< targetLength
){
// Fetch the next UTF-16 code unit.
379 c
=mySourceChar
= (UChar
) args
->source
[mySourceIndex
++];
382 oldIsTargetUCharDBCS
= isTargetUCharDBCS
;
// A literal tilde is written as the two-byte sequence "~~".
383 if(mySourceChar
==UCNV_TILDE
){
384 /*concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex);*/
386 escSeq
= TILDE_ESCAPE
;
387 CONCAT_ESCAPE_MACRO(args
, myTargetIndex
, targetLength
, escSeq
,err
,len
,mySourceIndex
);
// Map the code unit through the sub-converter's table.
391 length
= _MBCSFromUChar32(myConverterData
->gbConverter
->sharedData
,
392 mySourceChar
,&targetUniChar
,args
->converter
->useFallback
);
395 /* only DBCS or SBCS characters are expected*/
396 /* DB characters with high bit set to 1 are expected */
397 if(length
> 2 || length
==0 ||(((targetUniChar
& 0x8080) != 0x8080)&& length
==2)){
398 targetUniChar
= missingCharMarker
;
400 if (targetUniChar
!= missingCharMarker
){
// Anything above 0x00FF is a double-byte result.
401 myConverterData
->isTargetUCharDBCS
= isTargetUCharDBCS
= (UBool
)(targetUniChar
>0x00FF);
// Emit an escape when the mode changed or none has been appended yet.
// The escSeq assignments that precede the two macro calls are on lines
// missing from this listing.
402 if(oldIsTargetUCharDBCS
!= isTargetUCharDBCS
|| !myConverterData
->isEscapeAppended
){
403 /*Shifting from a double byte to single byte mode*/
404 if(!isTargetUCharDBCS
){
407 CONCAT_ESCAPE_MACRO(args
, myTargetIndex
, targetLength
, escSeq
,err
,len
,mySourceIndex
);
408 myConverterData
->isEscapeAppended
=isEscapeAppended
=TRUE
;
410 else{ /* Shifting from a single byte to double byte mode*/
413 CONCAT_ESCAPE_MACRO(args
, myTargetIndex
, targetLength
, escSeq
,err
,len
,mySourceIndex
);
414 myConverterData
->isEscapeAppended
=isEscapeAppended
=TRUE
;
// Double-byte output: high byte then low byte, each with 0x80 subtracted.
// Bytes that do not fit are queued in charErrorBuffer.
419 if(isTargetUCharDBCS
){
420 if( myTargetIndex
<targetLength
){
421 args
->target
[myTargetIndex
++] =(char) ((targetUniChar
>> 8) -0x80);
423 *(offsets
++) = mySourceIndex
-1;
425 if(myTargetIndex
< targetLength
){
426 args
->target
[myTargetIndex
++] =(char) ((targetUniChar
& 0x00FF) -0x80);
428 *(offsets
++) = mySourceIndex
-1;
431 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (char) ((targetUniChar
& 0x00FF) -0x80);
432 *err
= U_BUFFER_OVERFLOW_ERROR
;
435 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] =(char) ((targetUniChar
>> 8) -0x80);
436 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (char) ((targetUniChar
& 0x00FF) -0x80);
437 *err
= U_BUFFER_OVERFLOW_ERROR
;
// Single-byte output.
441 if( myTargetIndex
<targetLength
){
442 args
->target
[myTargetIndex
++] = (char) (targetUniChar
);
444 *(offsets
++) = mySourceIndex
-1;
448 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (char) targetUniChar
;
449 *err
= U_BUFFER_OVERFLOW_ERROR
;
455 /* oops.. the code point is unassigned
456 * set the error and reason
458 reason
=UCNV_UNASSIGNED
;
459 *err
=U_INVALID_CHAR_FOUND
;
460 /*Handle surrogates */
461 /*check if the char is a First surrogate*/
462 if(UTF_IS_SURROGATE(mySourceChar
)) {
463 if(UTF_IS_SURROGATE_FIRST(mySourceChar
)) {
464 args
->converter
->fromUSurrogateLead
=(UChar
)mySourceChar
;
466 /*look ahead to find the trail surrogate*/
467 if(mySourceIndex
< mySourceLength
) {
468 /* test the following code unit */
469 UChar trail
=(UChar
) args
->source
[mySourceIndex
];
470 if(UTF_IS_SECOND_SURROGATE(trail
)) {
// Combine lead and trail into one supplementary code point for the callback.
472 mySourceChar
=UTF16_GET_PAIR_VALUE(args
->converter
->fromUSurrogateLead
, trail
);
473 args
->converter
->fromUSurrogateLead
=0x00;
474 /* there are no surrogates in GB2312*/
475 *err
= U_INVALID_CHAR_FOUND
;
476 reason
=UCNV_UNASSIGNED
;
477 /* exit this condition tree */
479 /* this is an unmatched lead code unit (1st surrogate) */
480 /* callback(illegal) */
482 *err
=U_ILLEGAL_CHAR_FOUND
;
490 /* this is an unmatched trail code unit (2nd surrogate) */
491 /* callback(illegal) */
493 *err
=U_ILLEGAL_CHAR_FOUND
;
// NOTE(review): *(offsets-1) reads the element before the current offsets
// position; if no offset has been written yet in this call (offsets still
// equals args->offsets) that element lies before the caller's array -
// confirm whether a guard exists on a line missing from this listing.
499 int32_t currentOffset
= (args
->offsets
) ? *(offsets
-1)+1:0;
500 char * saveTarget
= args
->target
;
501 const UChar
* saveSource
= args
->source
;
502 int32_t *saveOffsets
= args
->offsets
;
504 args
->converter
->invalidUCharLength
= 0;
// Store the offending input as UTF-16: a surrogate pair for values above
// 0xffff, otherwise the single unit.
506 if(mySourceChar
>0xffff){
507 args
->converter
->invalidUCharBuffer
[args
->converter
->invalidUCharLength
++] =(uint16_t)(((mySourceChar
)>>10)+0xd7c0);
508 args
->converter
->invalidUCharBuffer
[args
->converter
->invalidUCharLength
++] =(uint16_t)(((mySourceChar
)&0x3ff)|0xdc00);
511 args
->converter
->invalidUCharBuffer
[args
->converter
->invalidUCharLength
++] =(UChar
)mySourceChar
;
// Publish current mode and positions so the callback works on live state.
514 myConverterData
->isTargetUCharDBCS
= (UBool
)isTargetUCharDBCS
;
515 args
->target
+= myTargetIndex
;
516 args
->source
+= mySourceIndex
;
517 args
->offsets
= args
->offsets
?offsets
:0;
520 saveIndex
= myTargetIndex
;
521 /*copies current values for the ErrorFunctor to update */
522 /*Calls the ErrorFunctor */
523 args
->converter
->fromUCharErrorBehaviour ( args
->converter
->fromUContext
,
525 args
->converter
->invalidUCharBuffer
,
526 args
->converter
->invalidUCharLength
,
527 (UChar32
) (mySourceChar
),
530 /*Update the local Indexes so that the conversion
531 *can restart at the right points
533 myTargetIndex
= (int32_t)(args
->target
- (char*)myTarget
);
534 mySourceIndex
= (int32_t)(args
->source
- mySource
);
535 args
->offsets
= saveOffsets
;
// saveIndex becomes the number of bytes the callback wrote.
536 saveIndex
= myTargetIndex
- saveIndex
;
538 args
->offsets
= saveOffsets
;
// Assign currentOffset for each byte the callback wrote (the pointer advance
// is not visible in this listing).
539 while(saveIndex
-->0){
540 *offsets
= currentOffset
;
// Pick up any mode change made during the callback and restore args.
544 isTargetUCharDBCS
=myConverterData
->isTargetUCharDBCS
;
545 args
->source
= saveSource
;
546 args
->target
= saveTarget
;
547 args
->offsets
= saveOffsets
;
548 args
->converter
->fromUSurrogateLead
=0x00;
549 if (U_FAILURE (*err
))
556 *err
= U_BUFFER_OVERFLOW_ERROR
;
559 targetUniChar
=missingCharMarker
;
561 /*If at the end of conversion we are still carrying state information
562 *flush is TRUE, we can deduce that the input stream is truncated
// NOTE(review): the truncation branch clears toUnicodeStatus although this is
// the from-Unicode function - confirm the intended field.
564 if (args
->converter
->fromUSurrogateLead
!=0 && (mySourceIndex
== mySourceLength
) && args
->flush
){
565 *err
= U_TRUNCATED_CHAR_FOUND
;
566 args
->converter
->toUnicodeStatus
= 0x00;
568 /* Reset the state of converter if we consumed
569 * the source and flush is true
571 if( (mySourceIndex
== mySourceLength
) && args
->flush
){
572 _HZReset(args
->converter
, UCNV_RESET_FROM_UNICODE
);
// Publish the final positions and the current mode back to the caller.
575 args
->target
+= myTargetIndex
;
576 args
->source
+= mySourceIndex
;
577 myConverterData
->isTargetUCharDBCS
= isTargetUCharDBCS
;
// _HZ_WriteSub: write the substitution character for an unconvertible input.
//   args        - from-Unicode arguments; args->converter supplies subChar
//                 and the HZ state block.
//   offsetIndex - not referenced on any line visible here (presumably passed
//                 on to ucnv_cbFromUWriteBytes on a missing line).
//   err         - not referenced on any line visible here (likewise).
// When the converter is in double-byte mode a UCNV_CLOSE_BRACE is appended to
// a local buffer and the mode flag is cleared; then the first byte of
// cnv->subChar is appended and the buffer is handed to
// ucnv_cbFromUWriteBytes. Only subChar[0] is used by the visible lines.
// NOTE(review): the declarations of buffer and p and original line 589 are
// missing from this listing; presumably a tilde is written before the close
// brace - confirm against the full source.
581 _HZ_WriteSub(UConverterFromUnicodeArgs
*args
, int32_t offsetIndex
, UErrorCode
*err
) {
582 UConverter
*cnv
= args
->converter
;
583 UConverterDataHZ
*convData
=(UConverterDataHZ
*) cnv
->extraInfo
;
// Leave double-byte mode before writing the single-byte substitution.
588 if( convData
->isTargetUCharDBCS
){
590 *p
++= UCNV_CLOSE_BRACE
;
591 convData
->isTargetUCharDBCS
=FALSE
;
593 *p
++= cnv
->subChar
[0];
// Write the p - buffer bytes collected above.
595 ucnv_cbFromUWriteBytes(args
,
596 buffer
, (int32_t)(p
- buffer
),
// Layout that _HZ_SafeClone overlays on the caller-supplied buffer.
// _HZ_SafeClone also uses members named cnv and subCnv; their declarations
// (and the struct header) are on lines missing from this listing.
// mydata holds the clone's own copy of the HZ state block.
// NOTE(review): deadSpace1/deadSpace2 are UAlignedMemory members, presumably
// padding between the embedded objects - confirm.
600 /* structure for SafeClone calculations */
604 UAlignedMemory deadSpace1
;
606 UAlignedMemory deadSpace2
;
607 UConverterDataHZ mydata
;
// _HZ_SafeClone: clone an HZ converter into a caller-supplied buffer.
//   cnv         - converter to clone.
//   pBufferSize - when *pBufferSize is 0 the required size
//                 (sizeof(struct cloneStruct)) is stored into it.
//   stackBuffer, status - used below; their parameter declarations are on
//                 lines missing from this listing.
// Returns the address of the cloned converter inside the buffer.
// The clone gets its own copy of the HZ state block and its own clone of the
// sub-converter, and is flagged isCopyLocal / isExtraLocal.
// NOTE(review): the early-return statements are on missing lines; presumably
// both the failure check and the preflight branch return without cloning.
// No visible line compares *pBufferSize with bufferSizeNeeded before writing
// into stackBuffer - confirm where that is enforced.
612 _HZ_SafeClone(const UConverter
*cnv
,
614 int32_t *pBufferSize
,
617 struct cloneStruct
* localClone
;
618 int32_t size
, bufferSizeNeeded
= sizeof(struct cloneStruct
);
620 if (U_FAILURE(*status
)){
624 if (*pBufferSize
== 0){ /* 'preflighting' request - set needed size into *pBufferSize */
625 *pBufferSize
= bufferSizeNeeded
;
// Bitwise copy of the converter object into the buffer.
629 localClone
= (struct cloneStruct
*)stackBuffer
;
630 uprv_memcpy(&localClone
->cnv
, cnv
, sizeof(UConverter
));
631 localClone
->cnv
.isCopyLocal
= TRUE
;
// Copy the HZ state block and repoint extraInfo at the copy; isExtraLocal
// makes _HZClose skip uprv_free for it.
633 uprv_memcpy(&localClone
->mydata
, cnv
->extraInfo
, sizeof(UConverterDataHZ
));
634 localClone
->cnv
.extraInfo
= &localClone
->mydata
;
635 localClone
->cnv
.isExtraLocal
= TRUE
;
637 /* deep-clone the sub-converter */
638 size
= (int32_t)sizeof(UConverter
);
639 ((UConverterDataHZ
*)localClone
->cnv
.extraInfo
)->gbConverter
=
640 ucnv_safeClone(((UConverterDataHZ
*)cnv
->extraInfo
)->gbConverter
, &localClone
->subCnv
, &size
, status
);
642 return &localClone
->cnv
;
// _HZ_GetUnicodeSet: collect the code points this converter can handle.
//   cnv        - HZ converter whose sub-converter is queried.
//   which      - selector forwarded unchanged to the sub-converter.
//   pErrorCode - error code forwarded unchanged to the sub-converter.
// The `set` argument is declared on a line missing from this listing.
// The visible code delegates to the getUnicodeSet entry of the
// sub-converter's implementation table, passing the sub-converter itself.
// NOTE(review): the statement that follows the "tilde is hardcoded" comment
// is missing from this listing; presumably it adds '~' to the set - confirm.
646 _HZ_GetUnicodeSet(const UConverter
*cnv
,
648 UConverterUnicodeSet which
,
649 UErrorCode
*pErrorCode
) {
650 /* the tilde '~' is hardcoded in the converter */
653 /* add all of the code points that the sub-converter handles */
654 ((UConverterDataHZ
*)cnv
->extraInfo
)->
655 gbConverter
->sharedData
->impl
->
656 getUnicodeSet(((UConverterDataHZ
*)cnv
->extraInfo
)->gbConverter
,
657 set
, which
, pErrorCode
);
// Implementation table for the HZ converter.
// The visible entries list the to-Unicode function twice and the
// from-Unicode function twice (presumably the plain and the with-offsets
// slots share one implementation - confirm against UConverterImpl).
// NOTE(review): the remaining initializer entries are on lines missing from
// this listing.
660 static const UConverterImpl _HZImpl
={
671 UConverter_toUnicode_HZ_OFFSETS_LOGIC
,
672 UConverter_toUnicode_HZ_OFFSETS_LOGIC
,
673 UConverter_fromUnicode_HZ_OFFSETS_LOGIC
,
674 UConverter_fromUnicode_HZ_OFFSETS_LOGIC
,
// Static descriptor for the HZ converter. The first initializer is the size
// of the structure itself; the last visible one zero-fills a reserved array.
// NOTE(review): the entries in between are on lines missing from this
// listing.
684 static const UConverterStaticData _HZStaticData
={
685 sizeof(UConverterStaticData
),
698 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */
// Shared-data object for the HZ converter; unlike the two tables above it is
// not declared static. The first initializer is the size of the structure
// itself.
// NOTE(review): the remaining initializer entries are on lines missing from
// this listing.
703 const UConverterSharedData _HZData
={
704 sizeof(UConverterSharedData
),
714 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */