1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2000-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
10 * tab size: 8 (not used)
13 * created on: 2000oct16
14 * created by: Ram Viswanadha
15 * 10/31/2000 Ram Implemented offsets logic function
19 #include "unicode/utypes.h"
21 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
24 #include "unicode/ucnv.h"
25 #include "unicode/ucnv_cb.h"
26 #include "unicode/uset.h"
27 #include "unicode/utf16.h"
/* Byte values of the three characters that make up HZ escape sequences. */
32 #define UCNV_TILDE 0x7E /* '~': the HZ escape character */
33 #define UCNV_OPEN_BRACE 0x7B /* '{': second byte of the escape-to-GB sequence */
34 #define UCNV_CLOSE_BRACE 0x7D /* '}': second byte of the escape-from-GB sequence */
/* The two-byte escape sequences themselves, spelled as string literals. */
35 #define SB_ESCAPE "\x7E\x7D" /* "~}": switch from GB mode back to ASCII */
36 #define DB_ESCAPE "\x7E\x7B" /* "~{": switch from ASCII to GB mode */
37 #define TILDE_ESCAPE "\x7E\x7E" /* "~~": stands for one literal '~' */
41 #define CONCAT_ESCAPE_MACRO( args, targetIndex,targetLength,strToAppend, err, len,sourceIndex){ \
43 if(targetIndex < targetLength){ \
44 args->target[targetIndex] = (unsigned char) *strToAppend; \
45 if(args->offsets!=NULL){ \
46 *(offsets++) = sourceIndex-1; \
51 args->converter->charErrorBuffer[(int)args->converter->charErrorBufferLength++] = (unsigned char) *strToAppend; \
52 *err =U_BUFFER_OVERFLOW_ERROR; \
60 UConverter
* gbConverter
;
63 UBool isEscapeAppended
;
65 UBool isTargetUCharDBCS
;
71 static void U_CALLCONV
72 _HZOpen(UConverter
*cnv
, UConverterLoadArgs
*pArgs
, UErrorCode
*errorCode
){
73 UConverter
*gbConverter
;
74 if(pArgs
->onlyTestIsLoadable
) {
75 ucnv_canCreateConverter("GBK", errorCode
); /* errorCode carries result */
78 gbConverter
= ucnv_open("GBK", errorCode
);
79 if(U_FAILURE(*errorCode
)) {
82 cnv
->toUnicodeStatus
= 0;
83 cnv
->fromUnicodeStatus
= 0;
85 cnv
->fromUChar32
=0x0000;
86 cnv
->extraInfo
= uprv_calloc(1, sizeof(UConverterDataHZ
));
87 if(cnv
->extraInfo
!= NULL
){
88 ((UConverterDataHZ
*)cnv
->extraInfo
)->gbConverter
= gbConverter
;
91 ucnv_close(gbConverter
);
92 *errorCode
= U_MEMORY_ALLOCATION_ERROR
;
97 static void U_CALLCONV
98 _HZClose(UConverter
*cnv
){
99 if(cnv
->extraInfo
!= NULL
) {
100 ucnv_close (((UConverterDataHZ
*) (cnv
->extraInfo
))->gbConverter
);
101 if(!cnv
->isExtraLocal
) {
102 uprv_free(cnv
->extraInfo
);
104 cnv
->extraInfo
= NULL
;
108 static void U_CALLCONV
109 _HZReset(UConverter
*cnv
, UConverterResetChoice choice
){
110 if(choice
<=UCNV_RESET_TO_UNICODE
) {
111 cnv
->toUnicodeStatus
= 0;
113 if(cnv
->extraInfo
!= NULL
){
114 ((UConverterDataHZ
*)cnv
->extraInfo
)->isStateDBCS
= FALSE
;
115 ((UConverterDataHZ
*)cnv
->extraInfo
)->isEmptySegment
= FALSE
;
118 if(choice
!=UCNV_RESET_TO_UNICODE
) {
119 cnv
->fromUnicodeStatus
= 0;
120 cnv
->fromUChar32
=0x0000;
121 if(cnv
->extraInfo
!= NULL
){
122 ((UConverterDataHZ
*)cnv
->extraInfo
)->isEscapeAppended
= FALSE
;
123 ((UConverterDataHZ
*)cnv
->extraInfo
)->targetIndex
= 0;
124 ((UConverterDataHZ
*)cnv
->extraInfo
)->sourceIndex
= 0;
125 ((UConverterDataHZ
*)cnv
->extraInfo
)->isTargetUCharDBCS
= FALSE
;
130 /**************************************HZ Encoding*************************************************
131 * Rules for HZ encoding
133 * In ASCII mode, a byte is interpreted as an ASCII character, unless a
134 * '~' is encountered. The character '~' is an escape character. By
135 * convention, it must be immediately followed ONLY by '~', '{' or '\n'
136 * (<LF>), with the following special meaning.
138 * 1. The escape sequence '~~' is interpreted as a '~'.
139 * 2. The escape-to-GB sequence '~{' switches the mode from ASCII to GB.
140 * 3. The escape sequence '~\n' is a line-continuation marker to be
141 * consumed with no output produced.
142 * In GB mode, characters are interpreted two bytes at a time as (pure)
143 * GB codes until the escape-from-GB code '~}' is read. This code
144 * switches the mode from GB back to ASCII. (Note that the escape-
145 * from-GB code '~}' ($7E7D) is outside the defined GB range.)
149 * Note that the formal syntax in RFC 1842 is invalid. I assume that the
150 * intended definition of single-byte-segment is as follows (pedberg):
151 * single-byte-segment = single-byte-seq 1*single-byte-char
155 static void U_CALLCONV
156 UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
159 const char *mySource
= ( char *) args
->source
;
160 UChar
*myTarget
= args
->target
;
161 const char *mySourceLimit
= args
->sourceLimit
;
162 UChar32 targetUniChar
= 0x0000;
163 int32_t mySourceChar
= 0x0000;
164 UConverterDataHZ
* myData
=(UConverterDataHZ
*)(args
->converter
->extraInfo
);
168 /* Calling code already handles this situation. */
169 /*if ((args->converter == NULL) || (args->targetLimit < args->target) || (mySourceLimit < args->source)){
170 *err = U_ILLEGAL_ARGUMENT_ERROR;
174 while(mySource
< mySourceLimit
){
176 if(myTarget
< args
->targetLimit
){
178 mySourceChar
= (unsigned char) *mySource
++;
180 if(args
->converter
->mode
== UCNV_TILDE
) {
181 /* second byte after ~ */
182 args
->converter
->mode
=0;
183 switch(mySourceChar
) {
185 /* no output for ~\n (line-continuation marker) */
189 args
->offsets
[myTarget
- args
->target
]=(int32_t)(mySource
- args
->source
- 2);
191 *(myTarget
++)=(UChar
)mySourceChar
;
192 myData
->isEmptySegment
= FALSE
;
194 case UCNV_OPEN_BRACE
:
195 case UCNV_CLOSE_BRACE
:
196 myData
->isStateDBCS
= (mySourceChar
== UCNV_OPEN_BRACE
);
197 if (myData
->isEmptySegment
) {
198 myData
->isEmptySegment
= FALSE
; /* we are handling it, reset to avoid future spurious errors */
199 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
200 args
->converter
->toUCallbackReason
= UCNV_IRREGULAR
;
201 args
->converter
->toUBytes
[0] = UCNV_TILDE
;
202 args
->converter
->toUBytes
[1] = static_cast<uint8_t>(mySourceChar
);
203 args
->converter
->toULength
= 2;
204 args
->target
= myTarget
;
205 args
->source
= mySource
;
208 myData
->isEmptySegment
= TRUE
;
211 /* if the first byte is equal to TILDE and the trail byte
212 * is not a valid byte then it is an error condition
215 * Ticket 5691: consistent illegal sequences:
216 * - We include at least the first byte in the illegal sequence.
217 * - If any of the non-initial bytes could be the start of a character,
218 * we stop the illegal sequence before the first one of those.
220 myData
->isEmptySegment
= FALSE
; /* different error here, reset this to avoid spurious future error */
221 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
222 args
->converter
->toUBytes
[0] = UCNV_TILDE
;
223 if( myData
->isStateDBCS
?
224 (0x21 <= mySourceChar
&& mySourceChar
<= 0x7e) :
227 /* The current byte could be the start of a character: Back it out. */
228 args
->converter
->toULength
= 1;
231 /* Include the current byte in the illegal sequence. */
232 args
->converter
->toUBytes
[1] = static_cast<uint8_t>(mySourceChar
);
233 args
->converter
->toULength
= 2;
235 args
->target
= myTarget
;
236 args
->source
= mySource
;
239 } else if(myData
->isStateDBCS
) {
240 if(args
->converter
->toUnicodeStatus
== 0x00){
242 if(mySourceChar
== UCNV_TILDE
) {
243 args
->converter
->mode
= UCNV_TILDE
;
245 /* add another bit to distinguish a 0 byte from not having seen a lead byte */
246 args
->converter
->toUnicodeStatus
= (uint32_t) (mySourceChar
| 0x100);
247 myData
->isEmptySegment
= FALSE
; /* the segment has something, either valid or will produce a different error, so reset this */
253 int leadIsOk
, trailIsOk
;
254 uint32_t leadByte
= args
->converter
->toUnicodeStatus
& 0xff;
255 targetUniChar
= 0xffff;
257 * Ticket 5691: consistent illegal sequences:
258 * - We include at least the first byte in the illegal sequence.
259 * - If any of the non-initial bytes could be the start of a character,
260 * we stop the illegal sequence before the first one of those.
262 * In HZ DBCS, if the second byte is in the 21..7e range,
263 * we report only the first byte as the illegal sequence.
264 * Otherwise we convert or report the pair of bytes.
266 leadIsOk
= (uint8_t)(leadByte
- 0x21) <= (0x7d - 0x21);
267 trailIsOk
= (uint8_t)(mySourceChar
- 0x21) <= (0x7e - 0x21);
268 if (leadIsOk
&& trailIsOk
) {
269 tempBuf
[0] = (char) (leadByte
+0x80) ;
270 tempBuf
[1] = (char) (mySourceChar
+0x80);
271 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(myData
->gbConverter
->sharedData
,
272 tempBuf
, 2, args
->converter
->useFallback
);
273 mySourceChar
= (leadByte
<< 8) | mySourceChar
;
274 } else if (trailIsOk
) {
275 /* report a single illegal byte and continue with the following DBCS starter byte */
277 mySourceChar
= (int32_t)leadByte
;
279 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
280 /* add another bit so that the code below writes 2 bytes in case of error */
281 mySourceChar
= 0x10000 | (leadByte
<< 8) | mySourceChar
;
283 args
->converter
->toUnicodeStatus
=0x00;
287 if(mySourceChar
== UCNV_TILDE
) {
288 args
->converter
->mode
= UCNV_TILDE
;
290 } else if(mySourceChar
<= 0x7f) {
291 targetUniChar
= (UChar
)mySourceChar
; /* ASCII */
292 myData
->isEmptySegment
= FALSE
; /* the segment has something valid */
294 targetUniChar
= 0xffff;
295 myData
->isEmptySegment
= FALSE
; /* different error here, reset this to avoid spurious future error */
298 if(targetUniChar
< 0xfffe){
300 args
->offsets
[myTarget
- args
->target
]=(int32_t)(mySource
- args
->source
- 1-(myData
->isStateDBCS
));
303 *(myTarget
++)=(UChar
)targetUniChar
;
305 else /* targetUniChar>=0xfffe */ {
306 if(targetUniChar
== 0xfffe){
307 *err
= U_INVALID_CHAR_FOUND
;
310 *err
= U_ILLEGAL_CHAR_FOUND
;
312 if(mySourceChar
> 0xff){
313 args
->converter
->toUBytes
[0] = (uint8_t)(mySourceChar
>> 8);
314 args
->converter
->toUBytes
[1] = (uint8_t)mySourceChar
;
315 args
->converter
->toULength
=2;
318 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
319 args
->converter
->toULength
=1;
325 *err
=U_BUFFER_OVERFLOW_ERROR
;
330 args
->target
= myTarget
;
331 args
->source
= mySource
;
335 static void U_CALLCONV
336 UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs
* args
,
338 const UChar
*mySource
= args
->source
;
339 char *myTarget
= args
->target
;
340 int32_t* offsets
= args
->offsets
;
341 int32_t mySourceIndex
= 0;
342 int32_t myTargetIndex
= 0;
343 int32_t targetLength
= (int32_t)(args
->targetLimit
- myTarget
);
344 int32_t mySourceLength
= (int32_t)(args
->sourceLimit
- args
->source
);
345 uint32_t targetUniChar
= 0x0000;
346 UChar32 mySourceChar
= 0x0000;
347 UConverterDataHZ
*myConverterData
=(UConverterDataHZ
*)args
->converter
->extraInfo
;
348 UBool isTargetUCharDBCS
= (UBool
) myConverterData
->isTargetUCharDBCS
;
349 UBool oldIsTargetUCharDBCS
;
351 const char* escSeq
=NULL
;
353 /* Calling code already handles this situation. */
354 /*if ((args->converter == NULL) || (args->targetLimit < myTarget) || (args->sourceLimit < args->source)){
355 *err = U_ILLEGAL_ARGUMENT_ERROR;
358 if(args
->converter
->fromUChar32
!=0 && myTargetIndex
< targetLength
) {
361 /*writing the char to the output stream */
362 while (mySourceIndex
< mySourceLength
){
363 targetUniChar
= missingCharMarker
;
364 if (myTargetIndex
< targetLength
){
366 mySourceChar
= (UChar
) mySource
[mySourceIndex
++];
369 oldIsTargetUCharDBCS
= isTargetUCharDBCS
;
370 if(mySourceChar
==UCNV_TILDE
){
371 /*concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex);*/
373 escSeq
= TILDE_ESCAPE
;
374 CONCAT_ESCAPE_MACRO(args
, myTargetIndex
, targetLength
, escSeq
,err
,len
,mySourceIndex
);
376 } else if(mySourceChar
<= 0x7f) {
377 targetUniChar
= mySourceChar
;
379 int32_t length
= ucnv_MBCSFromUChar32(myConverterData
->gbConverter
->sharedData
,
380 mySourceChar
,&targetUniChar
,args
->converter
->useFallback
);
381 /* we can only use lead bytes 21..7D and trail bytes 21..7E */
383 (uint16_t)(targetUniChar
- 0xa1a1) <= (0xfdfe - 0xa1a1) &&
384 (uint8_t)(targetUniChar
- 0xa1) <= (0xfe - 0xa1)
386 targetUniChar
-= 0x8080;
388 targetUniChar
= missingCharMarker
;
391 if (targetUniChar
!= missingCharMarker
){
392 myConverterData
->isTargetUCharDBCS
= isTargetUCharDBCS
= (UBool
)(targetUniChar
>0x00FF);
393 if(oldIsTargetUCharDBCS
!= isTargetUCharDBCS
|| !myConverterData
->isEscapeAppended
){
394 /*Shifting from a double byte to single byte mode*/
395 if(!isTargetUCharDBCS
){
398 CONCAT_ESCAPE_MACRO(args
, myTargetIndex
, targetLength
, escSeq
,err
,len
,mySourceIndex
);
399 myConverterData
->isEscapeAppended
= TRUE
;
401 else{ /* Shifting from a single byte to double byte mode*/
404 CONCAT_ESCAPE_MACRO(args
, myTargetIndex
, targetLength
, escSeq
,err
,len
,mySourceIndex
);
405 myConverterData
->isEscapeAppended
= TRUE
;
410 if(isTargetUCharDBCS
){
411 if( myTargetIndex
<targetLength
){
412 myTarget
[myTargetIndex
++] =(char) (targetUniChar
>> 8);
414 *(offsets
++) = mySourceIndex
-1;
416 if(myTargetIndex
< targetLength
){
417 myTarget
[myTargetIndex
++] =(char) targetUniChar
;
419 *(offsets
++) = mySourceIndex
-1;
422 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (char) targetUniChar
;
423 *err
= U_BUFFER_OVERFLOW_ERROR
;
426 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] =(char) (targetUniChar
>> 8);
427 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (char) targetUniChar
;
428 *err
= U_BUFFER_OVERFLOW_ERROR
;
432 if( myTargetIndex
<targetLength
){
433 myTarget
[myTargetIndex
++] = (char) (targetUniChar
);
435 *(offsets
++) = mySourceIndex
-1;
439 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (char) targetUniChar
;
440 *err
= U_BUFFER_OVERFLOW_ERROR
;
446 /* oops.. the code point is unassigned */
447 /*Handle surrogates */
448 /*check if the char is a First surrogate*/
449 if(U16_IS_SURROGATE(mySourceChar
)) {
450 if(U16_IS_SURROGATE_LEAD(mySourceChar
)) {
451 args
->converter
->fromUChar32
=mySourceChar
;
453 /*look ahead to find the trail surrogate*/
454 if(mySourceIndex
< mySourceLength
) {
455 /* test the following code unit */
456 UChar trail
=(UChar
) args
->source
[mySourceIndex
];
457 if(U16_IS_TRAIL(trail
)) {
459 mySourceChar
=U16_GET_SUPPLEMENTARY(args
->converter
->fromUChar32
, trail
);
460 args
->converter
->fromUChar32
=0x00;
461 /* there are no surrogates in GB2312*/
462 *err
= U_INVALID_CHAR_FOUND
;
463 /* exit this condition tree */
465 /* this is an unmatched lead code unit (1st surrogate) */
466 /* callback(illegal) */
467 *err
=U_ILLEGAL_CHAR_FOUND
;
474 /* this is an unmatched trail code unit (2nd surrogate) */
475 /* callback(illegal) */
476 *err
=U_ILLEGAL_CHAR_FOUND
;
479 /* callback(unassigned) for a BMP code point */
480 *err
= U_INVALID_CHAR_FOUND
;
483 args
->converter
->fromUChar32
=mySourceChar
;
488 *err
= U_BUFFER_OVERFLOW_ERROR
;
491 targetUniChar
=missingCharMarker
;
494 args
->target
+= myTargetIndex
;
495 args
->source
+= mySourceIndex
;
496 myConverterData
->isTargetUCharDBCS
= isTargetUCharDBCS
;
499 static void U_CALLCONV
500 _HZ_WriteSub(UConverterFromUnicodeArgs
*args
, int32_t offsetIndex
, UErrorCode
*err
) {
501 UConverter
*cnv
= args
->converter
;
502 UConverterDataHZ
*convData
=(UConverterDataHZ
*) cnv
->extraInfo
;
507 if( convData
->isTargetUCharDBCS
){
509 *p
++= UCNV_CLOSE_BRACE
;
510 convData
->isTargetUCharDBCS
=FALSE
;
512 *p
++= (char)cnv
->subChars
[0];
514 ucnv_cbFromUWriteBytes(args
,
515 buffer
, (int32_t)(p
- buffer
),
520 * Structure for cloning an HZ converter into a single memory block.
521 * ucnv_safeClone() of the HZ converter will align the entire cloneHZStruct,
522 * and then ucnv_safeClone() of the sub-converter may additionally align
523 * subCnv inside the cloneHZStruct, for which we need the deadSpace after
524 * subCnv. This is because UAlignedMemory may be larger than the actually
525 * necessary alignment size for the platform.
526 * The other cloneHZStruct fields will not be moved around,
527 * and are aligned properly with cloneHZStruct's alignment.
533 UAlignedMemory deadSpace
;
534 UConverterDataHZ mydata
;
538 static UConverter
* U_CALLCONV
539 _HZ_SafeClone(const UConverter
*cnv
,
541 int32_t *pBufferSize
,
544 struct cloneHZStruct
* localClone
;
545 int32_t size
, bufferSizeNeeded
= sizeof(struct cloneHZStruct
);
547 if (U_FAILURE(*status
)){
551 if (*pBufferSize
== 0){ /* 'preflighting' request - set needed size into *pBufferSize */
552 *pBufferSize
= bufferSizeNeeded
;
556 localClone
= (struct cloneHZStruct
*)stackBuffer
;
557 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
559 uprv_memcpy(&localClone
->mydata
, cnv
->extraInfo
, sizeof(UConverterDataHZ
));
560 localClone
->cnv
.extraInfo
= &localClone
->mydata
;
561 localClone
->cnv
.isExtraLocal
= TRUE
;
563 /* deep-clone the sub-converter */
564 size
= (int32_t)(sizeof(UConverter
) + sizeof(UAlignedMemory
)); /* include size of padding */
565 ((UConverterDataHZ
*)localClone
->cnv
.extraInfo
)->gbConverter
=
566 ucnv_safeClone(((UConverterDataHZ
*)cnv
->extraInfo
)->gbConverter
, &localClone
->subCnv
, &size
, status
);
568 return &localClone
->cnv
;
571 static void U_CALLCONV
572 _HZ_GetUnicodeSet(const UConverter
*cnv
,
574 UConverterUnicodeSet which
,
575 UErrorCode
*pErrorCode
) {
576 /* HZ converts all of ASCII */
577 sa
->addRange(sa
->set
, 0, 0x7f);
579 /* add all of the code points that the sub-converter handles */
580 ucnv_MBCSGetFilteredUnicodeSetForUnicode(
581 ((UConverterDataHZ
*)cnv
->extraInfo
)->gbConverter
->sharedData
,
582 sa
, which
, UCNV_SET_FILTER_HZ
,
586 static const UConverterImpl _HZImpl
={
597 UConverter_toUnicode_HZ_OFFSETS_LOGIC
,
598 UConverter_toUnicode_HZ_OFFSETS_LOGIC
,
599 UConverter_fromUnicode_HZ_OFFSETS_LOGIC
,
600 UConverter_fromUnicode_HZ_OFFSETS_LOGIC
,
612 static const UConverterStaticData _HZStaticData
={
613 sizeof(UConverterStaticData
),
626 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */
/*
 * Shared-data record for the HZ converter. The initializer macro is handed
 * the addresses of this file's static description (_HZStaticData) and of its
 * implementation table (_HZImpl).
 */
630 const UConverterSharedData _HZData
=
631 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_HZStaticData
, &_HZImpl
);
633 #endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION */