]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnvhz.c
ICU-6.2.22.tar.gz
[apple/icu.git] / icuSources / common / ucnvhz.c
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
374ca955 3* Copyright (C) 2000-2004, International Business Machines
b75a7d8f
A
4* Corporation and others. All Rights Reserved.
5**********************************************************************
6* file name: ucnvhz.c
7* encoding: US-ASCII
8* tab size: 8 (not used)
9* indentation:4
10*
11* created on: 2000oct16
12* created by: Ram Viswanadha
13* 10/31/2000 Ram Implemented offsets logic function
14*
15*/
16
17#include "unicode/utypes.h"
18
374ca955 19#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
b75a7d8f
A
20
21#include "cmemory.h"
b75a7d8f
A
22#include "unicode/ucnv.h"
23#include "unicode/ucnv_cb.h"
24#include "unicode/uset.h"
25#include "ucnv_bld.h"
26#include "ucnv_cnv.h"
27
28#define UCNV_TILDE 0x7E /* ~ */
29#define UCNV_OPEN_BRACE 0x7B /* { */
30#define UCNV_CLOSE_BRACE 0x7D /* } */
31#define SB_ESCAPE "\x7E\x7D"
32#define DB_ESCAPE "\x7E\x7B"
33#define TILDE_ESCAPE "\x7E\x7E"
34#define ESC_LEN 2
35
36
/*
 * Append `len` bytes taken from `strToAppend` to the converter output.
 *
 * Bytes that fit are stored at args->target[targetIndex] (advancing
 * targetIndex); when args->offsets is non-NULL each stored byte also records
 * `sourceIndex - 1` as its offset.  Bytes that do not fit are pushed onto
 * args->converter->charErrorBuffer and *err is set to
 * U_BUFFER_OVERFLOW_ERROR.
 *
 * Side effects on the arguments: `len` is counted down (it is -1 afterwards),
 * `strToAppend` is advanced past the copied bytes, `targetIndex` is advanced.
 * All three must therefore be modifiable lvalues.
 *
 * NOTE: the expansion also advances a variable named `offsets` that must be
 * in scope at the call site (it is not a macro parameter).
 *
 * Wrapped in do { } while(0) so that `CONCAT_ESCAPE_MACRO(...);` is a single
 * statement in every context.
 */
#define CONCAT_ESCAPE_MACRO(args, targetIndex, targetLength, strToAppend, err, len, sourceIndex) \
    do { \
        while((len)-- > 0) { \
            if((targetIndex) < (targetLength)) { \
                (args)->target[(targetIndex)] = (unsigned char) *(strToAppend); \
                if((args)->offsets != NULL) { \
                    *(offsets++) = (sourceIndex) - 1; \
                } \
                (targetIndex)++; \
            } \
            else { \
                (args)->converter->charErrorBuffer[(int)(args)->converter->charErrorBufferLength++] = (unsigned char) *(strToAppend); \
                *(err) = U_BUFFER_OVERFLOW_ERROR; \
            } \
            (strToAppend)++; \
        } \
    } while(0)
53
54
55typedef struct{
56 int32_t targetIndex;
57 int32_t sourceIndex;
58 UBool isEscapeAppended;
59 UConverter* gbConverter;
60 UBool isStateDBCS;
61 UBool isTargetUCharDBCS;
62}UConverterDataHZ;
63
64
65
66static void
67_HZOpen(UConverter *cnv, const char *name,const char *locale,uint32_t options, UErrorCode *errorCode){
68 cnv->toUnicodeStatus = 0;
69 cnv->fromUnicodeStatus= 0;
70 cnv->mode=0;
374ca955 71 cnv->fromUChar32=0x0000;
b75a7d8f
A
72 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataHZ));
73 if(cnv->extraInfo != NULL){
74 ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode);
75 ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
76 ((UConverterDataHZ*)cnv->extraInfo)->isEscapeAppended = FALSE;
77 ((UConverterDataHZ*)cnv->extraInfo)->targetIndex = 0;
78 ((UConverterDataHZ*)cnv->extraInfo)->sourceIndex = 0;
79 ((UConverterDataHZ*)cnv->extraInfo)->isTargetUCharDBCS = FALSE;
80 }
81 /* test for NULL */
82 else {
83 *errorCode = U_MEMORY_ALLOCATION_ERROR;
84 return;
85 }
86}
87
88static void
89_HZClose(UConverter *cnv){
90 if(cnv->extraInfo != NULL) {
91 ucnv_close (((UConverterDataHZ *) (cnv->extraInfo))->gbConverter);
92 if(!cnv->isExtraLocal) {
93 uprv_free(cnv->extraInfo);
94 }
95 cnv->extraInfo = NULL;
96 }
97}
98
99static void
100_HZReset(UConverter *cnv, UConverterResetChoice choice){
101 if(choice<=UCNV_RESET_TO_UNICODE) {
102 cnv->toUnicodeStatus = 0;
103 cnv->mode=0;
104 if(cnv->extraInfo != NULL){
105 ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
106 }
107 }
108 if(choice!=UCNV_RESET_TO_UNICODE) {
109 cnv->fromUnicodeStatus= 0;
374ca955 110 cnv->fromUChar32=0x0000;
b75a7d8f
A
111 if(cnv->extraInfo != NULL){
112 ((UConverterDataHZ*)cnv->extraInfo)->isEscapeAppended = FALSE;
113 ((UConverterDataHZ*)cnv->extraInfo)->targetIndex = 0;
114 ((UConverterDataHZ*)cnv->extraInfo)->sourceIndex = 0;
115 ((UConverterDataHZ*)cnv->extraInfo)->isTargetUCharDBCS = FALSE;
116 }
117 }
118}
119
120/**************************************HZ Encoding*************************************************
121* Rules for HZ encoding
122*
123* In ASCII mode, a byte is interpreted as an ASCII character, unless a
124* '~' is encountered. The character '~' is an escape character. By
125* convention, it must be immediately followed ONLY by '~', '{' or '\n'
126* (<LF>), with the following special meaning.
127
128* 1. The escape sequence '~~' is interpreted as a '~'.
129* 2. The escape-to-GB sequence '~{' switches the mode from ASCII to GB.
130* 3. The escape sequence '~\n' is a line-continuation marker to be
131* consumed with no output produced.
132* In GB mode, characters are interpreted two bytes at a time as (pure)
133* GB codes until the escape-from-GB code '~}' is read. This code
134* switches the mode from GB back to ASCII. (Note that the escape-
135* from-GB code '~}' ($7E7D) is outside the defined GB range.)
136*
137* Source: RFC 1842
138*/
139
140
141static void
142UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
143 UErrorCode* err){
374ca955 144 char tempBuf[2];
b75a7d8f
A
145 const char *mySource = ( char *) args->source;
146 UChar *myTarget = args->target;
b75a7d8f
A
147 const char *mySourceLimit = args->sourceLimit;
148 UChar32 targetUniChar = 0x0000;
149 UChar mySourceChar = 0x0000;
150 UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);
151
374ca955 152 if ((args->converter == NULL) || (args->targetLimit < args->target) || (mySourceLimit < args->source)){
b75a7d8f
A
153 *err = U_ILLEGAL_ARGUMENT_ERROR;
154 return;
155 }
156
374ca955 157 while(mySource< mySourceLimit){
b75a7d8f
A
158
159 if(myTarget < args->targetLimit){
160
161 mySourceChar= (unsigned char) *mySource++;
162
163 switch(mySourceChar){
164 case 0x0A:
165 if(args->converter->mode ==UCNV_TILDE){
166 args->converter->mode=0;
167
168 }
169 *(myTarget++)=(UChar)mySourceChar;
170 continue;
171
172 case UCNV_TILDE:
173 if(args->converter->mode ==UCNV_TILDE){
174 *(myTarget++)=(UChar)mySourceChar;
175 args->converter->mode=0;
176 continue;
177
178 }
179 else if(args->converter->toUnicodeStatus !=0){
180 args->converter->mode=0;
181 break;
182 }
183 else{
184 args->converter->mode = UCNV_TILDE;
185 continue;
186 }
187
188
189 case UCNV_OPEN_BRACE:
190 if(args->converter->mode == UCNV_TILDE){
191 args->converter->mode=0;
192 myData->isStateDBCS = TRUE;
193 continue;
194 }
195 else{
196 break;
197 }
198
199
200 case UCNV_CLOSE_BRACE:
201 if(args->converter->mode == UCNV_TILDE){
202 args->converter->mode=0;
203 myData->isStateDBCS = FALSE;
204 continue;
205 }
206 else{
207 break;
208 }
209
210 default:
211 /* if the first byte is equal to TILDE and the trail byte
212 * is not a valid byte then it is an error condition
213 */
214 if(args->converter->mode == UCNV_TILDE){
215 args->converter->mode=0;
216 mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
217 goto SAVE_STATE;
218 }
219
220 break;
221
222 }
223
224 if(myData->isStateDBCS){
225 if(args->converter->toUnicodeStatus == 0x00){
226 args->converter->toUnicodeStatus = (UChar) mySourceChar;
227 continue;
228 }
229 else{
230 tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ;
231 tempBuf[1] = (char) (mySourceChar+0x80);
232 mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
233 args->converter->toUnicodeStatus =0x00;
374ca955
A
234 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
235 tempBuf, 2, args->converter->useFallback);
b75a7d8f
A
236 }
237 }
238 else{
239 if(args->converter->fromUnicodeStatus == 0x00){
374ca955
A
240 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
241 mySource - 1, 1, args->converter->useFallback);
b75a7d8f
A
242 }
243 else{
244 goto SAVE_STATE;
245 }
246
247 }
248 if(targetUniChar < 0xfffe){
249 if(args->offsets) {
250 args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 1-(myData->isStateDBCS));
251 }
252
253 *(myTarget++)=(UChar)targetUniChar;
254 }
255 else if(targetUniChar>=0xfffe){
256SAVE_STATE:
374ca955
A
257 if(targetUniChar == 0xfffe){
258 *err = U_INVALID_CHAR_FOUND;
259 }
260 else{
261 *err = U_ILLEGAL_CHAR_FOUND;
262 }
263 if(myData->isStateDBCS){
264 args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80);
265 args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80);
266 args->converter->toULength=2;
267 }
268 else{
269 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
270 args->converter->toULength=1;
b75a7d8f 271 }
374ca955 272 break;
b75a7d8f
A
273 }
274 }
275 else{
276 *err =U_BUFFER_OVERFLOW_ERROR;
277 break;
278 }
279 }
b75a7d8f
A
280
281 args->target = myTarget;
282 args->source = mySource;
283}
284
285
286static void
287UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
288 UErrorCode * err){
289 const UChar *mySource = args->source;
374ca955 290 char *myTarget = args->target;
b75a7d8f
A
291 int32_t* offsets = args->offsets;
292 int32_t mySourceIndex = 0;
293 int32_t myTargetIndex = 0;
374ca955 294 int32_t targetLength = (int32_t)(args->targetLimit - myTarget);
b75a7d8f
A
295 int32_t mySourceLength = (int32_t)(args->sourceLimit - args->source);
296 int32_t length=0;
297 uint32_t targetUniChar = 0x0000;
298 UChar32 mySourceChar = 0x0000,c=0x0000;
299 UConverterDataHZ *myConverterData=(UConverterDataHZ*)args->converter->extraInfo;
300 UBool isTargetUCharDBCS = (UBool) myConverterData->isTargetUCharDBCS;
301 UBool oldIsTargetUCharDBCS = isTargetUCharDBCS;
b75a7d8f
A
302 UBool isEscapeAppended =FALSE;
303 int len =0;
304 const char* escSeq=NULL;
305
374ca955 306 if ((args->converter == NULL) || (args->targetLimit < myTarget) || (args->sourceLimit < args->source)){
b75a7d8f
A
307 *err = U_ILLEGAL_ARGUMENT_ERROR;
308 return;
309 }
374ca955 310 if(args->converter->fromUChar32!=0 && myTargetIndex < targetLength) {
b75a7d8f
A
311 goto getTrail;
312 }
313 /*writing the char to the output stream */
314 while (mySourceIndex < mySourceLength){
315 targetUniChar = missingCharMarker;
316 if (myTargetIndex < targetLength){
317
374ca955 318 c=mySourceChar = (UChar) mySource[mySourceIndex++];
b75a7d8f
A
319
320
321 oldIsTargetUCharDBCS = isTargetUCharDBCS;
322 if(mySourceChar ==UCNV_TILDE){
323 /*concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex);*/
324 len = ESC_LEN;
325 escSeq = TILDE_ESCAPE;
326 CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
327 continue;
328 }
329 else{
374ca955 330 length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData,
b75a7d8f
A
331 mySourceChar,&targetUniChar,args->converter->useFallback);
332
333 }
334 /* only DBCS or SBCS characters are expected*/
335 /* DB haracters with high bit set to 1 are expected */
336 if(length > 2 || length==0 ||(((targetUniChar & 0x8080) != 0x8080)&& length==2)){
337 targetUniChar= missingCharMarker;
338 }
339 if (targetUniChar != missingCharMarker){
340 myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF);
341 if(oldIsTargetUCharDBCS != isTargetUCharDBCS || !myConverterData->isEscapeAppended ){
342 /*Shifting from a double byte to single byte mode*/
343 if(!isTargetUCharDBCS){
344 len =ESC_LEN;
345 escSeq = SB_ESCAPE;
346 CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
347 myConverterData->isEscapeAppended =isEscapeAppended =TRUE;
348 }
349 else{ /* Shifting from a single byte to double byte mode*/
350 len =ESC_LEN;
351 escSeq = DB_ESCAPE;
352 CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
353 myConverterData->isEscapeAppended =isEscapeAppended =TRUE;
354
355 }
356 }
357
358 if(isTargetUCharDBCS){
359 if( myTargetIndex <targetLength){
374ca955 360 myTarget[myTargetIndex++] =(char) ((targetUniChar >> 8) -0x80);
b75a7d8f
A
361 if(offsets){
362 *(offsets++) = mySourceIndex-1;
363 }
364 if(myTargetIndex < targetLength){
374ca955 365 myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80);
b75a7d8f
A
366 if(offsets){
367 *(offsets++) = mySourceIndex-1;
368 }
369 }else{
370 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
371 *err = U_BUFFER_OVERFLOW_ERROR;
372 }
373 }else{
374 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80);
375 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
376 *err = U_BUFFER_OVERFLOW_ERROR;
377 }
378
379 }else{
380 if( myTargetIndex <targetLength){
374ca955 381 myTarget[myTargetIndex++] = (char) (targetUniChar );
b75a7d8f
A
382 if(offsets){
383 *(offsets++) = mySourceIndex-1;
384 }
385
386 }else{
387 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
388 *err = U_BUFFER_OVERFLOW_ERROR;
389 }
390 }
391
392 }
393 else{
374ca955 394 /* oops.. the code point is unassigned */
b75a7d8f
A
395 /*Handle surrogates */
396 /*check if the char is a First surrogate*/
397 if(UTF_IS_SURROGATE(mySourceChar)) {
398 if(UTF_IS_SURROGATE_FIRST(mySourceChar)) {
374ca955 399 args->converter->fromUChar32=mySourceChar;
b75a7d8f
A
400getTrail:
401 /*look ahead to find the trail surrogate*/
402 if(mySourceIndex < mySourceLength) {
403 /* test the following code unit */
404 UChar trail=(UChar) args->source[mySourceIndex];
405 if(UTF_IS_SECOND_SURROGATE(trail)) {
406 ++mySourceIndex;
374ca955
A
407 mySourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUChar32, trail);
408 args->converter->fromUChar32=0x00;
b75a7d8f
A
409 /* there are no surrogates in GB2312*/
410 *err = U_INVALID_CHAR_FOUND;
b75a7d8f
A
411 /* exit this condition tree */
412 } else {
413 /* this is an unmatched lead code unit (1st surrogate) */
414 /* callback(illegal) */
b75a7d8f
A
415 *err=U_ILLEGAL_CHAR_FOUND;
416 }
417 } else {
418 /* no more input */
419 *err = U_ZERO_ERROR;
b75a7d8f
A
420 }
421 } else {
422 /* this is an unmatched trail code unit (2nd surrogate) */
423 /* callback(illegal) */
b75a7d8f
A
424 *err=U_ILLEGAL_CHAR_FOUND;
425 }
374ca955
A
426 } else {
427 /* callback(unassigned) for a BMP code point */
428 *err = U_INVALID_CHAR_FOUND;
b75a7d8f
A
429 }
430
374ca955
A
431 args->converter->fromUChar32=mySourceChar;
432 break;
b75a7d8f
A
433 }
434 }
435 else{
436 *err = U_BUFFER_OVERFLOW_ERROR;
437 break;
438 }
439 targetUniChar=missingCharMarker;
440 }
b75a7d8f
A
441
442 args->target += myTargetIndex;
443 args->source += mySourceIndex;
444 myConverterData->isTargetUCharDBCS = isTargetUCharDBCS;
445}
446
447static void
448_HZ_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
449 UConverter *cnv = args->converter;
450 UConverterDataHZ *convData=(UConverterDataHZ *) cnv->extraInfo;
451 char *p;
452 char buffer[4];
453 p = buffer;
454
455 if( convData->isTargetUCharDBCS){
456 *p++= UCNV_TILDE;
457 *p++= UCNV_CLOSE_BRACE;
458 convData->isTargetUCharDBCS=FALSE;
459 }
460 *p++= cnv->subChar[0];
461
462 ucnv_cbFromUWriteBytes(args,
463 buffer, (int32_t)(p - buffer),
464 offsetIndex, err);
465}
466
467/* structure for SafeClone calculations */
374ca955 468struct cloneHZStruct
b75a7d8f
A
469{
470 UConverter cnv;
471 UAlignedMemory deadSpace1;
472 UConverter subCnv;
473 UAlignedMemory deadSpace2;
474 UConverterDataHZ mydata;
475};
476
477
478static UConverter *
479_HZ_SafeClone(const UConverter *cnv,
480 void *stackBuffer,
481 int32_t *pBufferSize,
482 UErrorCode *status)
483{
374ca955
A
484 struct cloneHZStruct * localClone;
485 int32_t size, bufferSizeNeeded = sizeof(struct cloneHZStruct);
b75a7d8f
A
486
487 if (U_FAILURE(*status)){
488 return 0;
489 }
490
491 if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
492 *pBufferSize = bufferSizeNeeded;
493 return 0;
494 }
495
374ca955 496 localClone = (struct cloneHZStruct *)stackBuffer;
b75a7d8f 497 uprv_memcpy(&localClone->cnv, cnv, sizeof(UConverter));
b75a7d8f
A
498
499 uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataHZ));
500 localClone->cnv.extraInfo = &localClone->mydata;
501 localClone->cnv.isExtraLocal = TRUE;
502
503 /* deep-clone the sub-converter */
504 size = (int32_t)sizeof(UConverter);
505 ((UConverterDataHZ*)localClone->cnv.extraInfo)->gbConverter =
506 ucnv_safeClone(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, &localClone->subCnv, &size, status);
507
508 return &localClone->cnv;
509}
510
511static void
512_HZ_GetUnicodeSet(const UConverter *cnv,
374ca955 513 USetAdder *sa,
b75a7d8f
A
514 UConverterUnicodeSet which,
515 UErrorCode *pErrorCode) {
516 /* the tilde '~' is hardcoded in the converter */
374ca955 517 sa->add(sa->set, 0x7e);
b75a7d8f
A
518
519 /* add all of the code points that the sub-converter handles */
520 ((UConverterDataHZ*)cnv->extraInfo)->
521 gbConverter->sharedData->impl->
522 getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,
374ca955 523 sa, which, pErrorCode);
b75a7d8f
A
524}
525
526static const UConverterImpl _HZImpl={
527
528 UCNV_HZ,
529
530 NULL,
531 NULL,
532
533 _HZOpen,
534 _HZClose,
535 _HZReset,
536
537 UConverter_toUnicode_HZ_OFFSETS_LOGIC,
538 UConverter_toUnicode_HZ_OFFSETS_LOGIC,
539 UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
540 UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
541 NULL,
542
543 NULL,
544 NULL,
545 _HZ_WriteSub,
546 _HZ_SafeClone,
547 _HZ_GetUnicodeSet
548};
549
550static const UConverterStaticData _HZStaticData={
551 sizeof(UConverterStaticData),
552 "HZ",
553 0,
554 UCNV_IBM,
555 UCNV_HZ,
556 1,
557 4,
558 { 0x1a, 0, 0, 0 },
559 1,
560 FALSE,
561 FALSE,
562 0,
563 0,
564 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */
565
566};
567
568
569const UConverterSharedData _HZData={
570 sizeof(UConverterSharedData),
571 ~((uint32_t) 0),
572 NULL,
573 NULL,
574 &_HZStaticData,
575 FALSE,
576 &_HZImpl,
577 0
578};
579
580#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */