]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucnvhz.c
ICU-8.11.2.tar.gz
[apple/icu.git] / icuSources / common / ucnvhz.c
1 /*
2 **********************************************************************
3 * Copyright (C) 2000-2006, 2008 International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnvhz.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2000oct16
12 * created by: Ram Viswanadha
13 * 10/31/2000 Ram Implemented offsets logic function
14 *
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
20
21 #include "cmemory.h"
22 #include "unicode/ucnv.h"
23 #include "unicode/ucnv_cb.h"
24 #include "unicode/uset.h"
25 #include "ucnv_bld.h"
26 #include "ucnv_cnv.h"
27
28 #define UCNV_TILDE 0x7E /* ~ */
29 #define UCNV_OPEN_BRACE 0x7B /* { */
30 #define UCNV_CLOSE_BRACE 0x7D /* } */
31 #define SB_ESCAPE "\x7E\x7D"
32 #define DB_ESCAPE "\x7E\x7B"
33 #define TILDE_ESCAPE "\x7E\x7E"
34 #define ESC_LEN 2
35
36
/*
 * Append `len` bytes of the escape sequence `strToAppend` to args->target.
 *
 * Bytes that fit (targetIndex < targetLength) are stored at
 * args->target[targetIndex] and targetIndex is advanced; bytes that do not
 * fit are appended to args->converter->charErrorBuffer and *err is set to
 * U_BUFFER_OVERFLOW_ERROR.
 *
 * Side effects on the arguments: `len` is counted down (it ends at -1),
 * `strToAppend` is advanced past the copied bytes, and `targetIndex` is
 * incremented, so all three must be modifiable lvalues.
 *
 * Hidden dependency: when args->offsets is non-NULL the macro writes
 * sourceIndex-1 through a variable named `offsets` that must exist in the
 * calling scope (it is not a macro parameter) and post-increments it.
 *
 * No bounds check is performed on charErrorBuffer.
 *
 * The body is wrapped in do { } while (0) so that an invocation followed by
 * ';' is a single statement and is safe inside an unbraced if/else.
 */
#define CONCAT_ESCAPE_MACRO( args, targetIndex,targetLength,strToAppend, err, len,sourceIndex) \
    do {                                                                                        \
        while((len)-- > 0){                                                                     \
            if((targetIndex) < (targetLength)){                                                 \
                (args)->target[(targetIndex)] = (unsigned char) *(strToAppend);                 \
                if((args)->offsets!=NULL){                                                      \
                    *(offsets++) = (sourceIndex)-1;                                             \
                }                                                                               \
                (targetIndex)++;                                                                \
            }                                                                                   \
            else{                                                                               \
                (args)->converter->charErrorBuffer[(int)(args)->converter->charErrorBufferLength++] = (unsigned char) *(strToAppend); \
                *(err) =U_BUFFER_OVERFLOW_ERROR;                                                \
            }                                                                                   \
            (strToAppend)++;                                                                    \
        }                                                                                       \
    } while(0)
53
54
55 typedef struct{
56 UConverter* gbConverter;
57 int32_t targetIndex;
58 int32_t sourceIndex;
59 UBool isEscapeAppended;
60 UBool isStateDBCS;
61 UBool isTargetUCharDBCS;
62 UBool isEmptySegment;
63 }UConverterDataHZ;
64
65
66
67 static void
68 _HZOpen(UConverter *cnv, const char *name,const char *locale,uint32_t options, UErrorCode *errorCode){
69 cnv->toUnicodeStatus = 0;
70 cnv->fromUnicodeStatus= 0;
71 cnv->mode=0;
72 cnv->fromUChar32=0x0000;
73 cnv->extraInfo = uprv_malloc(sizeof(UConverterDataHZ));
74 if(cnv->extraInfo != NULL){
75 uprv_memset(cnv->extraInfo, 0, sizeof(UConverterDataHZ));
76 ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode);
77 }
78 else {
79 *errorCode = U_MEMORY_ALLOCATION_ERROR;
80 return;
81 }
82 }
83
84 static void
85 _HZClose(UConverter *cnv){
86 if(cnv->extraInfo != NULL) {
87 ucnv_close (((UConverterDataHZ *) (cnv->extraInfo))->gbConverter);
88 if(!cnv->isExtraLocal) {
89 uprv_free(cnv->extraInfo);
90 }
91 cnv->extraInfo = NULL;
92 }
93 }
94
95 static void
96 _HZReset(UConverter *cnv, UConverterResetChoice choice){
97 if(choice<=UCNV_RESET_TO_UNICODE) {
98 cnv->toUnicodeStatus = 0;
99 cnv->mode=0;
100 if(cnv->extraInfo != NULL){
101 ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
102 ((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE;
103 }
104 }
105 if(choice!=UCNV_RESET_TO_UNICODE) {
106 cnv->fromUnicodeStatus= 0;
107 cnv->fromUChar32=0x0000;
108 if(cnv->extraInfo != NULL){
109 ((UConverterDataHZ*)cnv->extraInfo)->isEscapeAppended = FALSE;
110 ((UConverterDataHZ*)cnv->extraInfo)->targetIndex = 0;
111 ((UConverterDataHZ*)cnv->extraInfo)->sourceIndex = 0;
112 ((UConverterDataHZ*)cnv->extraInfo)->isTargetUCharDBCS = FALSE;
113 }
114 }
115 }
116
117 /**************************************HZ Encoding*************************************************
118 * Rules for HZ encoding
119 *
120 * In ASCII mode, a byte is interpreted as an ASCII character, unless a
121 * '~' is encountered. The character '~' is an escape character. By
122 * convention, it must be immediately followed ONLY by '~', '{' or '\n'
123 * (<LF>), with the following special meaning.
124
125 * 1. The escape sequence '~~' is interpreted as a '~'.
126 * 2. The escape-to-GB sequence '~{' switches the mode from ASCII to GB.
127 * 3. The escape sequence '~\n' is a line-continuation marker to be
128 * consumed with no output produced.
129 * In GB mode, characters are interpreted two bytes at a time as (pure)
130 * GB codes until the escape-from-GB code '~}' is read. This code
131 * switches the mode from GB back to ASCII. (Note that the escape-
132 * from-GB code '~}' ($7E7D) is outside the defined GB range.)
133 *
134 * Source: RFC 1843
135 */
136
137
138 static void
139 UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
140 UErrorCode* err){
141 char tempBuf[2];
142 const char *mySource = ( char *) args->source;
143 UChar *myTarget = args->target;
144 const char *mySourceLimit = args->sourceLimit;
145 UChar32 targetUniChar = 0x0000;
146 UChar mySourceChar = 0x0000;
147 UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);
148 tempBuf[0]=0;
149 tempBuf[1]=0;
150 if ((args->converter == NULL) || (args->targetLimit < args->target) || (mySourceLimit < args->source)){
151 *err = U_ILLEGAL_ARGUMENT_ERROR;
152 return;
153 }
154
155 while(mySource< mySourceLimit){
156
157 if(myTarget < args->targetLimit){
158
159 mySourceChar= (unsigned char) *mySource++;
160
161 switch(mySourceChar){
162 case 0x0A:
163 if(args->converter->mode ==UCNV_TILDE){
164 args->converter->mode=0;
165
166 }
167 *(myTarget++)=(UChar)mySourceChar;
168 myData->isEmptySegment = FALSE;
169 continue;
170
171 case UCNV_TILDE:
172 if(args->converter->mode ==UCNV_TILDE){
173 *(myTarget++)=(UChar)mySourceChar;
174 args->converter->mode=0;
175 myData->isEmptySegment = FALSE;
176 continue;
177
178 }
179 else if(args->converter->toUnicodeStatus !=0){
180 args->converter->mode=0;
181 break;
182 }
183 else{
184 args->converter->mode = UCNV_TILDE;
185 continue;
186 }
187
188
189 case UCNV_OPEN_BRACE:
190 if(args->converter->mode == UCNV_TILDE){
191 args->converter->mode=0;
192 myData->isStateDBCS = TRUE;
193 myData->isEmptySegment = TRUE;
194 continue;
195 }
196 else{
197 break;
198 }
199
200
201 case UCNV_CLOSE_BRACE:
202 if(args->converter->mode == UCNV_TILDE){
203 args->converter->mode=0;
204 myData->isStateDBCS = FALSE;
205 if (myData->isEmptySegment) {
206 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
207 *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
208 args->converter->toUBytes[0] = UCNV_TILDE;
209 args->converter->toUBytes[1] = mySourceChar;
210 args->converter->toULength = 2;
211 goto EXIT;
212 }
213 myData->isEmptySegment = TRUE;
214 continue;
215 }
216 else{
217 break;
218 }
219
220 default:
221 /* if the first byte is equal to TILDE and the trail byte
222 * is not a valid byte then it is an error condition
223 */
224 if(args->converter->mode == UCNV_TILDE){
225 args->converter->mode=0;
226 mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
227 myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
228 goto SAVE_STATE;
229 }
230
231 break;
232
233 }
234
235 myData->isEmptySegment = FALSE; /* the segment has something, either valid or will produce a different error, so reset this */
236 if(myData->isStateDBCS){
237 if(args->converter->toUnicodeStatus == 0x00){
238 args->converter->toUnicodeStatus = (UChar) mySourceChar;
239 continue;
240 }
241 else{
242 tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ;
243 tempBuf[1] = (char) (mySourceChar+0x80);
244 mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
245 args->converter->toUnicodeStatus =0x00;
246 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
247 tempBuf, 2, args->converter->useFallback);
248 }
249 }
250 else{
251 if(args->converter->fromUnicodeStatus == 0x00){
252 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
253 mySource - 1, 1, args->converter->useFallback);
254 }
255 else{
256 goto SAVE_STATE;
257 }
258
259 }
260 if(targetUniChar < 0xfffe){
261 if(args->offsets) {
262 args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 1-(myData->isStateDBCS));
263 }
264
265 *(myTarget++)=(UChar)targetUniChar;
266 }
267 else if(targetUniChar>=0xfffe){
268 SAVE_STATE:
269 if(targetUniChar == 0xfffe){
270 *err = U_INVALID_CHAR_FOUND;
271 }
272 else{
273 *err = U_ILLEGAL_CHAR_FOUND;
274 }
275 if(myData->isStateDBCS){
276 /* this should never occur since isStateDBCS is set to true
277 * only after tempBuf[0] and tempBuf[1]
278 * are set to the input .. just to please BEAM
279 */
280 if(tempBuf[0]==0 || tempBuf[1]==0){
281 *err = U_INTERNAL_PROGRAM_ERROR;
282 }else{
283 args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80);
284 args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80);
285 args->converter->toULength=2;
286 }
287 }
288 else{
289 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
290 args->converter->toULength=1;
291 }
292 break;
293 }
294 }
295 else{
296 *err =U_BUFFER_OVERFLOW_ERROR;
297 break;
298 }
299 }
300 EXIT:
301 args->target = myTarget;
302 args->source = mySource;
303 }
304
305
306 static void
307 UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
308 UErrorCode * err){
309 const UChar *mySource = args->source;
310 char *myTarget = args->target;
311 int32_t* offsets = args->offsets;
312 int32_t mySourceIndex = 0;
313 int32_t myTargetIndex = 0;
314 int32_t targetLength = (int32_t)(args->targetLimit - myTarget);
315 int32_t mySourceLength = (int32_t)(args->sourceLimit - args->source);
316 int32_t length=0;
317 uint32_t targetUniChar = 0x0000;
318 UChar32 mySourceChar = 0x0000;
319 UConverterDataHZ *myConverterData=(UConverterDataHZ*)args->converter->extraInfo;
320 UBool isTargetUCharDBCS = (UBool) myConverterData->isTargetUCharDBCS;
321 UBool oldIsTargetUCharDBCS = isTargetUCharDBCS;
322 int len =0;
323 const char* escSeq=NULL;
324
325 if ((args->converter == NULL) || (args->targetLimit < myTarget) || (args->sourceLimit < args->source)){
326 *err = U_ILLEGAL_ARGUMENT_ERROR;
327 return;
328 }
329 if(args->converter->fromUChar32!=0 && myTargetIndex < targetLength) {
330 goto getTrail;
331 }
332 /*writing the char to the output stream */
333 while (mySourceIndex < mySourceLength){
334 targetUniChar = missingCharMarker;
335 if (myTargetIndex < targetLength){
336
337 mySourceChar = (UChar) mySource[mySourceIndex++];
338
339
340 oldIsTargetUCharDBCS = isTargetUCharDBCS;
341 if(mySourceChar ==UCNV_TILDE){
342 /*concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex);*/
343 len = ESC_LEN;
344 escSeq = TILDE_ESCAPE;
345 CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
346 continue;
347 }
348 else{
349 length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData,
350 mySourceChar,&targetUniChar,args->converter->useFallback);
351
352 }
353 /* only DBCS or SBCS characters are expected*/
354 /* DB haracters with high bit set to 1 are expected */
355 if(length > 2 || length==0 ||(((targetUniChar & 0x8080) != 0x8080)&& length==2)){
356 targetUniChar= missingCharMarker;
357 }
358 if (targetUniChar != missingCharMarker){
359 myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF);
360 if(oldIsTargetUCharDBCS != isTargetUCharDBCS || !myConverterData->isEscapeAppended ){
361 /*Shifting from a double byte to single byte mode*/
362 if(!isTargetUCharDBCS){
363 len =ESC_LEN;
364 escSeq = SB_ESCAPE;
365 CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
366 myConverterData->isEscapeAppended = TRUE;
367 }
368 else{ /* Shifting from a single byte to double byte mode*/
369 len =ESC_LEN;
370 escSeq = DB_ESCAPE;
371 CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
372 myConverterData->isEscapeAppended = TRUE;
373
374 }
375 }
376
377 if(isTargetUCharDBCS){
378 if( myTargetIndex <targetLength){
379 myTarget[myTargetIndex++] =(char) ((targetUniChar >> 8) -0x80);
380 if(offsets){
381 *(offsets++) = mySourceIndex-1;
382 }
383 if(myTargetIndex < targetLength){
384 myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80);
385 if(offsets){
386 *(offsets++) = mySourceIndex-1;
387 }
388 }else{
389 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
390 *err = U_BUFFER_OVERFLOW_ERROR;
391 }
392 }else{
393 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80);
394 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
395 *err = U_BUFFER_OVERFLOW_ERROR;
396 }
397
398 }else{
399 if( myTargetIndex <targetLength){
400 myTarget[myTargetIndex++] = (char) (targetUniChar );
401 if(offsets){
402 *(offsets++) = mySourceIndex-1;
403 }
404
405 }else{
406 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
407 *err = U_BUFFER_OVERFLOW_ERROR;
408 }
409 }
410
411 }
412 else{
413 /* oops.. the code point is unassigned */
414 /*Handle surrogates */
415 /*check if the char is a First surrogate*/
416 if(UTF_IS_SURROGATE(mySourceChar)) {
417 if(UTF_IS_SURROGATE_FIRST(mySourceChar)) {
418 args->converter->fromUChar32=mySourceChar;
419 getTrail:
420 /*look ahead to find the trail surrogate*/
421 if(mySourceIndex < mySourceLength) {
422 /* test the following code unit */
423 UChar trail=(UChar) args->source[mySourceIndex];
424 if(UTF_IS_SECOND_SURROGATE(trail)) {
425 ++mySourceIndex;
426 mySourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUChar32, trail);
427 args->converter->fromUChar32=0x00;
428 /* there are no surrogates in GB2312*/
429 *err = U_INVALID_CHAR_FOUND;
430 /* exit this condition tree */
431 } else {
432 /* this is an unmatched lead code unit (1st surrogate) */
433 /* callback(illegal) */
434 *err=U_ILLEGAL_CHAR_FOUND;
435 }
436 } else {
437 /* no more input */
438 *err = U_ZERO_ERROR;
439 }
440 } else {
441 /* this is an unmatched trail code unit (2nd surrogate) */
442 /* callback(illegal) */
443 *err=U_ILLEGAL_CHAR_FOUND;
444 }
445 } else {
446 /* callback(unassigned) for a BMP code point */
447 *err = U_INVALID_CHAR_FOUND;
448 }
449
450 args->converter->fromUChar32=mySourceChar;
451 break;
452 }
453 }
454 else{
455 *err = U_BUFFER_OVERFLOW_ERROR;
456 break;
457 }
458 targetUniChar=missingCharMarker;
459 }
460
461 args->target += myTargetIndex;
462 args->source += mySourceIndex;
463 myConverterData->isTargetUCharDBCS = isTargetUCharDBCS;
464 }
465
466 static void
467 _HZ_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
468 UConverter *cnv = args->converter;
469 UConverterDataHZ *convData=(UConverterDataHZ *) cnv->extraInfo;
470 char *p;
471 char buffer[4];
472 p = buffer;
473
474 if( convData->isTargetUCharDBCS){
475 *p++= UCNV_TILDE;
476 *p++= UCNV_CLOSE_BRACE;
477 convData->isTargetUCharDBCS=FALSE;
478 }
479 *p++= (char)cnv->subChars[0];
480
481 ucnv_cbFromUWriteBytes(args,
482 buffer, (int32_t)(p - buffer),
483 offsetIndex, err);
484 }
485
486 /*
487 * Structure for cloning an HZ converter into a single memory block.
488 * ucnv_safeClone() of the HZ converter will align the entire cloneHZStruct,
489 * and then ucnv_safeClone() of the sub-converter may additionally align
490 * subCnv inside the cloneHZStruct, for which we need the deadSpace after
491 * subCnv. This is because UAlignedMemory may be larger than the actually
492 * necessary alignment size for the platform.
493 * The other cloneHZStruct fields will not be moved around,
494 * and are aligned properly with cloneHZStruct's alignment.
495 */
496 struct cloneHZStruct
497 {
498 UConverter cnv;
499 UConverter subCnv;
500 UAlignedMemory deadSpace;
501 UConverterDataHZ mydata;
502 };
503
504
505 static UConverter *
506 _HZ_SafeClone(const UConverter *cnv,
507 void *stackBuffer,
508 int32_t *pBufferSize,
509 UErrorCode *status)
510 {
511 struct cloneHZStruct * localClone;
512 int32_t size, bufferSizeNeeded = sizeof(struct cloneHZStruct);
513
514 if (U_FAILURE(*status)){
515 return 0;
516 }
517
518 if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
519 *pBufferSize = bufferSizeNeeded;
520 return 0;
521 }
522
523 localClone = (struct cloneHZStruct *)stackBuffer;
524 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
525
526 uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataHZ));
527 localClone->cnv.extraInfo = &localClone->mydata;
528 localClone->cnv.isExtraLocal = TRUE;
529
530 /* deep-clone the sub-converter */
531 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
532 ((UConverterDataHZ*)localClone->cnv.extraInfo)->gbConverter =
533 ucnv_safeClone(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, &localClone->subCnv, &size, status);
534
535 return &localClone->cnv;
536 }
537
538 static void
539 _HZ_GetUnicodeSet(const UConverter *cnv,
540 const USetAdder *sa,
541 UConverterUnicodeSet which,
542 UErrorCode *pErrorCode) {
543 /* the tilde '~' is hardcoded in the converter */
544 sa->add(sa->set, 0x7e);
545
546 /* add all of the code points that the sub-converter handles */
547 ((UConverterDataHZ*)cnv->extraInfo)->
548 gbConverter->sharedData->impl->
549 getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,
550 sa, which, pErrorCode);
551 }
552
553 static const UConverterImpl _HZImpl={
554
555 UCNV_HZ,
556
557 NULL,
558 NULL,
559
560 _HZOpen,
561 _HZClose,
562 _HZReset,
563
564 UConverter_toUnicode_HZ_OFFSETS_LOGIC,
565 UConverter_toUnicode_HZ_OFFSETS_LOGIC,
566 UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
567 UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
568 NULL,
569
570 NULL,
571 NULL,
572 _HZ_WriteSub,
573 _HZ_SafeClone,
574 _HZ_GetUnicodeSet
575 };
576
577 static const UConverterStaticData _HZStaticData={
578 sizeof(UConverterStaticData),
579 "HZ",
580 0,
581 UCNV_IBM,
582 UCNV_HZ,
583 1,
584 4,
585 { 0x1a, 0, 0, 0 },
586 1,
587 FALSE,
588 FALSE,
589 0,
590 0,
591 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */
592
593 };
594
595
596 const UConverterSharedData _HZData={
597 sizeof(UConverterSharedData),
598 ~((uint32_t) 0),
599 NULL,
600 NULL,
601 &_HZStaticData,
602 FALSE,
603 &_HZImpl,
604 0
605 };
606
607 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */