]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucnvscsu.c
ICU-551.30.tar.gz
[apple/icu.git] / icuSources / common / ucnvscsu.c
1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2000-2015, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: ucnvscsu.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2000nov18
14 * created by: Markus W. Scherer
15 *
16 * This is an implementation of the Standard Compression Scheme for Unicode
17 * as defined in http://www.unicode.org/unicode/reports/tr6/ .
18 * Reserved commands and window settings are treated as illegal sequences and
19 * will result in callback calls.
20 */
21
22 #include "unicode/utypes.h"
23
24 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
25
26 #include "unicode/ucnv.h"
27 #include "unicode/ucnv_cb.h"
28 #include "unicode/utf16.h"
29 #include "ucnv_bld.h"
30 #include "ucnv_cnv.h"
31 #include "cmemory.h"
32
33 /* SCSU definitions --------------------------------------------------------- */
34
/*
 * SCSU command (tag) byte values.
 * S* tags are recognized while decoding in single-byte mode,
 * U* tags while decoding in Unicode mode.
 * For the numbered tags only the first (…0) and last (…7) value of each
 * range is named; the window number is the distance from the range start.
 */
enum {
    /* ---- single-byte mode ---- */
    SQ0=0x01,   /* Quote from window pair 0 */
    SQ7=0x08,   /* Quote from window pair 7 */
    SDX=0x0B,   /* Define a window as extended */
    Srs=0x0C,   /* reserved */
    SQU=0x0E,   /* Quote a single Unicode character */
    SCU=0x0F,   /* Change to Unicode mode */
    SC0=0x10,   /* Select window 0 */
    SC7=0x17,   /* Select window 7 */
    SD0=0x18,   /* Define and select window 0 */
    SD7=0x1F,   /* Define and select window 7 */

    /* ---- Unicode mode ---- */
    UC0=0xE0,   /* Select window 0 */
    UC7=0xE7,   /* Select window 7 */
    UD0=0xE8,   /* Define and select window 0 */
    UD7=0xEF,   /* Define and select window 7 */
    UQU=0xF0,   /* Quote a single Unicode character */
    UDX=0xF1,   /* Define a window as extended */
    Urs=0xF2    /* reserved */
};
56
/* constants for interpreting the one-byte window offset argument of SDn/UDn */
enum {
    /*
     * Unicode code points from 3400 to E000 are not addressable by
     * dynamic window, since in these areas no short run alphabets are
     * found. Therefore add gapOffset to all values from gapThreshold.
     */
    gapThreshold   = 0x68,
    gapOffset      = 0xAC00,

    /* values between reservedStart and fixedThreshold are reserved */
    reservedStart  = 0xA8,

    /* use table of predefined fixed offsets for values from fixedThreshold */
    fixedThreshold = 0xF9
};
72
/*
 * Start offsets of the 8 static windows.
 * These never change; the decoder adds a quoted byte below 0x80 to
 * staticOffsets[quote window].
 */
static const uint32_t staticOffsets[8]={
    /* 0 */ 0x0000,     /* ASCII for quoted tags */
    /* 1 */ 0x0080,     /* Latin - 1 Supplement (for access to punctuation) */
    /* 2 */ 0x0100,     /* Latin Extended-A */
    /* 3 */ 0x0300,     /* Combining Diacritical Marks */
    /* 4 */ 0x2000,     /* General Punctuation */
    /* 5 */ 0x2080,     /* Currency Symbols */
    /* 6 */ 0x2100,     /* Letterlike Symbols and Number Forms */
    /* 7 */ 0x3000      /* CJK Symbols and punctuation */
};
84
/*
 * Start offsets that the 8 dynamic (sliding) windows have after a reset.
 * Copied into both the toUnicode and the fromUnicode window tables.
 */
static const uint32_t initialDynamicOffsets[8]={
    /* 0 */ 0x0080,     /* Latin-1 */
    /* 1 */ 0x00C0,     /* Latin Extended A */
    /* 2 */ 0x0400,     /* Cyrillic */
    /* 3 */ 0x0600,     /* Arabic */
    /* 4 */ 0x0900,     /* Devanagari */
    /* 5 */ 0x3040,     /* Hiragana */
    /* 6 */ 0x30A0,     /* Katakana */
    /* 7 */ 0xFF00      /* Fullwidth ASCII */
};
96
/*
 * Predefined window offsets, selected by window offset bytes
 * fixedThreshold (0xF9) through 0xFF; index = offset byte - fixedThreshold.
 */
static const uint32_t fixedOffsets[]={
    0x00C0,     /* 0xF9: Latin-1 Letters + half of Latin Extended A */
    0x0250,     /* 0xFA: IPA extensions */
    0x0370,     /* 0xFB: Greek */
    0x0530,     /* 0xFC: Armenian */
    0x3040,     /* 0xFD: Hiragana */
    0x30A0,     /* 0xFE: Katakana */
    0xFF60      /* 0xFF: Halfwidth Katakana */
};
107
/*
 * Decoder state machine states: what the next input byte means.
 * readCommand is the idle state between complete sequences; all other
 * states mean that a multi-byte sequence is in progress.
 */
enum {
    readCommand   = 0,  /* expect a command or character byte */
    quotePairOne  = 1,  /* expect the first byte of a quoted 16-bit unit */
    quotePairTwo  = 2,  /* expect the second byte of a 16-bit unit */
    quoteOne      = 3,  /* expect the single byte quoted from a window */
    definePairOne = 4,  /* expect the first argument byte of an extended window definition */
    definePairTwo = 5,  /* expect the second argument byte of an extended window definition */
    defineOne     = 6   /* expect the window offset byte of a window definition */
};
118
119 typedef struct SCSUData {
120 /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */
121 uint32_t toUDynamicOffsets[8];
122 uint32_t fromUDynamicOffsets[8];
123
124 /* state machine state - toUnicode */
125 UBool toUIsSingleByteMode;
126 uint8_t toUState;
127 int8_t toUQuoteWindow, toUDynamicWindow;
128 uint8_t toUByteOne;
129 uint8_t toUPadding[3];
130
131 /* state machine state - fromUnicode */
132 UBool fromUIsSingleByteMode;
133 int8_t fromUDynamicWindow;
134
135 /*
136 * windowUse[] keeps track of the use of the dynamic windows:
137 * At nextWindowUseIndex there is the least recently used window,
138 * and the following windows (in a wrapping manner) are more and more
139 * recently used.
140 * At nextWindowUseIndex-1 there is the most recently used window.
141 */
142 uint8_t locale;
143 int8_t nextWindowUseIndex;
144 int8_t windowUse[8];
145 } SCSUData;
146
/*
 * Initial contents of SCSUData.windowUse[] (dynamic window numbers ordered
 * from least to most recently used): the generic order and the order used
 * for the "ja" locale.
 */
static const int8_t initialWindowUse[8]=
    { 7, 0, 3, 2, 4, 5, 6, 1 };
static const int8_t initialWindowUse_ja[8]=
    { 3, 2, 4, 1, 0, 7, 5, 6 };
149
/* values for SCSUData.locale */
enum {
    lGeneric=0, /* any locale other than Japanese */
    l_ja=1      /* locale "ja" or "ja_..." */
};
153
154 /* SCSU setup functions ----------------------------------------------------- */
155
156 static void
157 _SCSUReset(UConverter *cnv, UConverterResetChoice choice) {
158 SCSUData *scsu=(SCSUData *)cnv->extraInfo;
159
160 if(choice<=UCNV_RESET_TO_UNICODE) {
161 /* reset toUnicode */
162 uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32);
163
164 scsu->toUIsSingleByteMode=TRUE;
165 scsu->toUState=readCommand;
166 scsu->toUQuoteWindow=scsu->toUDynamicWindow=0;
167 scsu->toUByteOne=0;
168
169 cnv->toULength=0;
170 }
171 if(choice!=UCNV_RESET_TO_UNICODE) {
172 /* reset fromUnicode */
173 uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32);
174
175 scsu->fromUIsSingleByteMode=TRUE;
176 scsu->fromUDynamicWindow=0;
177
178 scsu->nextWindowUseIndex=0;
179 switch(scsu->locale) {
180 case l_ja:
181 uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8);
182 break;
183 default:
184 uprv_memcpy(scsu->windowUse, initialWindowUse, 8);
185 break;
186 }
187
188 cnv->fromUChar32=0;
189 }
190 }
191
192 static void
193 _SCSUOpen(UConverter *cnv,
194 UConverterLoadArgs *pArgs,
195 UErrorCode *pErrorCode) {
196 const char *locale=pArgs->locale;
197 if(pArgs->onlyTestIsLoadable) {
198 return;
199 }
200 cnv->extraInfo=uprv_malloc(sizeof(SCSUData));
201 if(cnv->extraInfo!=NULL) {
202 if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) {
203 ((SCSUData *)cnv->extraInfo)->locale=l_ja;
204 } else {
205 ((SCSUData *)cnv->extraInfo)->locale=lGeneric;
206 }
207 _SCSUReset(cnv, UCNV_RESET_BOTH);
208 } else {
209 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
210 }
211
212 /* Set the substitution character U+fffd as a Unicode string. */
213 cnv->subUChars[0]=0xfffd;
214 cnv->subCharLen=-1;
215 }
216
217 static void
218 _SCSUClose(UConverter *cnv) {
219 if(cnv->extraInfo!=NULL) {
220 if(!cnv->isExtraLocal) {
221 uprv_free(cnv->extraInfo);
222 }
223 cnv->extraInfo=NULL;
224 }
225 }
226
227 /* SCSU-to-Unicode conversion functions ------------------------------------- */
228
229 static void
230 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
231 UErrorCode *pErrorCode) {
232 UConverter *cnv;
233 SCSUData *scsu;
234 const uint8_t *source, *sourceLimit;
235 UChar *target;
236 const UChar *targetLimit;
237 int32_t *offsets;
238 UBool isSingleByteMode;
239 uint8_t state, byteOne;
240 int8_t quoteWindow, dynamicWindow;
241
242 int32_t sourceIndex, nextSourceIndex;
243
244 uint8_t b;
245
246 /* set up the local pointers */
247 cnv=pArgs->converter;
248 scsu=(SCSUData *)cnv->extraInfo;
249
250 source=(const uint8_t *)pArgs->source;
251 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
252 target=pArgs->target;
253 targetLimit=pArgs->targetLimit;
254 offsets=pArgs->offsets;
255
256 /* get the state machine state */
257 isSingleByteMode=scsu->toUIsSingleByteMode;
258 state=scsu->toUState;
259 quoteWindow=scsu->toUQuoteWindow;
260 dynamicWindow=scsu->toUDynamicWindow;
261 byteOne=scsu->toUByteOne;
262
263 /* sourceIndex=-1 if the current character began in the previous buffer */
264 sourceIndex=state==readCommand ? 0 : -1;
265 nextSourceIndex=0;
266
267 /*
268 * conversion "loop"
269 *
270 * For performance, this is not a normal C loop.
271 * Instead, there are two code blocks for the two SCSU modes.
272 * The function branches to either one, and a change of the mode is done with a goto to
273 * the other branch.
274 *
275 * Each branch has two conventional loops:
276 * - a fast-path loop for the most common codes in the mode
277 * - a loop for all other codes in the mode
278 * When the fast-path runs into a code that it cannot handle, its loop ends and it
279 * runs into the following loop to handle the other codes.
280 * The end of the input or output buffer is also handled by the slower loop.
281 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
282 *
283 * The callback handling is done by returning with an error code.
284 * The conversion framework actually calls the callback function.
285 */
286 if(isSingleByteMode) {
287 /* fast path for single-byte mode */
288 if(state==readCommand) {
289 fastSingle:
290 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
291 ++source;
292 ++nextSourceIndex;
293 if(b<=0x7f) {
294 /* write US-ASCII graphic character or DEL */
295 *target++=(UChar)b;
296 if(offsets!=NULL) {
297 *offsets++=sourceIndex;
298 }
299 } else {
300 /* write from dynamic window */
301 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
302 if(c<=0xffff) {
303 *target++=(UChar)c;
304 if(offsets!=NULL) {
305 *offsets++=sourceIndex;
306 }
307 } else {
308 /* output surrogate pair */
309 *target++=(UChar)(0xd7c0+(c>>10));
310 if(target<targetLimit) {
311 *target++=(UChar)(0xdc00|(c&0x3ff));
312 if(offsets!=NULL) {
313 *offsets++=sourceIndex;
314 *offsets++=sourceIndex;
315 }
316 } else {
317 /* target overflow */
318 if(offsets!=NULL) {
319 *offsets++=sourceIndex;
320 }
321 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
322 cnv->UCharErrorBufferLength=1;
323 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
324 goto endloop;
325 }
326 }
327 }
328 sourceIndex=nextSourceIndex;
329 }
330 }
331
332 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
333 singleByteMode:
334 while(source<sourceLimit) {
335 if(target>=targetLimit) {
336 /* target is full */
337 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
338 break;
339 }
340 b=*source++;
341 ++nextSourceIndex;
342 switch(state) {
343 case readCommand:
344 /* redundant conditions are commented out */
345 /* here: b<0x20 because otherwise we would be in fastSingle */
346 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
347 /* CR/LF/TAB/NUL */
348 *target++=(UChar)b;
349 if(offsets!=NULL) {
350 *offsets++=sourceIndex;
351 }
352 sourceIndex=nextSourceIndex;
353 goto fastSingle;
354 } else if(SC0<=b) {
355 if(b<=SC7) {
356 dynamicWindow=(int8_t)(b-SC0);
357 sourceIndex=nextSourceIndex;
358 goto fastSingle;
359 } else /* if(SD0<=b && b<=SD7) */ {
360 dynamicWindow=(int8_t)(b-SD0);
361 state=defineOne;
362 }
363 } else if(/* SQ0<=b && */ b<=SQ7) {
364 quoteWindow=(int8_t)(b-SQ0);
365 state=quoteOne;
366 } else if(b==SDX) {
367 state=definePairOne;
368 } else if(b==SQU) {
369 state=quotePairOne;
370 } else if(b==SCU) {
371 sourceIndex=nextSourceIndex;
372 isSingleByteMode=FALSE;
373 goto fastUnicode;
374 } else /* Srs */ {
375 /* callback(illegal) */
376 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
377 cnv->toUBytes[0]=b;
378 cnv->toULength=1;
379 goto endloop;
380 }
381
382 /* store the first byte of a multibyte sequence in toUBytes[] */
383 cnv->toUBytes[0]=b;
384 cnv->toULength=1;
385 break;
386 case quotePairOne:
387 byteOne=b;
388 cnv->toUBytes[1]=b;
389 cnv->toULength=2;
390 state=quotePairTwo;
391 break;
392 case quotePairTwo:
393 *target++=(UChar)((byteOne<<8)|b);
394 if(offsets!=NULL) {
395 *offsets++=sourceIndex;
396 }
397 sourceIndex=nextSourceIndex;
398 state=readCommand;
399 goto fastSingle;
400 case quoteOne:
401 if(b<0x80) {
402 /* all static offsets are in the BMP */
403 *target++=(UChar)(staticOffsets[quoteWindow]+b);
404 if(offsets!=NULL) {
405 *offsets++=sourceIndex;
406 }
407 } else {
408 /* write from dynamic window */
409 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
410 if(c<=0xffff) {
411 *target++=(UChar)c;
412 if(offsets!=NULL) {
413 *offsets++=sourceIndex;
414 }
415 } else {
416 /* output surrogate pair */
417 *target++=(UChar)(0xd7c0+(c>>10));
418 if(target<targetLimit) {
419 *target++=(UChar)(0xdc00|(c&0x3ff));
420 if(offsets!=NULL) {
421 *offsets++=sourceIndex;
422 *offsets++=sourceIndex;
423 }
424 } else {
425 /* target overflow */
426 if(offsets!=NULL) {
427 *offsets++=sourceIndex;
428 }
429 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
430 cnv->UCharErrorBufferLength=1;
431 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
432 goto endloop;
433 }
434 }
435 }
436 sourceIndex=nextSourceIndex;
437 state=readCommand;
438 goto fastSingle;
439 case definePairOne:
440 dynamicWindow=(int8_t)((b>>5)&7);
441 byteOne=(uint8_t)(b&0x1f);
442 cnv->toUBytes[1]=b;
443 cnv->toULength=2;
444 state=definePairTwo;
445 break;
446 case definePairTwo:
447 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
448 sourceIndex=nextSourceIndex;
449 state=readCommand;
450 goto fastSingle;
451 case defineOne:
452 if(b==0) {
453 /* callback(illegal): Reserved window offset value 0 */
454 cnv->toUBytes[1]=b;
455 cnv->toULength=2;
456 goto endloop;
457 } else if(b<gapThreshold) {
458 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
459 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
460 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
461 } else if(b>=fixedThreshold) {
462 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
463 } else {
464 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
465 cnv->toUBytes[1]=b;
466 cnv->toULength=2;
467 goto endloop;
468 }
469 sourceIndex=nextSourceIndex;
470 state=readCommand;
471 goto fastSingle;
472 }
473 }
474 } else {
475 /* fast path for Unicode mode */
476 if(state==readCommand) {
477 fastUnicode:
478 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
479 *target++=(UChar)((b<<8)|source[1]);
480 if(offsets!=NULL) {
481 *offsets++=sourceIndex;
482 }
483 sourceIndex=nextSourceIndex;
484 nextSourceIndex+=2;
485 source+=2;
486 }
487 }
488
489 /* normal state machine for Unicode mode */
490 /* unicodeByteMode: */
491 while(source<sourceLimit) {
492 if(target>=targetLimit) {
493 /* target is full */
494 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
495 break;
496 }
497 b=*source++;
498 ++nextSourceIndex;
499 switch(state) {
500 case readCommand:
501 if((uint8_t)(b-UC0)>(Urs-UC0)) {
502 byteOne=b;
503 cnv->toUBytes[0]=b;
504 cnv->toULength=1;
505 state=quotePairTwo;
506 } else if(/* UC0<=b && */ b<=UC7) {
507 dynamicWindow=(int8_t)(b-UC0);
508 sourceIndex=nextSourceIndex;
509 isSingleByteMode=TRUE;
510 goto fastSingle;
511 } else if(/* UD0<=b && */ b<=UD7) {
512 dynamicWindow=(int8_t)(b-UD0);
513 isSingleByteMode=TRUE;
514 cnv->toUBytes[0]=b;
515 cnv->toULength=1;
516 state=defineOne;
517 goto singleByteMode;
518 } else if(b==UDX) {
519 isSingleByteMode=TRUE;
520 cnv->toUBytes[0]=b;
521 cnv->toULength=1;
522 state=definePairOne;
523 goto singleByteMode;
524 } else if(b==UQU) {
525 cnv->toUBytes[0]=b;
526 cnv->toULength=1;
527 state=quotePairOne;
528 } else /* Urs */ {
529 /* callback(illegal) */
530 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
531 cnv->toUBytes[0]=b;
532 cnv->toULength=1;
533 goto endloop;
534 }
535 break;
536 case quotePairOne:
537 byteOne=b;
538 cnv->toUBytes[1]=b;
539 cnv->toULength=2;
540 state=quotePairTwo;
541 break;
542 case quotePairTwo:
543 *target++=(UChar)((byteOne<<8)|b);
544 if(offsets!=NULL) {
545 *offsets++=sourceIndex;
546 }
547 sourceIndex=nextSourceIndex;
548 state=readCommand;
549 goto fastUnicode;
550 }
551 }
552 }
553 endloop:
554
555 /* set the converter state back into UConverter */
556 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
557 /* reset to deal with the next character */
558 state=readCommand;
559 } else if(state==readCommand) {
560 /* not in a multi-byte sequence, reset toULength */
561 cnv->toULength=0;
562 }
563 scsu->toUIsSingleByteMode=isSingleByteMode;
564 scsu->toUState=state;
565 scsu->toUQuoteWindow=quoteWindow;
566 scsu->toUDynamicWindow=dynamicWindow;
567 scsu->toUByteOne=byteOne;
568
569 /* write back the updated pointers */
570 pArgs->source=(const char *)source;
571 pArgs->target=target;
572 pArgs->offsets=offsets;
573 return;
574 }
575
576 /*
577 * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
578 * If a change is made in the original function, then either
579 * change this function the same way or
580 * re-copy the original function and remove the variables
581 * offsets, sourceIndex, and nextSourceIndex.
582 */
583 static void
584 _SCSUToUnicode(UConverterToUnicodeArgs *pArgs,
585 UErrorCode *pErrorCode) {
586 UConverter *cnv;
587 SCSUData *scsu;
588 const uint8_t *source, *sourceLimit;
589 UChar *target;
590 const UChar *targetLimit;
591 UBool isSingleByteMode;
592 uint8_t state, byteOne;
593 int8_t quoteWindow, dynamicWindow;
594
595 uint8_t b;
596
597 /* set up the local pointers */
598 cnv=pArgs->converter;
599 scsu=(SCSUData *)cnv->extraInfo;
600
601 source=(const uint8_t *)pArgs->source;
602 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
603 target=pArgs->target;
604 targetLimit=pArgs->targetLimit;
605
606 /* get the state machine state */
607 isSingleByteMode=scsu->toUIsSingleByteMode;
608 state=scsu->toUState;
609 quoteWindow=scsu->toUQuoteWindow;
610 dynamicWindow=scsu->toUDynamicWindow;
611 byteOne=scsu->toUByteOne;
612
613 /*
614 * conversion "loop"
615 *
616 * For performance, this is not a normal C loop.
617 * Instead, there are two code blocks for the two SCSU modes.
618 * The function branches to either one, and a change of the mode is done with a goto to
619 * the other branch.
620 *
621 * Each branch has two conventional loops:
622 * - a fast-path loop for the most common codes in the mode
623 * - a loop for all other codes in the mode
624 * When the fast-path runs into a code that it cannot handle, its loop ends and it
625 * runs into the following loop to handle the other codes.
626 * The end of the input or output buffer is also handled by the slower loop.
627 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
628 *
629 * The callback handling is done by returning with an error code.
630 * The conversion framework actually calls the callback function.
631 */
632 if(isSingleByteMode) {
633 /* fast path for single-byte mode */
634 if(state==readCommand) {
635 fastSingle:
636 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
637 ++source;
638 if(b<=0x7f) {
639 /* write US-ASCII graphic character or DEL */
640 *target++=(UChar)b;
641 } else {
642 /* write from dynamic window */
643 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
644 if(c<=0xffff) {
645 *target++=(UChar)c;
646 } else {
647 /* output surrogate pair */
648 *target++=(UChar)(0xd7c0+(c>>10));
649 if(target<targetLimit) {
650 *target++=(UChar)(0xdc00|(c&0x3ff));
651 } else {
652 /* target overflow */
653 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
654 cnv->UCharErrorBufferLength=1;
655 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
656 goto endloop;
657 }
658 }
659 }
660 }
661 }
662
663 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
664 singleByteMode:
665 while(source<sourceLimit) {
666 if(target>=targetLimit) {
667 /* target is full */
668 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
669 break;
670 }
671 b=*source++;
672 switch(state) {
673 case readCommand:
674 /* redundant conditions are commented out */
675 /* here: b<0x20 because otherwise we would be in fastSingle */
676 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
677 /* CR/LF/TAB/NUL */
678 *target++=(UChar)b;
679 goto fastSingle;
680 } else if(SC0<=b) {
681 if(b<=SC7) {
682 dynamicWindow=(int8_t)(b-SC0);
683 goto fastSingle;
684 } else /* if(SD0<=b && b<=SD7) */ {
685 dynamicWindow=(int8_t)(b-SD0);
686 state=defineOne;
687 }
688 } else if(/* SQ0<=b && */ b<=SQ7) {
689 quoteWindow=(int8_t)(b-SQ0);
690 state=quoteOne;
691 } else if(b==SDX) {
692 state=definePairOne;
693 } else if(b==SQU) {
694 state=quotePairOne;
695 } else if(b==SCU) {
696 isSingleByteMode=FALSE;
697 goto fastUnicode;
698 } else /* Srs */ {
699 /* callback(illegal) */
700 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
701 cnv->toUBytes[0]=b;
702 cnv->toULength=1;
703 goto endloop;
704 }
705
706 /* store the first byte of a multibyte sequence in toUBytes[] */
707 cnv->toUBytes[0]=b;
708 cnv->toULength=1;
709 break;
710 case quotePairOne:
711 byteOne=b;
712 cnv->toUBytes[1]=b;
713 cnv->toULength=2;
714 state=quotePairTwo;
715 break;
716 case quotePairTwo:
717 *target++=(UChar)((byteOne<<8)|b);
718 state=readCommand;
719 goto fastSingle;
720 case quoteOne:
721 if(b<0x80) {
722 /* all static offsets are in the BMP */
723 *target++=(UChar)(staticOffsets[quoteWindow]+b);
724 } else {
725 /* write from dynamic window */
726 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
727 if(c<=0xffff) {
728 *target++=(UChar)c;
729 } else {
730 /* output surrogate pair */
731 *target++=(UChar)(0xd7c0+(c>>10));
732 if(target<targetLimit) {
733 *target++=(UChar)(0xdc00|(c&0x3ff));
734 } else {
735 /* target overflow */
736 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
737 cnv->UCharErrorBufferLength=1;
738 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
739 goto endloop;
740 }
741 }
742 }
743 state=readCommand;
744 goto fastSingle;
745 case definePairOne:
746 dynamicWindow=(int8_t)((b>>5)&7);
747 byteOne=(uint8_t)(b&0x1f);
748 cnv->toUBytes[1]=b;
749 cnv->toULength=2;
750 state=definePairTwo;
751 break;
752 case definePairTwo:
753 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
754 state=readCommand;
755 goto fastSingle;
756 case defineOne:
757 if(b==0) {
758 /* callback(illegal): Reserved window offset value 0 */
759 cnv->toUBytes[1]=b;
760 cnv->toULength=2;
761 goto endloop;
762 } else if(b<gapThreshold) {
763 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
764 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
765 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
766 } else if(b>=fixedThreshold) {
767 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
768 } else {
769 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
770 cnv->toUBytes[1]=b;
771 cnv->toULength=2;
772 goto endloop;
773 }
774 state=readCommand;
775 goto fastSingle;
776 }
777 }
778 } else {
779 /* fast path for Unicode mode */
780 if(state==readCommand) {
781 fastUnicode:
782 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
783 *target++=(UChar)((b<<8)|source[1]);
784 source+=2;
785 }
786 }
787
788 /* normal state machine for Unicode mode */
789 /* unicodeByteMode: */
790 while(source<sourceLimit) {
791 if(target>=targetLimit) {
792 /* target is full */
793 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
794 break;
795 }
796 b=*source++;
797 switch(state) {
798 case readCommand:
799 if((uint8_t)(b-UC0)>(Urs-UC0)) {
800 byteOne=b;
801 cnv->toUBytes[0]=b;
802 cnv->toULength=1;
803 state=quotePairTwo;
804 } else if(/* UC0<=b && */ b<=UC7) {
805 dynamicWindow=(int8_t)(b-UC0);
806 isSingleByteMode=TRUE;
807 goto fastSingle;
808 } else if(/* UD0<=b && */ b<=UD7) {
809 dynamicWindow=(int8_t)(b-UD0);
810 isSingleByteMode=TRUE;
811 cnv->toUBytes[0]=b;
812 cnv->toULength=1;
813 state=defineOne;
814 goto singleByteMode;
815 } else if(b==UDX) {
816 isSingleByteMode=TRUE;
817 cnv->toUBytes[0]=b;
818 cnv->toULength=1;
819 state=definePairOne;
820 goto singleByteMode;
821 } else if(b==UQU) {
822 cnv->toUBytes[0]=b;
823 cnv->toULength=1;
824 state=quotePairOne;
825 } else /* Urs */ {
826 /* callback(illegal) */
827 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
828 cnv->toUBytes[0]=b;
829 cnv->toULength=1;
830 goto endloop;
831 }
832 break;
833 case quotePairOne:
834 byteOne=b;
835 cnv->toUBytes[1]=b;
836 cnv->toULength=2;
837 state=quotePairTwo;
838 break;
839 case quotePairTwo:
840 *target++=(UChar)((byteOne<<8)|b);
841 state=readCommand;
842 goto fastUnicode;
843 }
844 }
845 }
846 endloop:
847
848 /* set the converter state back into UConverter */
849 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
850 /* reset to deal with the next character */
851 state=readCommand;
852 } else if(state==readCommand) {
853 /* not in a multi-byte sequence, reset toULength */
854 cnv->toULength=0;
855 }
856 scsu->toUIsSingleByteMode=isSingleByteMode;
857 scsu->toUState=state;
858 scsu->toUQuoteWindow=quoteWindow;
859 scsu->toUDynamicWindow=dynamicWindow;
860 scsu->toUByteOne=byteOne;
861
862 /* write back the updated pointers */
863 pArgs->source=(const char *)source;
864 pArgs->target=target;
865 return;
866 }
867
868 /* SCSU-from-Unicode conversion functions ----------------------------------- */
869
870 /*
871 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
872 * reasonable results. The lookahead is minimal.
873 * Many cases are simple:
874 * A character fits directly into the current mode, a dynamic or static window,
875 * or is not compressible. These cases are tested first.
876 * Real compression heuristics are applied to the rest, in code branches for
877 * single/Unicode mode and BMP/supplementary code points.
878 * The heuristics used here are extremely simple.
879 */
880
/*
 * Find the first of the 8 windows that contains c.
 * A window covers the 128 code points offsets[i]..offsets[i]+0x7f.
 *
 * @param offsets start offsets of the 8 windows
 * @param c       code point to look up
 * @return the window number 0..7, or -1 if no window contains c
 */
static int8_t
getWindow(const uint32_t offsets[8], uint32_t c) {
    int8_t window=0;

    while(window<8) {
        const uint32_t start=offsets[window];
        /* same test as the unsigned-wraparound idiom (c-start)<=0x7f */
        if(c>=start && c-start<=0x7f) {
            return window;
        }
        ++window;
    }
    return -1;
}
892
893 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
894 static UBool
895 isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) {
896 return (UBool)(c<=offset+0x7f &&
897 (c>=offset || (c<=0x7f &&
898 (c>=0x20 || (1UL<<c)&0x2601))));
899 /* binary 0010 0110 0000 0001,
900 check for b==0xd || b==0xa || b==9 || b==0 */
901 }
902
903 /*
904 * getNextDynamicWindow returns the next dynamic window to be redefined
905 */
906 static int8_t
907 getNextDynamicWindow(SCSUData *scsu) {
908 int8_t window=scsu->windowUse[scsu->nextWindowUseIndex];
909 if(++scsu->nextWindowUseIndex==8) {
910 scsu->nextWindowUseIndex=0;
911 }
912 return window;
913 }
914
915 /*
916 * useDynamicWindow() adjusts
917 * windowUse[] and nextWindowUseIndex for the algorithm to choose
918 * the next dynamic window to be defined;
919 * a subclass may override it and provide its own algorithm.
920 */
921 static void
922 useDynamicWindow(SCSUData *scsu, int8_t window) {
923 /*
924 * move the existing window, which just became the most recently used one,
925 * up in windowUse[] to nextWindowUseIndex-1
926 */
927
928 /* first, find the index of the window - backwards to favor the more recently used windows */
929 int i, j;
930
931 i=scsu->nextWindowUseIndex;
932 do {
933 if(--i<0) {
934 i=7;
935 }
936 } while(scsu->windowUse[i]!=window);
937
938 /* now copy each windowUse[i+1] to [i] */
939 j=i+1;
940 if(j==8) {
941 j=0;
942 }
943 while(j!=scsu->nextWindowUseIndex) {
944 scsu->windowUse[i]=scsu->windowUse[j];
945 i=j;
946 if(++j==8) { j=0; }
947 }
948
949 /* finally, set the window into the most recently used index */
950 scsu->windowUse[i]=window;
951 }
952
953 /*
954 * calculate the offset and the code for a dynamic window that contains the character
955 * takes fixed offsets into account
956 * the offset of the window is stored in the offset variable,
957 * the code is returned
958 *
959 * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code
960 */
961 static int
962 getDynamicOffset(uint32_t c, uint32_t *pOffset) {
963 int i;
964
965 for(i=0; i<7; ++i) {
966 if((uint32_t)(c-fixedOffsets[i])<=0x7f) {
967 *pOffset=fixedOffsets[i];
968 return 0xf9+i;
969 }
970 }
971
972 if(c<0x80) {
973 /* No dynamic window for US-ASCII. */
974 return -1;
975 } else if(c<0x3400 ||
976 (uint32_t)(c-0x10000)<(0x14000-0x10000) ||
977 (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000)
978 ) {
979 /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
980 *pOffset=c&0x7fffff80;
981 return (int)(c>>7);
982 } else if(0xe000<=c && c!=0xfeff && c<0xfff0) {
983 /* For these characters we need to take the gapOffset into account. */
984 *pOffset=c&0x7fffff80;
985 return (int)((c-gapOffset)>>7);
986 } else {
987 return -1;
988 }
989 }
990
991 /*
992 * Idea for compression:
993 * - save SCSUData and other state before really starting work
994 * - at endloop, see if compression could be better with just unicode mode
995 * - don't do this if a callback has been called
996 * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
997 * - different buffer handling!
998 *
999 * Drawback or need for corrective handling:
1000 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
1001 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
1002 * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
1003 *
1004 * How to achieve both?
1005 * - Only replace the result after an SDX or SCU?
1006 */
1007
1008 static void
1009 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1010 UErrorCode *pErrorCode) {
1011 UConverter *cnv;
1012 SCSUData *scsu;
1013 const UChar *source, *sourceLimit;
1014 uint8_t *target;
1015 int32_t targetCapacity;
1016 int32_t *offsets;
1017
1018 UBool isSingleByteMode;
1019 uint8_t dynamicWindow;
1020 uint32_t currentOffset;
1021
1022 uint32_t c, delta;
1023
1024 int32_t sourceIndex, nextSourceIndex;
1025
1026 int32_t length;
1027
1028 /* variables for compression heuristics */
1029 uint32_t offset;
1030 UChar lead, trail;
1031 int code;
1032 int8_t window;
1033
1034 /* set up the local pointers */
1035 cnv=pArgs->converter;
1036 scsu=(SCSUData *)cnv->extraInfo;
1037
1038 /* set up the local pointers */
1039 source=pArgs->source;
1040 sourceLimit=pArgs->sourceLimit;
1041 target=(uint8_t *)pArgs->target;
1042 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1043 offsets=pArgs->offsets;
1044
1045 /* get the state machine state */
1046 isSingleByteMode=scsu->fromUIsSingleByteMode;
1047 dynamicWindow=scsu->fromUDynamicWindow;
1048 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1049
1050 c=cnv->fromUChar32;
1051
1052 /* sourceIndex=-1 if the current character began in the previous buffer */
1053 sourceIndex= c==0 ? 0 : -1;
1054 nextSourceIndex=0;
1055
1056 /* similar conversion "loop" as in toUnicode */
1057 loop:
1058 if(isSingleByteMode) {
1059 if(c!=0 && targetCapacity>0) {
1060 goto getTrailSingle;
1061 }
1062
1063 /* state machine for single-byte mode */
1064 /* singleByteMode: */
1065 while(source<sourceLimit) {
1066 if(targetCapacity<=0) {
1067 /* target is full */
1068 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1069 break;
1070 }
1071 c=*source++;
1072 ++nextSourceIndex;
1073
1074 if((c-0x20)<=0x5f) {
1075 /* pass US-ASCII graphic character through */
1076 *target++=(uint8_t)c;
1077 if(offsets!=NULL) {
1078 *offsets++=sourceIndex;
1079 }
1080 --targetCapacity;
1081 } else if(c<0x20) {
1082 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1083 /* CR/LF/TAB/NUL */
1084 *target++=(uint8_t)c;
1085 if(offsets!=NULL) {
1086 *offsets++=sourceIndex;
1087 }
1088 --targetCapacity;
1089 } else {
1090 /* quote C0 control character */
1091 c|=SQ0<<8;
1092 length=2;
1093 goto outputBytes;
1094 }
1095 } else if((delta=c-currentOffset)<=0x7f) {
1096 /* use the current dynamic window */
1097 *target++=(uint8_t)(delta|0x80);
1098 if(offsets!=NULL) {
1099 *offsets++=sourceIndex;
1100 }
1101 --targetCapacity;
1102 } else if(U16_IS_SURROGATE(c)) {
1103 if(U16_IS_SURROGATE_LEAD(c)) {
1104 getTrailSingle:
1105 lead=(UChar)c;
1106 if(source<sourceLimit) {
1107 /* test the following code unit */
1108 trail=*source;
1109 if(U16_IS_TRAIL(trail)) {
1110 ++source;
1111 ++nextSourceIndex;
1112 c=U16_GET_SUPPLEMENTARY(c, trail);
1113 /* convert this surrogate code point */
1114 /* exit this condition tree */
1115 } else {
1116 /* this is an unmatched lead code unit (1st surrogate) */
1117 /* callback(illegal) */
1118 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1119 goto endloop;
1120 }
1121 } else {
1122 /* no more input */
1123 break;
1124 }
1125 } else {
1126 /* this is an unmatched trail code unit (2nd surrogate) */
1127 /* callback(illegal) */
1128 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1129 goto endloop;
1130 }
1131
1132 /* compress supplementary character U+10000..U+10ffff */
1133 if((delta=c-currentOffset)<=0x7f) {
1134 /* use the current dynamic window */
1135 *target++=(uint8_t)(delta|0x80);
1136 if(offsets!=NULL) {
1137 *offsets++=sourceIndex;
1138 }
1139 --targetCapacity;
1140 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1141 /* there is a dynamic window that contains this character, change to it */
1142 dynamicWindow=window;
1143 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1144 useDynamicWindow(scsu, dynamicWindow);
1145 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1146 length=2;
1147 goto outputBytes;
1148 } else if((code=getDynamicOffset(c, &offset))>=0) {
1149 /* might check if there are more characters in this window to come */
1150 /* define an extended window with this character */
1151 code-=0x200;
1152 dynamicWindow=getNextDynamicWindow(scsu);
1153 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1154 useDynamicWindow(scsu, dynamicWindow);
1155 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1156 length=4;
1157 goto outputBytes;
1158 } else {
1159 /* change to Unicode mode and output this (lead, trail) pair */
1160 isSingleByteMode=FALSE;
1161 *target++=(uint8_t)SCU;
1162 if(offsets!=NULL) {
1163 *offsets++=sourceIndex;
1164 }
1165 --targetCapacity;
1166 c=((uint32_t)lead<<16)|trail;
1167 length=4;
1168 goto outputBytes;
1169 }
1170 } else if(c<0xa0) {
1171 /* quote C1 control character */
1172 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1173 length=2;
1174 goto outputBytes;
1175 } else if(c==0xfeff || c>=0xfff0) {
1176 /* quote signature character=byte order mark and specials */
1177 c|=SQU<<16;
1178 length=3;
1179 goto outputBytes;
1180 } else {
1181 /* compress all other BMP characters */
1182 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1183 /* there is a window defined that contains this character - switch to it or quote from it? */
1184 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1185 /* change to dynamic window */
1186 dynamicWindow=window;
1187 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1188 useDynamicWindow(scsu, dynamicWindow);
1189 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1190 length=2;
1191 goto outputBytes;
1192 } else {
1193 /* quote from dynamic window */
1194 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1195 length=2;
1196 goto outputBytes;
1197 }
1198 } else if((window=getWindow(staticOffsets, c))>=0) {
1199 /* quote from static window */
1200 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1201 length=2;
1202 goto outputBytes;
1203 } else if((code=getDynamicOffset(c, &offset))>=0) {
1204 /* define a dynamic window with this character */
1205 dynamicWindow=getNextDynamicWindow(scsu);
1206 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1207 useDynamicWindow(scsu, dynamicWindow);
1208 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1209 length=3;
1210 goto outputBytes;
1211 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
1212 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1213 ) {
1214 /*
1215 * this character is not compressible (a BMP ideograph or similar);
1216 * switch to Unicode mode if this is the last character in the block
1217 * or there is at least one more ideograph following immediately
1218 */
1219 isSingleByteMode=FALSE;
1220 c|=SCU<<16;
1221 length=3;
1222 goto outputBytes;
1223 } else {
1224 /* quote Unicode */
1225 c|=SQU<<16;
1226 length=3;
1227 goto outputBytes;
1228 }
1229 }
1230
1231 /* normal end of conversion: prepare for a new character */
1232 c=0;
1233 sourceIndex=nextSourceIndex;
1234 }
1235 } else {
1236 if(c!=0 && targetCapacity>0) {
1237 goto getTrailUnicode;
1238 }
1239
1240 /* state machine for Unicode mode */
1241 /* unicodeByteMode: */
1242 while(source<sourceLimit) {
1243 if(targetCapacity<=0) {
1244 /* target is full */
1245 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1246 break;
1247 }
1248 c=*source++;
1249 ++nextSourceIndex;
1250
1251 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
1252 /* not compressible, write character directly */
1253 if(targetCapacity>=2) {
1254 *target++=(uint8_t)(c>>8);
1255 *target++=(uint8_t)c;
1256 if(offsets!=NULL) {
1257 *offsets++=sourceIndex;
1258 *offsets++=sourceIndex;
1259 }
1260 targetCapacity-=2;
1261 } else {
1262 length=2;
1263 goto outputBytes;
1264 }
1265 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1266 /* compress BMP character if the following one is not an uncompressible ideograph */
1267 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1268 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
1269 /* ASCII digit or letter */
1270 isSingleByteMode=TRUE;
1271 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1272 length=2;
1273 goto outputBytes;
1274 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1275 /* there is a dynamic window that contains this character, change to it */
1276 isSingleByteMode=TRUE;
1277 dynamicWindow=window;
1278 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1279 useDynamicWindow(scsu, dynamicWindow);
1280 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1281 length=2;
1282 goto outputBytes;
1283 } else if((code=getDynamicOffset(c, &offset))>=0) {
1284 /* define a dynamic window with this character */
1285 isSingleByteMode=TRUE;
1286 dynamicWindow=getNextDynamicWindow(scsu);
1287 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1288 useDynamicWindow(scsu, dynamicWindow);
1289 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1290 length=3;
1291 goto outputBytes;
1292 }
1293 }
1294
1295 /* don't know how to compress this character, just write it directly */
1296 length=2;
1297 goto outputBytes;
1298 } else if(c<0xe000) {
1299 /* c is a surrogate */
1300 if(U16_IS_SURROGATE_LEAD(c)) {
1301 getTrailUnicode:
1302 lead=(UChar)c;
1303 if(source<sourceLimit) {
1304 /* test the following code unit */
1305 trail=*source;
1306 if(U16_IS_TRAIL(trail)) {
1307 ++source;
1308 ++nextSourceIndex;
1309 c=U16_GET_SUPPLEMENTARY(c, trail);
1310 /* convert this surrogate code point */
1311 /* exit this condition tree */
1312 } else {
1313 /* this is an unmatched lead code unit (1st surrogate) */
1314 /* callback(illegal) */
1315 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1316 goto endloop;
1317 }
1318 } else {
1319 /* no more input */
1320 break;
1321 }
1322 } else {
1323 /* this is an unmatched trail code unit (2nd surrogate) */
1324 /* callback(illegal) */
1325 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1326 goto endloop;
1327 }
1328
1329 /* compress supplementary character */
1330 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1331 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1332 ) {
1333 /*
1334 * there is a dynamic window that contains this character and
1335 * the following character is not uncompressible,
1336 * change to the window
1337 */
1338 isSingleByteMode=TRUE;
1339 dynamicWindow=window;
1340 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1341 useDynamicWindow(scsu, dynamicWindow);
1342 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1343 length=2;
1344 goto outputBytes;
1345 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1346 (code=getDynamicOffset(c, &offset))>=0
1347 ) {
1348 /* two supplementary characters in (probably) the same window - define an extended one */
1349 isSingleByteMode=TRUE;
1350 code-=0x200;
1351 dynamicWindow=getNextDynamicWindow(scsu);
1352 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1353 useDynamicWindow(scsu, dynamicWindow);
1354 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1355 length=4;
1356 goto outputBytes;
1357 } else {
1358 /* don't know how to compress this character, just write it directly */
1359 c=((uint32_t)lead<<16)|trail;
1360 length=4;
1361 goto outputBytes;
1362 }
1363 } else /* 0xe000<=c<0xf300 */ {
1364 /* quote to avoid SCSU tags */
1365 c|=UQU<<16;
1366 length=3;
1367 goto outputBytes;
1368 }
1369
1370 /* normal end of conversion: prepare for a new character */
1371 c=0;
1372 sourceIndex=nextSourceIndex;
1373 }
1374 }
1375 endloop:
1376
1377 /* set the converter state back into UConverter */
1378 scsu->fromUIsSingleByteMode=isSingleByteMode;
1379 scsu->fromUDynamicWindow=dynamicWindow;
1380
1381 cnv->fromUChar32=c;
1382
1383 /* write back the updated pointers */
1384 pArgs->source=source;
1385 pArgs->target=(char *)target;
1386 pArgs->offsets=offsets;
1387 return;
1388
1389 outputBytes:
1390 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1391 /* from the first if in the loop we know that targetCapacity>0 */
1392 if(length<=targetCapacity) {
1393 if(offsets==NULL) {
1394 switch(length) {
1395 /* each branch falls through to the next one */
1396 case 4:
1397 *target++=(uint8_t)(c>>24);
1398 case 3: /*fall through*/
1399 *target++=(uint8_t)(c>>16);
1400 case 2: /*fall through*/
1401 *target++=(uint8_t)(c>>8);
1402 case 1: /*fall through*/
1403 *target++=(uint8_t)c;
1404 default:
1405 /* will never occur */
1406 break;
1407 }
1408 } else {
1409 switch(length) {
1410 /* each branch falls through to the next one */
1411 case 4:
1412 *target++=(uint8_t)(c>>24);
1413 *offsets++=sourceIndex;
1414 case 3: /*fall through*/
1415 *target++=(uint8_t)(c>>16);
1416 *offsets++=sourceIndex;
1417 case 2: /*fall through*/
1418 *target++=(uint8_t)(c>>8);
1419 *offsets++=sourceIndex;
1420 case 1: /*fall through*/
1421 *target++=(uint8_t)c;
1422 *offsets++=sourceIndex;
1423 default:
1424 /* will never occur */
1425 break;
1426 }
1427 }
1428 targetCapacity-=length;
1429
1430 /* normal end of conversion: prepare for a new character */
1431 c=0;
1432 sourceIndex=nextSourceIndex;
1433 goto loop;
1434 } else {
1435 uint8_t *p;
1436
1437 /*
1438 * We actually do this backwards here:
1439 * In order to save an intermediate variable, we output
1440 * first to the overflow buffer what does not fit into the
1441 * regular target.
1442 */
1443 /* we know that 0<=targetCapacity<length<=4 */
1444 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1445 length-=targetCapacity;
1446 p=(uint8_t *)cnv->charErrorBuffer;
1447 switch(length) {
1448 /* each branch falls through to the next one */
1449 case 4:
1450 *p++=(uint8_t)(c>>24);
1451 case 3: /*fall through*/
1452 *p++=(uint8_t)(c>>16);
1453 case 2: /*fall through*/
1454 *p++=(uint8_t)(c>>8);
1455 case 1: /*fall through*/
1456 *p=(uint8_t)c;
1457 default:
1458 /* will never occur */
1459 break;
1460 }
1461 cnv->charErrorBufferLength=(int8_t)length;
1462
1463 /* now output what fits into the regular target */
1464 c>>=8*length; /* length was reduced by targetCapacity */
1465 switch(targetCapacity) {
1466 /* each branch falls through to the next one */
1467 case 3:
1468 *target++=(uint8_t)(c>>16);
1469 if(offsets!=NULL) {
1470 *offsets++=sourceIndex;
1471 }
1472 case 2: /*fall through*/
1473 *target++=(uint8_t)(c>>8);
1474 if(offsets!=NULL) {
1475 *offsets++=sourceIndex;
1476 }
1477 case 1: /*fall through*/
1478 *target++=(uint8_t)c;
1479 if(offsets!=NULL) {
1480 *offsets++=sourceIndex;
1481 }
1482 default:
1483 break;
1484 }
1485
1486 /* target overflow */
1487 targetCapacity=0;
1488 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1489 c=0;
1490 goto endloop;
1491 }
1492 }
1493
1494 /*
1495 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
1496 * If a change is made in the original function, then either
1497 * change this function the same way or
1498 * re-copy the original function and remove the variables
1499 * offsets, sourceIndex, and nextSourceIndex.
1500 */
1501 static void
1502 _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs,
1503 UErrorCode *pErrorCode) {
1504 UConverter *cnv;
1505 SCSUData *scsu;
1506 const UChar *source, *sourceLimit;
1507 uint8_t *target;
1508 int32_t targetCapacity;
1509
1510 UBool isSingleByteMode;
1511 uint8_t dynamicWindow;
1512 uint32_t currentOffset;
1513
1514 uint32_t c, delta;
1515
1516 int32_t length;
1517
1518 /* variables for compression heuristics */
1519 uint32_t offset;
1520 UChar lead, trail;
1521 int code;
1522 int8_t window;
1523
1524 /* set up the local pointers */
1525 cnv=pArgs->converter;
1526 scsu=(SCSUData *)cnv->extraInfo;
1527
1528 /* set up the local pointers */
1529 source=pArgs->source;
1530 sourceLimit=pArgs->sourceLimit;
1531 target=(uint8_t *)pArgs->target;
1532 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1533
1534 /* get the state machine state */
1535 isSingleByteMode=scsu->fromUIsSingleByteMode;
1536 dynamicWindow=scsu->fromUDynamicWindow;
1537 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1538
1539 c=cnv->fromUChar32;
1540
1541 /* similar conversion "loop" as in toUnicode */
1542 loop:
1543 if(isSingleByteMode) {
1544 if(c!=0 && targetCapacity>0) {
1545 goto getTrailSingle;
1546 }
1547
1548 /* state machine for single-byte mode */
1549 /* singleByteMode: */
1550 while(source<sourceLimit) {
1551 if(targetCapacity<=0) {
1552 /* target is full */
1553 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1554 break;
1555 }
1556 c=*source++;
1557
1558 if((c-0x20)<=0x5f) {
1559 /* pass US-ASCII graphic character through */
1560 *target++=(uint8_t)c;
1561 --targetCapacity;
1562 } else if(c<0x20) {
1563 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1564 /* CR/LF/TAB/NUL */
1565 *target++=(uint8_t)c;
1566 --targetCapacity;
1567 } else {
1568 /* quote C0 control character */
1569 c|=SQ0<<8;
1570 length=2;
1571 goto outputBytes;
1572 }
1573 } else if((delta=c-currentOffset)<=0x7f) {
1574 /* use the current dynamic window */
1575 *target++=(uint8_t)(delta|0x80);
1576 --targetCapacity;
1577 } else if(U16_IS_SURROGATE(c)) {
1578 if(U16_IS_SURROGATE_LEAD(c)) {
1579 getTrailSingle:
1580 lead=(UChar)c;
1581 if(source<sourceLimit) {
1582 /* test the following code unit */
1583 trail=*source;
1584 if(U16_IS_TRAIL(trail)) {
1585 ++source;
1586 c=U16_GET_SUPPLEMENTARY(c, trail);
1587 /* convert this surrogate code point */
1588 /* exit this condition tree */
1589 } else {
1590 /* this is an unmatched lead code unit (1st surrogate) */
1591 /* callback(illegal) */
1592 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1593 goto endloop;
1594 }
1595 } else {
1596 /* no more input */
1597 break;
1598 }
1599 } else {
1600 /* this is an unmatched trail code unit (2nd surrogate) */
1601 /* callback(illegal) */
1602 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1603 goto endloop;
1604 }
1605
1606 /* compress supplementary character U+10000..U+10ffff */
1607 if((delta=c-currentOffset)<=0x7f) {
1608 /* use the current dynamic window */
1609 *target++=(uint8_t)(delta|0x80);
1610 --targetCapacity;
1611 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1612 /* there is a dynamic window that contains this character, change to it */
1613 dynamicWindow=window;
1614 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1615 useDynamicWindow(scsu, dynamicWindow);
1616 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1617 length=2;
1618 goto outputBytes;
1619 } else if((code=getDynamicOffset(c, &offset))>=0) {
1620 /* might check if there are more characters in this window to come */
1621 /* define an extended window with this character */
1622 code-=0x200;
1623 dynamicWindow=getNextDynamicWindow(scsu);
1624 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1625 useDynamicWindow(scsu, dynamicWindow);
1626 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1627 length=4;
1628 goto outputBytes;
1629 } else {
1630 /* change to Unicode mode and output this (lead, trail) pair */
1631 isSingleByteMode=FALSE;
1632 *target++=(uint8_t)SCU;
1633 --targetCapacity;
1634 c=((uint32_t)lead<<16)|trail;
1635 length=4;
1636 goto outputBytes;
1637 }
1638 } else if(c<0xa0) {
1639 /* quote C1 control character */
1640 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1641 length=2;
1642 goto outputBytes;
1643 } else if(c==0xfeff || c>=0xfff0) {
1644 /* quote signature character=byte order mark and specials */
1645 c|=SQU<<16;
1646 length=3;
1647 goto outputBytes;
1648 } else {
1649 /* compress all other BMP characters */
1650 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1651 /* there is a window defined that contains this character - switch to it or quote from it? */
1652 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1653 /* change to dynamic window */
1654 dynamicWindow=window;
1655 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1656 useDynamicWindow(scsu, dynamicWindow);
1657 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1658 length=2;
1659 goto outputBytes;
1660 } else {
1661 /* quote from dynamic window */
1662 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1663 length=2;
1664 goto outputBytes;
1665 }
1666 } else if((window=getWindow(staticOffsets, c))>=0) {
1667 /* quote from static window */
1668 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1669 length=2;
1670 goto outputBytes;
1671 } else if((code=getDynamicOffset(c, &offset))>=0) {
1672 /* define a dynamic window with this character */
1673 dynamicWindow=getNextDynamicWindow(scsu);
1674 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1675 useDynamicWindow(scsu, dynamicWindow);
1676 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1677 length=3;
1678 goto outputBytes;
1679 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
1680 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1681 ) {
1682 /*
1683 * this character is not compressible (a BMP ideograph or similar);
1684 * switch to Unicode mode if this is the last character in the block
1685 * or there is at least one more ideograph following immediately
1686 */
1687 isSingleByteMode=FALSE;
1688 c|=SCU<<16;
1689 length=3;
1690 goto outputBytes;
1691 } else {
1692 /* quote Unicode */
1693 c|=SQU<<16;
1694 length=3;
1695 goto outputBytes;
1696 }
1697 }
1698
1699 /* normal end of conversion: prepare for a new character */
1700 c=0;
1701 }
1702 } else {
1703 if(c!=0 && targetCapacity>0) {
1704 goto getTrailUnicode;
1705 }
1706
1707 /* state machine for Unicode mode */
1708 /* unicodeByteMode: */
1709 while(source<sourceLimit) {
1710 if(targetCapacity<=0) {
1711 /* target is full */
1712 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1713 break;
1714 }
1715 c=*source++;
1716
1717 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
1718 /* not compressible, write character directly */
1719 if(targetCapacity>=2) {
1720 *target++=(uint8_t)(c>>8);
1721 *target++=(uint8_t)c;
1722 targetCapacity-=2;
1723 } else {
1724 length=2;
1725 goto outputBytes;
1726 }
1727 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1728 /* compress BMP character if the following one is not an uncompressible ideograph */
1729 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1730 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
1731 /* ASCII digit or letter */
1732 isSingleByteMode=TRUE;
1733 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1734 length=2;
1735 goto outputBytes;
1736 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1737 /* there is a dynamic window that contains this character, change to it */
1738 isSingleByteMode=TRUE;
1739 dynamicWindow=window;
1740 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1741 useDynamicWindow(scsu, dynamicWindow);
1742 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1743 length=2;
1744 goto outputBytes;
1745 } else if((code=getDynamicOffset(c, &offset))>=0) {
1746 /* define a dynamic window with this character */
1747 isSingleByteMode=TRUE;
1748 dynamicWindow=getNextDynamicWindow(scsu);
1749 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1750 useDynamicWindow(scsu, dynamicWindow);
1751 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1752 length=3;
1753 goto outputBytes;
1754 }
1755 }
1756
1757 /* don't know how to compress this character, just write it directly */
1758 length=2;
1759 goto outputBytes;
1760 } else if(c<0xe000) {
1761 /* c is a surrogate */
1762 if(U16_IS_SURROGATE_LEAD(c)) {
1763 getTrailUnicode:
1764 lead=(UChar)c;
1765 if(source<sourceLimit) {
1766 /* test the following code unit */
1767 trail=*source;
1768 if(U16_IS_TRAIL(trail)) {
1769 ++source;
1770 c=U16_GET_SUPPLEMENTARY(c, trail);
1771 /* convert this surrogate code point */
1772 /* exit this condition tree */
1773 } else {
1774 /* this is an unmatched lead code unit (1st surrogate) */
1775 /* callback(illegal) */
1776 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1777 goto endloop;
1778 }
1779 } else {
1780 /* no more input */
1781 break;
1782 }
1783 } else {
1784 /* this is an unmatched trail code unit (2nd surrogate) */
1785 /* callback(illegal) */
1786 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1787 goto endloop;
1788 }
1789
1790 /* compress supplementary character */
1791 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1792 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1793 ) {
1794 /*
1795 * there is a dynamic window that contains this character and
1796 * the following character is not uncompressible,
1797 * change to the window
1798 */
1799 isSingleByteMode=TRUE;
1800 dynamicWindow=window;
1801 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1802 useDynamicWindow(scsu, dynamicWindow);
1803 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1804 length=2;
1805 goto outputBytes;
1806 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1807 (code=getDynamicOffset(c, &offset))>=0
1808 ) {
1809 /* two supplementary characters in (probably) the same window - define an extended one */
1810 isSingleByteMode=TRUE;
1811 code-=0x200;
1812 dynamicWindow=getNextDynamicWindow(scsu);
1813 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1814 useDynamicWindow(scsu, dynamicWindow);
1815 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1816 length=4;
1817 goto outputBytes;
1818 } else {
1819 /* don't know how to compress this character, just write it directly */
1820 c=((uint32_t)lead<<16)|trail;
1821 length=4;
1822 goto outputBytes;
1823 }
1824 } else /* 0xe000<=c<0xf300 */ {
1825 /* quote to avoid SCSU tags */
1826 c|=UQU<<16;
1827 length=3;
1828 goto outputBytes;
1829 }
1830
1831 /* normal end of conversion: prepare for a new character */
1832 c=0;
1833 }
1834 }
1835 endloop:
1836
1837 /* set the converter state back into UConverter */
1838 scsu->fromUIsSingleByteMode=isSingleByteMode;
1839 scsu->fromUDynamicWindow=dynamicWindow;
1840
1841 cnv->fromUChar32=c;
1842
1843 /* write back the updated pointers */
1844 pArgs->source=source;
1845 pArgs->target=(char *)target;
1846 return;
1847
1848 outputBytes:
1849 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1850 /* from the first if in the loop we know that targetCapacity>0 */
1851 if(length<=targetCapacity) {
1852 switch(length) {
1853 /* each branch falls through to the next one */
1854 case 4:
1855 *target++=(uint8_t)(c>>24);
1856 case 3: /*fall through*/
1857 *target++=(uint8_t)(c>>16);
1858 case 2: /*fall through*/
1859 *target++=(uint8_t)(c>>8);
1860 case 1: /*fall through*/
1861 *target++=(uint8_t)c;
1862 default:
1863 /* will never occur */
1864 break;
1865 }
1866 targetCapacity-=length;
1867
1868 /* normal end of conversion: prepare for a new character */
1869 c=0;
1870 goto loop;
1871 } else {
1872 uint8_t *p;
1873
1874 /*
1875 * We actually do this backwards here:
1876 * In order to save an intermediate variable, we output
1877 * first to the overflow buffer what does not fit into the
1878 * regular target.
1879 */
1880 /* we know that 0<=targetCapacity<length<=4 */
1881 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1882 length-=targetCapacity;
1883 p=(uint8_t *)cnv->charErrorBuffer;
1884 switch(length) {
1885 /* each branch falls through to the next one */
1886 case 4:
1887 *p++=(uint8_t)(c>>24);
1888 case 3: /*fall through*/
1889 *p++=(uint8_t)(c>>16);
1890 case 2: /*fall through*/
1891 *p++=(uint8_t)(c>>8);
1892 case 1: /*fall through*/
1893 *p=(uint8_t)c;
1894 default:
1895 /* will never occur */
1896 break;
1897 }
1898 cnv->charErrorBufferLength=(int8_t)length;
1899
1900 /* now output what fits into the regular target */
1901 c>>=8*length; /* length was reduced by targetCapacity */
1902 switch(targetCapacity) {
1903 /* each branch falls through to the next one */
1904 case 3:
1905 *target++=(uint8_t)(c>>16);
1906 case 2: /*fall through*/
1907 *target++=(uint8_t)(c>>8);
1908 case 1: /*fall through*/
1909 *target++=(uint8_t)c;
1910 default:
1911 break;
1912 }
1913
1914 /* target overflow */
1915 targetCapacity=0;
1916 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1917 c=0;
1918 goto endloop;
1919 }
1920 }
1921
1922 /* miscellaneous ------------------------------------------------------------ */
1923
1924 static const char *
1925 _SCSUGetName(const UConverter *cnv) {
1926 SCSUData *scsu=(SCSUData *)cnv->extraInfo;
1927
1928 switch(scsu->locale) {
1929 case l_ja:
1930 return "SCSU,locale=ja";
1931 default:
1932 return "SCSU";
1933 }
1934 }
1935
1936 /* structure for SafeClone calculations */
1937 struct cloneSCSUStruct
1938 {
1939 UConverter cnv;
1940 SCSUData mydata;
1941 };
1942
1943 static UConverter *
1944 _SCSUSafeClone(const UConverter *cnv,
1945 void *stackBuffer,
1946 int32_t *pBufferSize,
1947 UErrorCode *status)
1948 {
1949 struct cloneSCSUStruct * localClone;
1950 int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct);
1951
1952 if (U_FAILURE(*status)){
1953 return 0;
1954 }
1955
1956 if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
1957 *pBufferSize = bufferSizeNeeded;
1958 return 0;
1959 }
1960
1961 localClone = (struct cloneSCSUStruct *)stackBuffer;
1962 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
1963
1964 uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData));
1965 localClone->cnv.extraInfo = &localClone->mydata;
1966 localClone->cnv.isExtraLocal = TRUE;
1967
1968 return &localClone->cnv;
1969 }
1970
1971
1972 static const UConverterImpl _SCSUImpl={
1973 UCNV_SCSU,
1974
1975 NULL,
1976 NULL,
1977
1978 _SCSUOpen,
1979 _SCSUClose,
1980 _SCSUReset,
1981
1982 _SCSUToUnicode,
1983 _SCSUToUnicodeWithOffsets,
1984 _SCSUFromUnicode,
1985 _SCSUFromUnicodeWithOffsets,
1986 NULL,
1987
1988 NULL,
1989 _SCSUGetName,
1990 NULL,
1991 _SCSUSafeClone,
1992 ucnv_getCompleteUnicodeSet
1993 };
1994
1995 static const UConverterStaticData _SCSUStaticData={
1996 sizeof(UConverterStaticData),
1997 "SCSU",
1998 1212, /* CCSID for SCSU */
1999 UCNV_IBM, UCNV_SCSU,
2000 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
2001 /*
2002 * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
2003 * substitution string.
2004 */
2005 { 0x0e, 0xff, 0xfd, 0 }, 3,
2006 FALSE, FALSE,
2007 0,
2008 0,
2009 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
2010 };
2011
2012 const UConverterSharedData _SCSUData={
2013 sizeof(UConverterSharedData), ~((uint32_t)0),
2014 NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl,
2015 0
2016 };
2017
2018 #endif