]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnvscsu.cpp
ICU-59117.0.1.tar.gz
[apple/icu.git] / icuSources / common / ucnvscsu.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f
A
3/*
4******************************************************************************
5*
2ca993e8 6* Copyright (C) 2000-2016, International Business Machines
b75a7d8f
A
7* Corporation and others. All Rights Reserved.
8*
9******************************************************************************
10* file name: ucnvscsu.c
f3c0d7a5 11* encoding: UTF-8
b75a7d8f
A
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2000nov18
16* created by: Markus W. Scherer
17*
18* This is an implementation of the Standard Compression Scheme for Unicode
19* as defined in http://www.unicode.org/unicode/reports/tr6/ .
20* Reserved commands and window settings are treated as illegal sequences and
21* will result in callback calls.
22*/
23
24#include "unicode/utypes.h"
374ca955 25
b331163b 26#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
374ca955 27
b75a7d8f
A
28#include "unicode/ucnv.h"
29#include "unicode/ucnv_cb.h"
4388f060 30#include "unicode/utf16.h"
b75a7d8f
A
31#include "ucnv_bld.h"
32#include "ucnv_cnv.h"
33#include "cmemory.h"
34
35/* SCSU definitions --------------------------------------------------------- */
36
/*
 * SCSU command byte values.
 * The S* values are tested by the decoder while it is in single-byte mode,
 * the U* values while it is in Unicode mode.
 * Only the first and last value of each contiguous command range is named.
 */
enum {
    /* single-byte mode commands */
    SQ0=0x01, /* Quote from window pair 0 */
    SQ7=0x08, /* Quote from window pair 7 */
    SDX=0x0B, /* Define a window as extended */
    Srs=0x0C, /* reserved */
    SQU=0x0E, /* Quote a single Unicode character */
    SCU=0x0F, /* Change to Unicode mode */
    SC0=0x10, /* Select window 0 */
    SC7=0x17, /* Select window 7 */
    SD0=0x18, /* Define and select window 0 */
    SD7=0x1F, /* Define and select window 7 */

    /* Unicode mode commands */
    UC0=0xE0, /* Select window 0 */
    UC7=0xE7, /* Select window 7 */
    UD0=0xE8, /* Define and select window 0 */
    UD7=0xEF, /* Define and select window 7 */
    UQU=0xF0, /* Quote a single Unicode character */
    UDX=0xF1, /* Define a Window as extended */
    Urs=0xF2  /* reserved */
};
58
/* thresholds for interpreting the one-byte window offset argument of SDn/UDn */
enum {
    /*
     * Unicode code points from 3400 to E000 are not addressable by
     * dynamic window, since in these areas no short run alphabets are
     * found. Therefore add gapOffset to all values from gapThreshold.
     */
    gapThreshold=0x68,
    gapOffset=0xAC00,

    /* values between reservedStart and fixedThreshold are reserved */
    reservedStart=0xA8,

    /* use table of predefined fixed offsets for values from fixedThreshold */
    fixedThreshold=0xF9
};
74
/* constant offsets for the 8 static windows */
static const uint32_t staticOffsets[8]={
    0x0000, /* ASCII for quoted tags */
    0x0080, /* Latin - 1 Supplement (for access to punctuation) */
    0x0100, /* Latin Extended-A */
    0x0300, /* Combining Diacritical Marks */
    0x2000, /* General Punctuation */
    0x2080, /* Currency Symbols */
    0x2100, /* Letterlike Symbols and Number Forms */
    0x3000  /* CJK Symbols and punctuation */
};

/* initial offsets for the 8 dynamic (sliding) windows */
static const uint32_t initialDynamicOffsets[8]={
    0x0080, /* Latin-1 */
    0x00C0, /* Latin Extended A */
    0x0400, /* Cyrillic */
    0x0600, /* Arabic */
    0x0900, /* Devanagari */
    0x3040, /* Hiragana */
    0x30A0, /* Katakana */
    0xFF00  /* Fullwidth ASCII */
};

/*
 * Table of fixed predefined Offsets,
 * indexed by (window offset byte - fixedThreshold), i.e. bytes 0xF9..0xFF.
 */
static const uint32_t fixedOffsets[]={
    /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */
    /* 0xFA */ 0x0250, /* IPA extensions */
    /* 0xFB */ 0x0370, /* Greek */
    /* 0xFC */ 0x0530, /* Armenian */
    /* 0xFD */ 0x3040, /* Hiragana */
    /* 0xFE */ 0x30A0, /* Katakana */
    /* 0xFF */ 0xFF60  /* Halfwidth Katakana */
};
109
/* state values of the toUnicode state machine */
enum {
    readCommand,    /* not inside a multi-byte sequence: next byte is a command or a character */
    quotePairOne,   /* after SQU/UQU: expecting the first byte of the quoted UTF-16 unit */
    quotePairTwo,   /* expecting the second byte of a two-byte UTF-16 unit */
    quoteOne,       /* after SQn: expecting the single quoted byte */
    definePairOne,  /* after SDX/UDX: expecting the first argument byte */
    definePairTwo,  /* after SDX/UDX: expecting the second argument byte */
    defineOne       /* after SDn/UDn: expecting the window offset byte */
};
120
121typedef struct SCSUData {
122 /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */
123 uint32_t toUDynamicOffsets[8];
124 uint32_t fromUDynamicOffsets[8];
125
126 /* state machine state - toUnicode */
127 UBool toUIsSingleByteMode;
128 uint8_t toUState;
129 int8_t toUQuoteWindow, toUDynamicWindow;
130 uint8_t toUByteOne;
131 uint8_t toUPadding[3];
132
133 /* state machine state - fromUnicode */
134 UBool fromUIsSingleByteMode;
135 int8_t fromUDynamicWindow;
136
137 /*
138 * windowUse[] keeps track of the use of the dynamic windows:
139 * At nextWindowUseIndex there is the least recently used window,
140 * and the following windows (in a wrapping manner) are more and more
141 * recently used.
142 * At nextWindowUseIndex-1 there is the most recently used window.
143 */
144 uint8_t locale;
145 int8_t nextWindowUseIndex;
146 int8_t windowUse[8];
147} SCSUData;
148
/*
 * Initial contents of SCSUData.windowUse[], from least to most recently used:
 * the generic order, and the order used when the converter locale is Japanese.
 */
static const int8_t initialWindowUse[8]={ 7, 0, 3, 2, 4, 5, 6, 1 };
static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 };

/* values for SCSUData.locale */
enum {
    lGeneric, l_ja
};
155
156/* SCSU setup functions ----------------------------------------------------- */
f3c0d7a5
A
157U_CDECL_BEGIN
158static void U_CALLCONV
b75a7d8f
A
159_SCSUReset(UConverter *cnv, UConverterResetChoice choice) {
160 SCSUData *scsu=(SCSUData *)cnv->extraInfo;
161
162 if(choice<=UCNV_RESET_TO_UNICODE) {
163 /* reset toUnicode */
164 uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32);
165
166 scsu->toUIsSingleByteMode=TRUE;
167 scsu->toUState=readCommand;
168 scsu->toUQuoteWindow=scsu->toUDynamicWindow=0;
169 scsu->toUByteOne=0;
170
171 cnv->toULength=0;
172 }
173 if(choice!=UCNV_RESET_TO_UNICODE) {
174 /* reset fromUnicode */
175 uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32);
176
177 scsu->fromUIsSingleByteMode=TRUE;
178 scsu->fromUDynamicWindow=0;
179
180 scsu->nextWindowUseIndex=0;
181 switch(scsu->locale) {
182 case l_ja:
183 uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8);
184 break;
185 default:
186 uprv_memcpy(scsu->windowUse, initialWindowUse, 8);
187 break;
188 }
189
374ca955 190 cnv->fromUChar32=0;
b75a7d8f
A
191 }
192}
193
f3c0d7a5 194static void U_CALLCONV
b75a7d8f 195_SCSUOpen(UConverter *cnv,
729e4ab9 196 UConverterLoadArgs *pArgs,
b75a7d8f 197 UErrorCode *pErrorCode) {
729e4ab9
A
198 const char *locale=pArgs->locale;
199 if(pArgs->onlyTestIsLoadable) {
200 return;
201 }
b75a7d8f
A
202 cnv->extraInfo=uprv_malloc(sizeof(SCSUData));
203 if(cnv->extraInfo!=NULL) {
204 if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) {
205 ((SCSUData *)cnv->extraInfo)->locale=l_ja;
206 } else {
207 ((SCSUData *)cnv->extraInfo)->locale=lGeneric;
208 }
209 _SCSUReset(cnv, UCNV_RESET_BOTH);
210 } else {
211 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
212 }
73c04bcf
A
213
214 /* Set the substitution character U+fffd as a Unicode string. */
215 cnv->subUChars[0]=0xfffd;
216 cnv->subCharLen=-1;
b75a7d8f
A
217}
218
f3c0d7a5 219static void U_CALLCONV
b75a7d8f
A
220_SCSUClose(UConverter *cnv) {
221 if(cnv->extraInfo!=NULL) {
222 if(!cnv->isExtraLocal) {
223 uprv_free(cnv->extraInfo);
224 }
225 cnv->extraInfo=NULL;
226 }
227}
228
229/* SCSU-to-Unicode conversion functions ------------------------------------- */
230
f3c0d7a5 231static void U_CALLCONV
b75a7d8f
A
232_SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
233 UErrorCode *pErrorCode) {
234 UConverter *cnv;
235 SCSUData *scsu;
236 const uint8_t *source, *sourceLimit;
237 UChar *target;
238 const UChar *targetLimit;
239 int32_t *offsets;
240 UBool isSingleByteMode;
241 uint8_t state, byteOne;
242 int8_t quoteWindow, dynamicWindow;
243
244 int32_t sourceIndex, nextSourceIndex;
245
246 uint8_t b;
247
248 /* set up the local pointers */
249 cnv=pArgs->converter;
250 scsu=(SCSUData *)cnv->extraInfo;
251
252 source=(const uint8_t *)pArgs->source;
253 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
254 target=pArgs->target;
255 targetLimit=pArgs->targetLimit;
256 offsets=pArgs->offsets;
257
258 /* get the state machine state */
259 isSingleByteMode=scsu->toUIsSingleByteMode;
260 state=scsu->toUState;
261 quoteWindow=scsu->toUQuoteWindow;
262 dynamicWindow=scsu->toUDynamicWindow;
263 byteOne=scsu->toUByteOne;
264
265 /* sourceIndex=-1 if the current character began in the previous buffer */
266 sourceIndex=state==readCommand ? 0 : -1;
267 nextSourceIndex=0;
268
269 /*
270 * conversion "loop"
271 *
272 * For performance, this is not a normal C loop.
273 * Instead, there are two code blocks for the two SCSU modes.
274 * The function branches to either one, and a change of the mode is done with a goto to
275 * the other branch.
276 *
277 * Each branch has two conventional loops:
278 * - a fast-path loop for the most common codes in the mode
279 * - a loop for all other codes in the mode
280 * When the fast-path runs into a code that it cannot handle, its loop ends and it
281 * runs into the following loop to handle the other codes.
282 * The end of the input or output buffer is also handled by the slower loop.
283 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
284 *
374ca955
A
285 * The callback handling is done by returning with an error code.
286 * The conversion framework actually calls the callback function.
b75a7d8f 287 */
b75a7d8f
A
288 if(isSingleByteMode) {
289 /* fast path for single-byte mode */
290 if(state==readCommand) {
291fastSingle:
292 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
293 ++source;
294 ++nextSourceIndex;
295 if(b<=0x7f) {
296 /* write US-ASCII graphic character or DEL */
297 *target++=(UChar)b;
298 if(offsets!=NULL) {
299 *offsets++=sourceIndex;
300 }
301 } else {
302 /* write from dynamic window */
303 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
304 if(c<=0xffff) {
305 *target++=(UChar)c;
306 if(offsets!=NULL) {
307 *offsets++=sourceIndex;
308 }
309 } else {
310 /* output surrogate pair */
311 *target++=(UChar)(0xd7c0+(c>>10));
312 if(target<targetLimit) {
313 *target++=(UChar)(0xdc00|(c&0x3ff));
314 if(offsets!=NULL) {
315 *offsets++=sourceIndex;
316 *offsets++=sourceIndex;
317 }
318 } else {
319 /* target overflow */
320 if(offsets!=NULL) {
321 *offsets++=sourceIndex;
322 }
323 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
324 cnv->UCharErrorBufferLength=1;
325 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
326 goto endloop;
327 }
328 }
329 }
330 sourceIndex=nextSourceIndex;
331 }
332 }
333
334 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
335singleByteMode:
336 while(source<sourceLimit) {
337 if(target>=targetLimit) {
338 /* target is full */
339 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
340 break;
341 }
342 b=*source++;
343 ++nextSourceIndex;
344 switch(state) {
345 case readCommand:
346 /* redundant conditions are commented out */
347 /* here: b<0x20 because otherwise we would be in fastSingle */
348 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
349 /* CR/LF/TAB/NUL */
350 *target++=(UChar)b;
351 if(offsets!=NULL) {
352 *offsets++=sourceIndex;
353 }
354 sourceIndex=nextSourceIndex;
355 goto fastSingle;
356 } else if(SC0<=b) {
357 if(b<=SC7) {
358 dynamicWindow=(int8_t)(b-SC0);
359 sourceIndex=nextSourceIndex;
360 goto fastSingle;
361 } else /* if(SD0<=b && b<=SD7) */ {
362 dynamicWindow=(int8_t)(b-SD0);
363 state=defineOne;
364 }
365 } else if(/* SQ0<=b && */ b<=SQ7) {
366 quoteWindow=(int8_t)(b-SQ0);
367 state=quoteOne;
368 } else if(b==SDX) {
369 state=definePairOne;
370 } else if(b==SQU) {
371 state=quotePairOne;
372 } else if(b==SCU) {
373 sourceIndex=nextSourceIndex;
374 isSingleByteMode=FALSE;
375 goto fastUnicode;
376 } else /* Srs */ {
377 /* callback(illegal) */
374ca955
A
378 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
379 cnv->toUBytes[0]=b;
380 cnv->toULength=1;
381 goto endloop;
b75a7d8f 382 }
374ca955
A
383
384 /* store the first byte of a multibyte sequence in toUBytes[] */
385 cnv->toUBytes[0]=b;
386 cnv->toULength=1;
b75a7d8f
A
387 break;
388 case quotePairOne:
389 byteOne=b;
374ca955
A
390 cnv->toUBytes[1]=b;
391 cnv->toULength=2;
b75a7d8f
A
392 state=quotePairTwo;
393 break;
394 case quotePairTwo:
395 *target++=(UChar)((byteOne<<8)|b);
396 if(offsets!=NULL) {
397 *offsets++=sourceIndex;
398 }
399 sourceIndex=nextSourceIndex;
400 state=readCommand;
401 goto fastSingle;
402 case quoteOne:
403 if(b<0x80) {
404 /* all static offsets are in the BMP */
405 *target++=(UChar)(staticOffsets[quoteWindow]+b);
406 if(offsets!=NULL) {
407 *offsets++=sourceIndex;
408 }
409 } else {
410 /* write from dynamic window */
411 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
412 if(c<=0xffff) {
413 *target++=(UChar)c;
414 if(offsets!=NULL) {
415 *offsets++=sourceIndex;
416 }
417 } else {
418 /* output surrogate pair */
419 *target++=(UChar)(0xd7c0+(c>>10));
420 if(target<targetLimit) {
421 *target++=(UChar)(0xdc00|(c&0x3ff));
422 if(offsets!=NULL) {
423 *offsets++=sourceIndex;
424 *offsets++=sourceIndex;
425 }
426 } else {
427 /* target overflow */
428 if(offsets!=NULL) {
429 *offsets++=sourceIndex;
430 }
431 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
432 cnv->UCharErrorBufferLength=1;
433 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
434 goto endloop;
435 }
436 }
437 }
438 sourceIndex=nextSourceIndex;
439 state=readCommand;
440 goto fastSingle;
441 case definePairOne:
442 dynamicWindow=(int8_t)((b>>5)&7);
443 byteOne=(uint8_t)(b&0x1f);
374ca955
A
444 cnv->toUBytes[1]=b;
445 cnv->toULength=2;
b75a7d8f
A
446 state=definePairTwo;
447 break;
448 case definePairTwo:
449 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
450 sourceIndex=nextSourceIndex;
451 state=readCommand;
452 goto fastSingle;
453 case defineOne:
454 if(b==0) {
455 /* callback(illegal): Reserved window offset value 0 */
374ca955
A
456 cnv->toUBytes[1]=b;
457 cnv->toULength=2;
458 goto endloop;
b75a7d8f
A
459 } else if(b<gapThreshold) {
460 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
461 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
462 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
463 } else if(b>=fixedThreshold) {
464 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
465 } else {
466 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
374ca955
A
467 cnv->toUBytes[1]=b;
468 cnv->toULength=2;
469 goto endloop;
b75a7d8f
A
470 }
471 sourceIndex=nextSourceIndex;
472 state=readCommand;
473 goto fastSingle;
474 }
475 }
476 } else {
477 /* fast path for Unicode mode */
478 if(state==readCommand) {
479fastUnicode:
480 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
481 *target++=(UChar)((b<<8)|source[1]);
482 if(offsets!=NULL) {
483 *offsets++=sourceIndex;
484 }
485 sourceIndex=nextSourceIndex;
486 nextSourceIndex+=2;
487 source+=2;
488 }
489 }
490
491 /* normal state machine for Unicode mode */
492/* unicodeByteMode: */
493 while(source<sourceLimit) {
494 if(target>=targetLimit) {
495 /* target is full */
496 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
497 break;
498 }
499 b=*source++;
500 ++nextSourceIndex;
501 switch(state) {
502 case readCommand:
503 if((uint8_t)(b-UC0)>(Urs-UC0)) {
504 byteOne=b;
374ca955
A
505 cnv->toUBytes[0]=b;
506 cnv->toULength=1;
b75a7d8f
A
507 state=quotePairTwo;
508 } else if(/* UC0<=b && */ b<=UC7) {
509 dynamicWindow=(int8_t)(b-UC0);
510 sourceIndex=nextSourceIndex;
511 isSingleByteMode=TRUE;
512 goto fastSingle;
513 } else if(/* UD0<=b && */ b<=UD7) {
514 dynamicWindow=(int8_t)(b-UD0);
515 isSingleByteMode=TRUE;
374ca955
A
516 cnv->toUBytes[0]=b;
517 cnv->toULength=1;
b75a7d8f
A
518 state=defineOne;
519 goto singleByteMode;
520 } else if(b==UDX) {
521 isSingleByteMode=TRUE;
374ca955
A
522 cnv->toUBytes[0]=b;
523 cnv->toULength=1;
b75a7d8f
A
524 state=definePairOne;
525 goto singleByteMode;
526 } else if(b==UQU) {
374ca955
A
527 cnv->toUBytes[0]=b;
528 cnv->toULength=1;
b75a7d8f
A
529 state=quotePairOne;
530 } else /* Urs */ {
531 /* callback(illegal) */
374ca955
A
532 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
533 cnv->toUBytes[0]=b;
534 cnv->toULength=1;
535 goto endloop;
b75a7d8f
A
536 }
537 break;
538 case quotePairOne:
539 byteOne=b;
374ca955
A
540 cnv->toUBytes[1]=b;
541 cnv->toULength=2;
b75a7d8f
A
542 state=quotePairTwo;
543 break;
544 case quotePairTwo:
545 *target++=(UChar)((byteOne<<8)|b);
546 if(offsets!=NULL) {
547 *offsets++=sourceIndex;
548 }
549 sourceIndex=nextSourceIndex;
550 state=readCommand;
551 goto fastUnicode;
552 }
553 }
554 }
555endloop:
556
374ca955
A
557 /* set the converter state back into UConverter */
558 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
559 /* reset to deal with the next character */
560 state=readCommand;
561 } else if(state==readCommand) {
562 /* not in a multi-byte sequence, reset toULength */
563 cnv->toULength=0;
b75a7d8f 564 }
374ca955
A
565 scsu->toUIsSingleByteMode=isSingleByteMode;
566 scsu->toUState=state;
567 scsu->toUQuoteWindow=quoteWindow;
568 scsu->toUDynamicWindow=dynamicWindow;
569 scsu->toUByteOne=byteOne;
b75a7d8f 570
b75a7d8f
A
571 /* write back the updated pointers */
572 pArgs->source=(const char *)source;
573 pArgs->target=target;
574 pArgs->offsets=offsets;
575 return;
b75a7d8f
A
576}
577
578/*
579 * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
580 * If a change is made in the original function, then either
581 * change this function the same way or
582 * re-copy the original function and remove the variables
583 * offsets, sourceIndex, and nextSourceIndex.
584 */
f3c0d7a5 585static void U_CALLCONV
b75a7d8f
A
586_SCSUToUnicode(UConverterToUnicodeArgs *pArgs,
587 UErrorCode *pErrorCode) {
588 UConverter *cnv;
589 SCSUData *scsu;
590 const uint8_t *source, *sourceLimit;
591 UChar *target;
592 const UChar *targetLimit;
b75a7d8f
A
593 UBool isSingleByteMode;
594 uint8_t state, byteOne;
595 int8_t quoteWindow, dynamicWindow;
596
597 uint8_t b;
598
599 /* set up the local pointers */
600 cnv=pArgs->converter;
601 scsu=(SCSUData *)cnv->extraInfo;
602
603 source=(const uint8_t *)pArgs->source;
604 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
605 target=pArgs->target;
606 targetLimit=pArgs->targetLimit;
607
608 /* get the state machine state */
609 isSingleByteMode=scsu->toUIsSingleByteMode;
610 state=scsu->toUState;
611 quoteWindow=scsu->toUQuoteWindow;
612 dynamicWindow=scsu->toUDynamicWindow;
613 byteOne=scsu->toUByteOne;
614
615 /*
616 * conversion "loop"
617 *
618 * For performance, this is not a normal C loop.
619 * Instead, there are two code blocks for the two SCSU modes.
620 * The function branches to either one, and a change of the mode is done with a goto to
621 * the other branch.
622 *
623 * Each branch has two conventional loops:
624 * - a fast-path loop for the most common codes in the mode
625 * - a loop for all other codes in the mode
626 * When the fast-path runs into a code that it cannot handle, its loop ends and it
627 * runs into the following loop to handle the other codes.
628 * The end of the input or output buffer is also handled by the slower loop.
629 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
630 *
374ca955
A
631 * The callback handling is done by returning with an error code.
632 * The conversion framework actually calls the callback function.
b75a7d8f 633 */
b75a7d8f
A
634 if(isSingleByteMode) {
635 /* fast path for single-byte mode */
636 if(state==readCommand) {
637fastSingle:
638 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
639 ++source;
640 if(b<=0x7f) {
641 /* write US-ASCII graphic character or DEL */
642 *target++=(UChar)b;
643 } else {
644 /* write from dynamic window */
645 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
646 if(c<=0xffff) {
647 *target++=(UChar)c;
648 } else {
649 /* output surrogate pair */
650 *target++=(UChar)(0xd7c0+(c>>10));
651 if(target<targetLimit) {
652 *target++=(UChar)(0xdc00|(c&0x3ff));
653 } else {
654 /* target overflow */
655 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
656 cnv->UCharErrorBufferLength=1;
657 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
658 goto endloop;
659 }
660 }
661 }
662 }
663 }
664
665 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
666singleByteMode:
667 while(source<sourceLimit) {
668 if(target>=targetLimit) {
669 /* target is full */
670 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
671 break;
672 }
673 b=*source++;
674 switch(state) {
675 case readCommand:
676 /* redundant conditions are commented out */
677 /* here: b<0x20 because otherwise we would be in fastSingle */
678 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
679 /* CR/LF/TAB/NUL */
680 *target++=(UChar)b;
681 goto fastSingle;
682 } else if(SC0<=b) {
683 if(b<=SC7) {
684 dynamicWindow=(int8_t)(b-SC0);
685 goto fastSingle;
686 } else /* if(SD0<=b && b<=SD7) */ {
687 dynamicWindow=(int8_t)(b-SD0);
688 state=defineOne;
689 }
690 } else if(/* SQ0<=b && */ b<=SQ7) {
691 quoteWindow=(int8_t)(b-SQ0);
692 state=quoteOne;
693 } else if(b==SDX) {
694 state=definePairOne;
695 } else if(b==SQU) {
696 state=quotePairOne;
697 } else if(b==SCU) {
698 isSingleByteMode=FALSE;
699 goto fastUnicode;
700 } else /* Srs */ {
701 /* callback(illegal) */
374ca955
A
702 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
703 cnv->toUBytes[0]=b;
704 cnv->toULength=1;
705 goto endloop;
b75a7d8f 706 }
374ca955
A
707
708 /* store the first byte of a multibyte sequence in toUBytes[] */
709 cnv->toUBytes[0]=b;
710 cnv->toULength=1;
b75a7d8f
A
711 break;
712 case quotePairOne:
713 byteOne=b;
374ca955
A
714 cnv->toUBytes[1]=b;
715 cnv->toULength=2;
b75a7d8f
A
716 state=quotePairTwo;
717 break;
718 case quotePairTwo:
719 *target++=(UChar)((byteOne<<8)|b);
720 state=readCommand;
721 goto fastSingle;
722 case quoteOne:
723 if(b<0x80) {
724 /* all static offsets are in the BMP */
725 *target++=(UChar)(staticOffsets[quoteWindow]+b);
726 } else {
727 /* write from dynamic window */
728 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
729 if(c<=0xffff) {
730 *target++=(UChar)c;
731 } else {
732 /* output surrogate pair */
733 *target++=(UChar)(0xd7c0+(c>>10));
734 if(target<targetLimit) {
735 *target++=(UChar)(0xdc00|(c&0x3ff));
736 } else {
737 /* target overflow */
738 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
739 cnv->UCharErrorBufferLength=1;
740 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
741 goto endloop;
742 }
743 }
744 }
745 state=readCommand;
746 goto fastSingle;
747 case definePairOne:
748 dynamicWindow=(int8_t)((b>>5)&7);
749 byteOne=(uint8_t)(b&0x1f);
374ca955
A
750 cnv->toUBytes[1]=b;
751 cnv->toULength=2;
b75a7d8f
A
752 state=definePairTwo;
753 break;
754 case definePairTwo:
755 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
756 state=readCommand;
757 goto fastSingle;
758 case defineOne:
759 if(b==0) {
760 /* callback(illegal): Reserved window offset value 0 */
374ca955
A
761 cnv->toUBytes[1]=b;
762 cnv->toULength=2;
763 goto endloop;
b75a7d8f
A
764 } else if(b<gapThreshold) {
765 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
766 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
767 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
768 } else if(b>=fixedThreshold) {
769 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
770 } else {
771 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
374ca955
A
772 cnv->toUBytes[1]=b;
773 cnv->toULength=2;
774 goto endloop;
b75a7d8f
A
775 }
776 state=readCommand;
777 goto fastSingle;
778 }
779 }
780 } else {
781 /* fast path for Unicode mode */
782 if(state==readCommand) {
783fastUnicode:
784 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
785 *target++=(UChar)((b<<8)|source[1]);
786 source+=2;
787 }
788 }
789
790 /* normal state machine for Unicode mode */
791/* unicodeByteMode: */
792 while(source<sourceLimit) {
793 if(target>=targetLimit) {
794 /* target is full */
795 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
796 break;
797 }
798 b=*source++;
799 switch(state) {
800 case readCommand:
801 if((uint8_t)(b-UC0)>(Urs-UC0)) {
802 byteOne=b;
374ca955
A
803 cnv->toUBytes[0]=b;
804 cnv->toULength=1;
b75a7d8f
A
805 state=quotePairTwo;
806 } else if(/* UC0<=b && */ b<=UC7) {
807 dynamicWindow=(int8_t)(b-UC0);
808 isSingleByteMode=TRUE;
809 goto fastSingle;
810 } else if(/* UD0<=b && */ b<=UD7) {
811 dynamicWindow=(int8_t)(b-UD0);
812 isSingleByteMode=TRUE;
374ca955
A
813 cnv->toUBytes[0]=b;
814 cnv->toULength=1;
b75a7d8f
A
815 state=defineOne;
816 goto singleByteMode;
817 } else if(b==UDX) {
818 isSingleByteMode=TRUE;
374ca955
A
819 cnv->toUBytes[0]=b;
820 cnv->toULength=1;
b75a7d8f
A
821 state=definePairOne;
822 goto singleByteMode;
823 } else if(b==UQU) {
374ca955
A
824 cnv->toUBytes[0]=b;
825 cnv->toULength=1;
b75a7d8f
A
826 state=quotePairOne;
827 } else /* Urs */ {
828 /* callback(illegal) */
374ca955
A
829 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
830 cnv->toUBytes[0]=b;
831 cnv->toULength=1;
832 goto endloop;
b75a7d8f
A
833 }
834 break;
835 case quotePairOne:
836 byteOne=b;
374ca955
A
837 cnv->toUBytes[1]=b;
838 cnv->toULength=2;
b75a7d8f
A
839 state=quotePairTwo;
840 break;
841 case quotePairTwo:
842 *target++=(UChar)((byteOne<<8)|b);
843 state=readCommand;
844 goto fastUnicode;
845 }
846 }
847 }
848endloop:
849
374ca955
A
850 /* set the converter state back into UConverter */
851 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
852 /* reset to deal with the next character */
853 state=readCommand;
854 } else if(state==readCommand) {
855 /* not in a multi-byte sequence, reset toULength */
856 cnv->toULength=0;
b75a7d8f 857 }
374ca955
A
858 scsu->toUIsSingleByteMode=isSingleByteMode;
859 scsu->toUState=state;
860 scsu->toUQuoteWindow=quoteWindow;
861 scsu->toUDynamicWindow=dynamicWindow;
862 scsu->toUByteOne=byteOne;
b75a7d8f 863
b75a7d8f
A
864 /* write back the updated pointers */
865 pArgs->source=(const char *)source;
866 pArgs->target=target;
867 return;
b75a7d8f 868}
f3c0d7a5 869U_CDECL_END
b75a7d8f
A
870/* SCSU-from-Unicode conversion functions ----------------------------------- */
871
872/*
873 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
874 * reasonable results. The lookahead is minimal.
875 * Many cases are simple:
876 * A character fits directly into the current mode, a dynamic or static window,
877 * or is not compressible. These cases are tested first.
878 * Real compression heuristics are applied to the rest, in code branches for
879 * single/Unicode mode and BMP/supplementary code points.
880 * The heuristics used here are extremely simple.
881 */
882
/*
 * Get the number of the window that this character is in, or -1.
 *
 * offsets - start offsets of 8 windows; each window covers offset..offset+0x7f
 * c       - code point to look up
 * Returns the index of the first window containing c, or -1 if none does.
 */
static int8_t
getWindow(const uint32_t offsets[8], uint32_t c) {
    int i;
    for(i=0; i<8; ++i) {
        /* unsigned subtraction wraps for c<offsets[i], so one comparison checks both bounds */
        if((uint32_t)(c-offsets[i])<=0x7f) {
            return (int8_t)(i);
        }
    }
    return -1;
}
894
895/* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
896static UBool
897isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) {
898 return (UBool)(c<=offset+0x7f &&
899 (c>=offset || (c<=0x7f &&
900 (c>=0x20 || (1UL<<c)&0x2601))));
901 /* binary 0010 0110 0000 0001,
902 check for b==0xd || b==0xa || b==9 || b==0 */
903}
904
905/*
906 * getNextDynamicWindow returns the next dynamic window to be redefined
907 */
908static int8_t
909getNextDynamicWindow(SCSUData *scsu) {
910 int8_t window=scsu->windowUse[scsu->nextWindowUseIndex];
911 if(++scsu->nextWindowUseIndex==8) {
912 scsu->nextWindowUseIndex=0;
913 }
914 return window;
915}
916
917/*
918 * useDynamicWindow() adjusts
919 * windowUse[] and nextWindowUseIndex for the algorithm to choose
920 * the next dynamic window to be defined;
921 * a subclass may override it and provide its own algorithm.
922 */
923static void
924useDynamicWindow(SCSUData *scsu, int8_t window) {
925 /*
926 * move the existing window, which just became the most recently used one,
927 * up in windowUse[] to nextWindowUseIndex-1
928 */
929
930 /* first, find the index of the window - backwards to favor the more recently used windows */
931 int i, j;
932
933 i=scsu->nextWindowUseIndex;
934 do {
935 if(--i<0) {
936 i=7;
937 }
938 } while(scsu->windowUse[i]!=window);
939
940 /* now copy each windowUse[i+1] to [i] */
941 j=i+1;
942 if(j==8) {
943 j=0;
944 }
945 while(j!=scsu->nextWindowUseIndex) {
946 scsu->windowUse[i]=scsu->windowUse[j];
947 i=j;
948 if(++j==8) { j=0; }
949 }
950
951 /* finally, set the window into the most recently used index */
952 scsu->windowUse[i]=window;
953}
954
955/*
956 * calculate the offset and the code for a dynamic window that contains the character
957 * takes fixed offsets into account
958 * the offset of the window is stored in the offset variable,
959 * the code is returned
960 *
961 * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code
962 */
963static int
964getDynamicOffset(uint32_t c, uint32_t *pOffset) {
965 int i;
966
967 for(i=0; i<7; ++i) {
968 if((uint32_t)(c-fixedOffsets[i])<=0x7f) {
969 *pOffset=fixedOffsets[i];
970 return 0xf9+i;
971 }
972 }
973
974 if(c<0x80) {
975 /* No dynamic window for US-ASCII. */
976 return -1;
977 } else if(c<0x3400 ||
978 (uint32_t)(c-0x10000)<(0x14000-0x10000) ||
979 (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000)
980 ) {
981 /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
982 *pOffset=c&0x7fffff80;
983 return (int)(c>>7);
984 } else if(0xe000<=c && c!=0xfeff && c<0xfff0) {
985 /* For these characters we need to take the gapOffset into account. */
986 *pOffset=c&0x7fffff80;
987 return (int)((c-gapOffset)>>7);
988 } else {
989 return -1;
990 }
991}
f3c0d7a5 992U_CDECL_BEGIN
b75a7d8f
A
993/*
994 * Idea for compression:
995 * - save SCSUData and other state before really starting work
996 * - at endloop, see if compression could be better with just unicode mode
997 * - don't do this if a callback has been called
998 * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
999 * - different buffer handling!
1000 *
1001 * Drawback or need for corrective handling:
1002 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
1003 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
1004 * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
1005 *
1006 * How to achieve both?
1007 * - Only replace the result after an SDX or SCU?
1008 */
1009
f3c0d7a5 1010static void U_CALLCONV
b75a7d8f
A
1011_SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1012 UErrorCode *pErrorCode) {
1013 UConverter *cnv;
1014 SCSUData *scsu;
1015 const UChar *source, *sourceLimit;
1016 uint8_t *target;
1017 int32_t targetCapacity;
1018 int32_t *offsets;
1019
1020 UBool isSingleByteMode;
1021 uint8_t dynamicWindow;
1022 uint32_t currentOffset;
1023
1024 uint32_t c, delta;
1025
1026 int32_t sourceIndex, nextSourceIndex;
1027
b75a7d8f
A
1028 int32_t length;
1029
1030 /* variables for compression heuristics */
1031 uint32_t offset;
1032 UChar lead, trail;
1033 int code;
1034 int8_t window;
1035
1036 /* set up the local pointers */
1037 cnv=pArgs->converter;
1038 scsu=(SCSUData *)cnv->extraInfo;
1039
1040 /* set up the local pointers */
1041 source=pArgs->source;
1042 sourceLimit=pArgs->sourceLimit;
1043 target=(uint8_t *)pArgs->target;
1044 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1045 offsets=pArgs->offsets;
1046
1047 /* get the state machine state */
1048 isSingleByteMode=scsu->fromUIsSingleByteMode;
1049 dynamicWindow=scsu->fromUDynamicWindow;
1050 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1051
374ca955 1052 c=cnv->fromUChar32;
b75a7d8f
A
1053
1054 /* sourceIndex=-1 if the current character began in the previous buffer */
1055 sourceIndex= c==0 ? 0 : -1;
1056 nextSourceIndex=0;
1057
1058 /* similar conversion "loop" as in toUnicode */
1059loop:
1060 if(isSingleByteMode) {
1061 if(c!=0 && targetCapacity>0) {
1062 goto getTrailSingle;
1063 }
1064
1065 /* state machine for single-byte mode */
1066/* singleByteMode: */
1067 while(source<sourceLimit) {
1068 if(targetCapacity<=0) {
1069 /* target is full */
1070 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1071 break;
1072 }
1073 c=*source++;
1074 ++nextSourceIndex;
1075
1076 if((c-0x20)<=0x5f) {
1077 /* pass US-ASCII graphic character through */
1078 *target++=(uint8_t)c;
1079 if(offsets!=NULL) {
1080 *offsets++=sourceIndex;
1081 }
1082 --targetCapacity;
1083 } else if(c<0x20) {
1084 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1085 /* CR/LF/TAB/NUL */
1086 *target++=(uint8_t)c;
1087 if(offsets!=NULL) {
1088 *offsets++=sourceIndex;
1089 }
1090 --targetCapacity;
1091 } else {
1092 /* quote C0 control character */
1093 c|=SQ0<<8;
1094 length=2;
1095 goto outputBytes;
1096 }
1097 } else if((delta=c-currentOffset)<=0x7f) {
1098 /* use the current dynamic window */
1099 *target++=(uint8_t)(delta|0x80);
1100 if(offsets!=NULL) {
1101 *offsets++=sourceIndex;
1102 }
1103 --targetCapacity;
4388f060
A
1104 } else if(U16_IS_SURROGATE(c)) {
1105 if(U16_IS_SURROGATE_LEAD(c)) {
b75a7d8f
A
1106getTrailSingle:
1107 lead=(UChar)c;
1108 if(source<sourceLimit) {
1109 /* test the following code unit */
1110 trail=*source;
4388f060 1111 if(U16_IS_TRAIL(trail)) {
b75a7d8f
A
1112 ++source;
1113 ++nextSourceIndex;
4388f060 1114 c=U16_GET_SUPPLEMENTARY(c, trail);
b75a7d8f
A
1115 /* convert this surrogate code point */
1116 /* exit this condition tree */
1117 } else {
1118 /* this is an unmatched lead code unit (1st surrogate) */
1119 /* callback(illegal) */
374ca955
A
1120 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1121 goto endloop;
b75a7d8f
A
1122 }
1123 } else {
1124 /* no more input */
1125 break;
1126 }
1127 } else {
1128 /* this is an unmatched trail code unit (2nd surrogate) */
1129 /* callback(illegal) */
374ca955
A
1130 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1131 goto endloop;
b75a7d8f
A
1132 }
1133
1134 /* compress supplementary character U+10000..U+10ffff */
1135 if((delta=c-currentOffset)<=0x7f) {
1136 /* use the current dynamic window */
1137 *target++=(uint8_t)(delta|0x80);
1138 if(offsets!=NULL) {
1139 *offsets++=sourceIndex;
1140 }
1141 --targetCapacity;
1142 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1143 /* there is a dynamic window that contains this character, change to it */
1144 dynamicWindow=window;
1145 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1146 useDynamicWindow(scsu, dynamicWindow);
1147 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1148 length=2;
1149 goto outputBytes;
1150 } else if((code=getDynamicOffset(c, &offset))>=0) {
1151 /* might check if there are more characters in this window to come */
1152 /* define an extended window with this character */
1153 code-=0x200;
1154 dynamicWindow=getNextDynamicWindow(scsu);
1155 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1156 useDynamicWindow(scsu, dynamicWindow);
1157 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1158 length=4;
1159 goto outputBytes;
1160 } else {
1161 /* change to Unicode mode and output this (lead, trail) pair */
1162 isSingleByteMode=FALSE;
1163 *target++=(uint8_t)SCU;
1164 if(offsets!=NULL) {
1165 *offsets++=sourceIndex;
1166 }
1167 --targetCapacity;
1168 c=((uint32_t)lead<<16)|trail;
1169 length=4;
1170 goto outputBytes;
1171 }
1172 } else if(c<0xa0) {
1173 /* quote C1 control character */
1174 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1175 length=2;
1176 goto outputBytes;
1177 } else if(c==0xfeff || c>=0xfff0) {
1178 /* quote signature character=byte order mark and specials */
1179 c|=SQU<<16;
1180 length=3;
1181 goto outputBytes;
1182 } else {
1183 /* compress all other BMP characters */
1184 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1185 /* there is a window defined that contains this character - switch to it or quote from it? */
1186 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1187 /* change to dynamic window */
1188 dynamicWindow=window;
1189 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1190 useDynamicWindow(scsu, dynamicWindow);
1191 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1192 length=2;
1193 goto outputBytes;
1194 } else {
1195 /* quote from dynamic window */
1196 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1197 length=2;
1198 goto outputBytes;
1199 }
1200 } else if((window=getWindow(staticOffsets, c))>=0) {
1201 /* quote from static window */
1202 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1203 length=2;
1204 goto outputBytes;
1205 } else if((code=getDynamicOffset(c, &offset))>=0) {
1206 /* define a dynamic window with this character */
1207 dynamicWindow=getNextDynamicWindow(scsu);
1208 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1209 useDynamicWindow(scsu, dynamicWindow);
1210 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1211 length=3;
1212 goto outputBytes;
1213 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
1214 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1215 ) {
1216 /*
1217 * this character is not compressible (a BMP ideograph or similar);
1218 * switch to Unicode mode if this is the last character in the block
1219 * or there is at least one more ideograph following immediately
1220 */
1221 isSingleByteMode=FALSE;
1222 c|=SCU<<16;
1223 length=3;
1224 goto outputBytes;
1225 } else {
1226 /* quote Unicode */
1227 c|=SQU<<16;
1228 length=3;
1229 goto outputBytes;
1230 }
1231 }
1232
1233 /* normal end of conversion: prepare for a new character */
1234 c=0;
1235 sourceIndex=nextSourceIndex;
1236 }
1237 } else {
1238 if(c!=0 && targetCapacity>0) {
1239 goto getTrailUnicode;
1240 }
1241
1242 /* state machine for Unicode mode */
1243/* unicodeByteMode: */
1244 while(source<sourceLimit) {
1245 if(targetCapacity<=0) {
1246 /* target is full */
1247 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1248 break;
1249 }
1250 c=*source++;
1251 ++nextSourceIndex;
1252
1253 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
1254 /* not compressible, write character directly */
1255 if(targetCapacity>=2) {
1256 *target++=(uint8_t)(c>>8);
1257 *target++=(uint8_t)c;
1258 if(offsets!=NULL) {
1259 *offsets++=sourceIndex;
1260 *offsets++=sourceIndex;
1261 }
1262 targetCapacity-=2;
1263 } else {
1264 length=2;
1265 goto outputBytes;
1266 }
1267 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1268 /* compress BMP character if the following one is not an uncompressible ideograph */
1269 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1270 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
1271 /* ASCII digit or letter */
1272 isSingleByteMode=TRUE;
1273 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1274 length=2;
1275 goto outputBytes;
1276 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1277 /* there is a dynamic window that contains this character, change to it */
1278 isSingleByteMode=TRUE;
1279 dynamicWindow=window;
1280 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1281 useDynamicWindow(scsu, dynamicWindow);
1282 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1283 length=2;
1284 goto outputBytes;
1285 } else if((code=getDynamicOffset(c, &offset))>=0) {
1286 /* define a dynamic window with this character */
1287 isSingleByteMode=TRUE;
1288 dynamicWindow=getNextDynamicWindow(scsu);
1289 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1290 useDynamicWindow(scsu, dynamicWindow);
1291 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1292 length=3;
1293 goto outputBytes;
1294 }
1295 }
1296
1297 /* don't know how to compress this character, just write it directly */
1298 length=2;
1299 goto outputBytes;
1300 } else if(c<0xe000) {
1301 /* c is a surrogate */
4388f060 1302 if(U16_IS_SURROGATE_LEAD(c)) {
b75a7d8f
A
1303getTrailUnicode:
1304 lead=(UChar)c;
1305 if(source<sourceLimit) {
1306 /* test the following code unit */
1307 trail=*source;
4388f060 1308 if(U16_IS_TRAIL(trail)) {
b75a7d8f
A
1309 ++source;
1310 ++nextSourceIndex;
4388f060 1311 c=U16_GET_SUPPLEMENTARY(c, trail);
b75a7d8f
A
1312 /* convert this surrogate code point */
1313 /* exit this condition tree */
1314 } else {
1315 /* this is an unmatched lead code unit (1st surrogate) */
1316 /* callback(illegal) */
374ca955
A
1317 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1318 goto endloop;
b75a7d8f
A
1319 }
1320 } else {
1321 /* no more input */
1322 break;
1323 }
1324 } else {
1325 /* this is an unmatched trail code unit (2nd surrogate) */
1326 /* callback(illegal) */
374ca955
A
1327 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1328 goto endloop;
b75a7d8f
A
1329 }
1330
1331 /* compress supplementary character */
1332 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1333 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1334 ) {
1335 /*
1336 * there is a dynamic window that contains this character and
1337 * the following character is not uncompressible,
1338 * change to the window
1339 */
1340 isSingleByteMode=TRUE;
1341 dynamicWindow=window;
1342 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1343 useDynamicWindow(scsu, dynamicWindow);
1344 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1345 length=2;
1346 goto outputBytes;
1347 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1348 (code=getDynamicOffset(c, &offset))>=0
1349 ) {
1350 /* two supplementary characters in (probably) the same window - define an extended one */
1351 isSingleByteMode=TRUE;
1352 code-=0x200;
1353 dynamicWindow=getNextDynamicWindow(scsu);
1354 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1355 useDynamicWindow(scsu, dynamicWindow);
1356 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1357 length=4;
1358 goto outputBytes;
1359 } else {
1360 /* don't know how to compress this character, just write it directly */
1361 c=((uint32_t)lead<<16)|trail;
1362 length=4;
1363 goto outputBytes;
1364 }
1365 } else /* 0xe000<=c<0xf300 */ {
1366 /* quote to avoid SCSU tags */
1367 c|=UQU<<16;
1368 length=3;
1369 goto outputBytes;
1370 }
1371
1372 /* normal end of conversion: prepare for a new character */
1373 c=0;
1374 sourceIndex=nextSourceIndex;
1375 }
1376 }
1377endloop:
1378
374ca955
A
1379 /* set the converter state back into UConverter */
1380 scsu->fromUIsSingleByteMode=isSingleByteMode;
1381 scsu->fromUDynamicWindow=dynamicWindow;
b75a7d8f 1382
374ca955 1383 cnv->fromUChar32=c;
b75a7d8f 1384
b75a7d8f
A
1385 /* write back the updated pointers */
1386 pArgs->source=source;
1387 pArgs->target=(char *)target;
1388 pArgs->offsets=offsets;
1389 return;
1390
1391outputBytes:
1392 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1393 /* from the first if in the loop we know that targetCapacity>0 */
1394 if(length<=targetCapacity) {
1395 if(offsets==NULL) {
1396 switch(length) {
1397 /* each branch falls through to the next one */
1398 case 4:
1399 *target++=(uint8_t)(c>>24);
2ca993e8
A
1400 U_FALLTHROUGH;
1401 case 3:
b75a7d8f 1402 *target++=(uint8_t)(c>>16);
2ca993e8
A
1403 U_FALLTHROUGH;
1404 case 2:
b75a7d8f 1405 *target++=(uint8_t)(c>>8);
2ca993e8
A
1406 U_FALLTHROUGH;
1407 case 1:
b75a7d8f 1408 *target++=(uint8_t)c;
2ca993e8 1409 U_FALLTHROUGH;
b75a7d8f
A
1410 default:
1411 /* will never occur */
1412 break;
1413 }
1414 } else {
1415 switch(length) {
1416 /* each branch falls through to the next one */
1417 case 4:
1418 *target++=(uint8_t)(c>>24);
1419 *offsets++=sourceIndex;
2ca993e8
A
1420 U_FALLTHROUGH;
1421 case 3:
b75a7d8f
A
1422 *target++=(uint8_t)(c>>16);
1423 *offsets++=sourceIndex;
2ca993e8
A
1424 U_FALLTHROUGH;
1425 case 2:
b75a7d8f
A
1426 *target++=(uint8_t)(c>>8);
1427 *offsets++=sourceIndex;
2ca993e8
A
1428 U_FALLTHROUGH;
1429 case 1:
b75a7d8f
A
1430 *target++=(uint8_t)c;
1431 *offsets++=sourceIndex;
2ca993e8 1432 U_FALLTHROUGH;
b75a7d8f
A
1433 default:
1434 /* will never occur */
1435 break;
1436 }
1437 }
1438 targetCapacity-=length;
1439
1440 /* normal end of conversion: prepare for a new character */
1441 c=0;
1442 sourceIndex=nextSourceIndex;
1443 goto loop;
1444 } else {
1445 uint8_t *p;
1446
1447 /*
1448 * We actually do this backwards here:
1449 * In order to save an intermediate variable, we output
1450 * first to the overflow buffer what does not fit into the
1451 * regular target.
1452 */
73c04bcf
A
1453 /* we know that 0<=targetCapacity<length<=4 */
1454 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
b75a7d8f
A
1455 length-=targetCapacity;
1456 p=(uint8_t *)cnv->charErrorBuffer;
1457 switch(length) {
1458 /* each branch falls through to the next one */
73c04bcf
A
1459 case 4:
1460 *p++=(uint8_t)(c>>24);
2ca993e8
A
1461 U_FALLTHROUGH;
1462 case 3:
b75a7d8f 1463 *p++=(uint8_t)(c>>16);
2ca993e8
A
1464 U_FALLTHROUGH;
1465 case 2:
b75a7d8f 1466 *p++=(uint8_t)(c>>8);
2ca993e8
A
1467 U_FALLTHROUGH;
1468 case 1:
b75a7d8f 1469 *p=(uint8_t)c;
2ca993e8 1470 U_FALLTHROUGH;
b75a7d8f
A
1471 default:
1472 /* will never occur */
1473 break;
1474 }
1475 cnv->charErrorBufferLength=(int8_t)length;
1476
1477 /* now output what fits into the regular target */
1478 c>>=8*length; /* length was reduced by targetCapacity */
1479 switch(targetCapacity) {
1480 /* each branch falls through to the next one */
1481 case 3:
1482 *target++=(uint8_t)(c>>16);
1483 if(offsets!=NULL) {
1484 *offsets++=sourceIndex;
1485 }
2ca993e8
A
1486 U_FALLTHROUGH;
1487 case 2:
b75a7d8f
A
1488 *target++=(uint8_t)(c>>8);
1489 if(offsets!=NULL) {
1490 *offsets++=sourceIndex;
1491 }
2ca993e8
A
1492 U_FALLTHROUGH;
1493 case 1:
b75a7d8f
A
1494 *target++=(uint8_t)c;
1495 if(offsets!=NULL) {
1496 *offsets++=sourceIndex;
1497 }
2ca993e8 1498 U_FALLTHROUGH;
b75a7d8f 1499 default:
b75a7d8f
A
1500 break;
1501 }
1502
1503 /* target overflow */
1504 targetCapacity=0;
1505 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1506 c=0;
1507 goto endloop;
1508 }
b75a7d8f
A
1509}
1510
1511/*
1512 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
1513 * If a change is made in the original function, then either
1514 * change this function the same way or
1515 * re-copy the original function and remove the variables
1516 * offsets, sourceIndex, and nextSourceIndex.
1517 */
f3c0d7a5 1518static void U_CALLCONV
b75a7d8f
A
1519_SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs,
1520 UErrorCode *pErrorCode) {
1521 UConverter *cnv;
1522 SCSUData *scsu;
1523 const UChar *source, *sourceLimit;
1524 uint8_t *target;
1525 int32_t targetCapacity;
1526
1527 UBool isSingleByteMode;
1528 uint8_t dynamicWindow;
1529 uint32_t currentOffset;
1530
1531 uint32_t c, delta;
1532
b75a7d8f
A
1533 int32_t length;
1534
1535 /* variables for compression heuristics */
1536 uint32_t offset;
1537 UChar lead, trail;
1538 int code;
1539 int8_t window;
1540
1541 /* set up the local pointers */
1542 cnv=pArgs->converter;
1543 scsu=(SCSUData *)cnv->extraInfo;
1544
1545 /* set up the local pointers */
1546 source=pArgs->source;
1547 sourceLimit=pArgs->sourceLimit;
1548 target=(uint8_t *)pArgs->target;
1549 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1550
1551 /* get the state machine state */
1552 isSingleByteMode=scsu->fromUIsSingleByteMode;
1553 dynamicWindow=scsu->fromUDynamicWindow;
1554 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1555
374ca955 1556 c=cnv->fromUChar32;
b75a7d8f
A
1557
1558 /* similar conversion "loop" as in toUnicode */
1559loop:
1560 if(isSingleByteMode) {
1561 if(c!=0 && targetCapacity>0) {
1562 goto getTrailSingle;
1563 }
1564
1565 /* state machine for single-byte mode */
1566/* singleByteMode: */
1567 while(source<sourceLimit) {
1568 if(targetCapacity<=0) {
1569 /* target is full */
1570 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1571 break;
1572 }
1573 c=*source++;
1574
1575 if((c-0x20)<=0x5f) {
1576 /* pass US-ASCII graphic character through */
1577 *target++=(uint8_t)c;
1578 --targetCapacity;
1579 } else if(c<0x20) {
1580 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1581 /* CR/LF/TAB/NUL */
1582 *target++=(uint8_t)c;
1583 --targetCapacity;
1584 } else {
1585 /* quote C0 control character */
1586 c|=SQ0<<8;
1587 length=2;
1588 goto outputBytes;
1589 }
1590 } else if((delta=c-currentOffset)<=0x7f) {
1591 /* use the current dynamic window */
1592 *target++=(uint8_t)(delta|0x80);
1593 --targetCapacity;
4388f060
A
1594 } else if(U16_IS_SURROGATE(c)) {
1595 if(U16_IS_SURROGATE_LEAD(c)) {
b75a7d8f
A
1596getTrailSingle:
1597 lead=(UChar)c;
1598 if(source<sourceLimit) {
1599 /* test the following code unit */
1600 trail=*source;
4388f060 1601 if(U16_IS_TRAIL(trail)) {
b75a7d8f 1602 ++source;
4388f060 1603 c=U16_GET_SUPPLEMENTARY(c, trail);
b75a7d8f
A
1604 /* convert this surrogate code point */
1605 /* exit this condition tree */
1606 } else {
1607 /* this is an unmatched lead code unit (1st surrogate) */
1608 /* callback(illegal) */
374ca955
A
1609 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1610 goto endloop;
b75a7d8f
A
1611 }
1612 } else {
1613 /* no more input */
1614 break;
1615 }
1616 } else {
1617 /* this is an unmatched trail code unit (2nd surrogate) */
1618 /* callback(illegal) */
374ca955
A
1619 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1620 goto endloop;
b75a7d8f
A
1621 }
1622
1623 /* compress supplementary character U+10000..U+10ffff */
1624 if((delta=c-currentOffset)<=0x7f) {
1625 /* use the current dynamic window */
1626 *target++=(uint8_t)(delta|0x80);
1627 --targetCapacity;
1628 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1629 /* there is a dynamic window that contains this character, change to it */
1630 dynamicWindow=window;
1631 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1632 useDynamicWindow(scsu, dynamicWindow);
1633 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1634 length=2;
1635 goto outputBytes;
1636 } else if((code=getDynamicOffset(c, &offset))>=0) {
1637 /* might check if there are more characters in this window to come */
1638 /* define an extended window with this character */
1639 code-=0x200;
1640 dynamicWindow=getNextDynamicWindow(scsu);
1641 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1642 useDynamicWindow(scsu, dynamicWindow);
1643 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1644 length=4;
1645 goto outputBytes;
1646 } else {
1647 /* change to Unicode mode and output this (lead, trail) pair */
1648 isSingleByteMode=FALSE;
1649 *target++=(uint8_t)SCU;
1650 --targetCapacity;
1651 c=((uint32_t)lead<<16)|trail;
1652 length=4;
1653 goto outputBytes;
1654 }
1655 } else if(c<0xa0) {
1656 /* quote C1 control character */
1657 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1658 length=2;
1659 goto outputBytes;
1660 } else if(c==0xfeff || c>=0xfff0) {
1661 /* quote signature character=byte order mark and specials */
1662 c|=SQU<<16;
1663 length=3;
1664 goto outputBytes;
1665 } else {
1666 /* compress all other BMP characters */
1667 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1668 /* there is a window defined that contains this character - switch to it or quote from it? */
1669 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1670 /* change to dynamic window */
1671 dynamicWindow=window;
1672 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1673 useDynamicWindow(scsu, dynamicWindow);
1674 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1675 length=2;
1676 goto outputBytes;
1677 } else {
1678 /* quote from dynamic window */
1679 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1680 length=2;
1681 goto outputBytes;
1682 }
1683 } else if((window=getWindow(staticOffsets, c))>=0) {
1684 /* quote from static window */
1685 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1686 length=2;
1687 goto outputBytes;
1688 } else if((code=getDynamicOffset(c, &offset))>=0) {
1689 /* define a dynamic window with this character */
1690 dynamicWindow=getNextDynamicWindow(scsu);
1691 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1692 useDynamicWindow(scsu, dynamicWindow);
1693 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1694 length=3;
1695 goto outputBytes;
1696 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
1697 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1698 ) {
1699 /*
1700 * this character is not compressible (a BMP ideograph or similar);
1701 * switch to Unicode mode if this is the last character in the block
1702 * or there is at least one more ideograph following immediately
1703 */
1704 isSingleByteMode=FALSE;
1705 c|=SCU<<16;
1706 length=3;
1707 goto outputBytes;
1708 } else {
1709 /* quote Unicode */
1710 c|=SQU<<16;
1711 length=3;
1712 goto outputBytes;
1713 }
1714 }
1715
1716 /* normal end of conversion: prepare for a new character */
1717 c=0;
1718 }
1719 } else {
1720 if(c!=0 && targetCapacity>0) {
1721 goto getTrailUnicode;
1722 }
1723
1724 /* state machine for Unicode mode */
1725/* unicodeByteMode: */
1726 while(source<sourceLimit) {
1727 if(targetCapacity<=0) {
1728 /* target is full */
1729 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1730 break;
1731 }
1732 c=*source++;
1733
1734 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
1735 /* not compressible, write character directly */
1736 if(targetCapacity>=2) {
1737 *target++=(uint8_t)(c>>8);
1738 *target++=(uint8_t)c;
1739 targetCapacity-=2;
1740 } else {
1741 length=2;
1742 goto outputBytes;
1743 }
1744 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1745 /* compress BMP character if the following one is not an uncompressible ideograph */
1746 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1747 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
1748 /* ASCII digit or letter */
1749 isSingleByteMode=TRUE;
1750 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1751 length=2;
1752 goto outputBytes;
1753 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1754 /* there is a dynamic window that contains this character, change to it */
1755 isSingleByteMode=TRUE;
1756 dynamicWindow=window;
1757 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1758 useDynamicWindow(scsu, dynamicWindow);
1759 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1760 length=2;
1761 goto outputBytes;
1762 } else if((code=getDynamicOffset(c, &offset))>=0) {
1763 /* define a dynamic window with this character */
1764 isSingleByteMode=TRUE;
1765 dynamicWindow=getNextDynamicWindow(scsu);
1766 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1767 useDynamicWindow(scsu, dynamicWindow);
1768 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1769 length=3;
1770 goto outputBytes;
1771 }
1772 }
1773
1774 /* don't know how to compress this character, just write it directly */
1775 length=2;
1776 goto outputBytes;
1777 } else if(c<0xe000) {
1778 /* c is a surrogate */
4388f060 1779 if(U16_IS_SURROGATE_LEAD(c)) {
b75a7d8f
A
1780getTrailUnicode:
1781 lead=(UChar)c;
1782 if(source<sourceLimit) {
1783 /* test the following code unit */
1784 trail=*source;
4388f060 1785 if(U16_IS_TRAIL(trail)) {
b75a7d8f 1786 ++source;
4388f060 1787 c=U16_GET_SUPPLEMENTARY(c, trail);
b75a7d8f
A
1788 /* convert this surrogate code point */
1789 /* exit this condition tree */
1790 } else {
1791 /* this is an unmatched lead code unit (1st surrogate) */
1792 /* callback(illegal) */
374ca955
A
1793 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1794 goto endloop;
b75a7d8f
A
1795 }
1796 } else {
1797 /* no more input */
1798 break;
1799 }
1800 } else {
1801 /* this is an unmatched trail code unit (2nd surrogate) */
1802 /* callback(illegal) */
374ca955
A
1803 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1804 goto endloop;
b75a7d8f
A
1805 }
1806
1807 /* compress supplementary character */
1808 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1809 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1810 ) {
1811 /*
1812 * there is a dynamic window that contains this character and
1813 * the following character is not uncompressible,
1814 * change to the window
1815 */
1816 isSingleByteMode=TRUE;
1817 dynamicWindow=window;
1818 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1819 useDynamicWindow(scsu, dynamicWindow);
1820 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1821 length=2;
1822 goto outputBytes;
1823 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1824 (code=getDynamicOffset(c, &offset))>=0
1825 ) {
1826 /* two supplementary characters in (probably) the same window - define an extended one */
1827 isSingleByteMode=TRUE;
1828 code-=0x200;
1829 dynamicWindow=getNextDynamicWindow(scsu);
1830 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1831 useDynamicWindow(scsu, dynamicWindow);
1832 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1833 length=4;
1834 goto outputBytes;
1835 } else {
1836 /* don't know how to compress this character, just write it directly */
1837 c=((uint32_t)lead<<16)|trail;
1838 length=4;
1839 goto outputBytes;
1840 }
1841 } else /* 0xe000<=c<0xf300 */ {
1842 /* quote to avoid SCSU tags */
1843 c|=UQU<<16;
1844 length=3;
1845 goto outputBytes;
1846 }
1847
1848 /* normal end of conversion: prepare for a new character */
1849 c=0;
1850 }
1851 }
1852endloop:
1853
374ca955
A
1854 /* set the converter state back into UConverter */
1855 scsu->fromUIsSingleByteMode=isSingleByteMode;
1856 scsu->fromUDynamicWindow=dynamicWindow;
b75a7d8f 1857
374ca955 1858 cnv->fromUChar32=c;
b75a7d8f 1859
b75a7d8f
A
1860 /* write back the updated pointers */
1861 pArgs->source=source;
1862 pArgs->target=(char *)target;
1863 return;
1864
1865outputBytes:
1866 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1867 /* from the first if in the loop we know that targetCapacity>0 */
1868 if(length<=targetCapacity) {
1869 switch(length) {
1870 /* each branch falls through to the next one */
1871 case 4:
1872 *target++=(uint8_t)(c>>24);
2ca993e8
A
1873 U_FALLTHROUGH;
1874 case 3:
b75a7d8f 1875 *target++=(uint8_t)(c>>16);
2ca993e8
A
1876 U_FALLTHROUGH;
1877 case 2:
b75a7d8f 1878 *target++=(uint8_t)(c>>8);
2ca993e8
A
1879 U_FALLTHROUGH;
1880 case 1:
b75a7d8f 1881 *target++=(uint8_t)c;
2ca993e8 1882 U_FALLTHROUGH;
b75a7d8f
A
1883 default:
1884 /* will never occur */
1885 break;
1886 }
1887 targetCapacity-=length;
1888
1889 /* normal end of conversion: prepare for a new character */
1890 c=0;
1891 goto loop;
1892 } else {
1893 uint8_t *p;
1894
1895 /*
1896 * We actually do this backwards here:
1897 * In order to save an intermediate variable, we output
1898 * first to the overflow buffer what does not fit into the
1899 * regular target.
1900 */
73c04bcf
A
1901 /* we know that 0<=targetCapacity<length<=4 */
1902 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
b75a7d8f
A
1903 length-=targetCapacity;
1904 p=(uint8_t *)cnv->charErrorBuffer;
1905 switch(length) {
1906 /* each branch falls through to the next one */
73c04bcf
A
1907 case 4:
1908 *p++=(uint8_t)(c>>24);
2ca993e8
A
1909 U_FALLTHROUGH;
1910 case 3:
b75a7d8f 1911 *p++=(uint8_t)(c>>16);
2ca993e8
A
1912 U_FALLTHROUGH;
1913 case 2:
b75a7d8f 1914 *p++=(uint8_t)(c>>8);
2ca993e8
A
1915 U_FALLTHROUGH;
1916 case 1:
b75a7d8f 1917 *p=(uint8_t)c;
2ca993e8 1918 U_FALLTHROUGH;
b75a7d8f
A
1919 default:
1920 /* will never occur */
1921 break;
1922 }
1923 cnv->charErrorBufferLength=(int8_t)length;
1924
1925 /* now output what fits into the regular target */
1926 c>>=8*length; /* length was reduced by targetCapacity */
1927 switch(targetCapacity) {
1928 /* each branch falls through to the next one */
1929 case 3:
1930 *target++=(uint8_t)(c>>16);
2ca993e8
A
1931 U_FALLTHROUGH;
1932 case 2:
b75a7d8f 1933 *target++=(uint8_t)(c>>8);
2ca993e8
A
1934 U_FALLTHROUGH;
1935 case 1:
b75a7d8f 1936 *target++=(uint8_t)c;
2ca993e8 1937 U_FALLTHROUGH;
b75a7d8f 1938 default:
b75a7d8f
A
1939 break;
1940 }
1941
1942 /* target overflow */
1943 targetCapacity=0;
1944 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1945 c=0;
1946 goto endloop;
1947 }
b75a7d8f
A
1948}
1949
1950/* miscellaneous ------------------------------------------------------------ */
1951
f3c0d7a5 1952static const char * U_CALLCONV
b75a7d8f
A
1953_SCSUGetName(const UConverter *cnv) {
1954 SCSUData *scsu=(SCSUData *)cnv->extraInfo;
1955
1956 switch(scsu->locale) {
1957 case l_ja:
1958 return "SCSU,locale=ja";
1959 default:
1960 return "SCSU";
1961 }
1962}
1963
b75a7d8f 1964/* structure for SafeClone calculations */
374ca955 1965struct cloneSCSUStruct
b75a7d8f
A
1966{
1967 UConverter cnv;
1968 SCSUData mydata;
1969};
1970
f3c0d7a5 1971static UConverter * U_CALLCONV
b75a7d8f
A
1972_SCSUSafeClone(const UConverter *cnv,
1973 void *stackBuffer,
1974 int32_t *pBufferSize,
1975 UErrorCode *status)
1976{
374ca955
A
1977 struct cloneSCSUStruct * localClone;
1978 int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct);
b75a7d8f
A
1979
1980 if (U_FAILURE(*status)){
1981 return 0;
1982 }
1983
1984 if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
1985 *pBufferSize = bufferSizeNeeded;
1986 return 0;
1987 }
1988
374ca955
A
1989 localClone = (struct cloneSCSUStruct *)stackBuffer;
1990 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
b75a7d8f
A
1991
1992 uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData));
1993 localClone->cnv.extraInfo = &localClone->mydata;
1994 localClone->cnv.isExtraLocal = TRUE;
1995
1996 return &localClone->cnv;
1997}
f3c0d7a5 1998U_CDECL_END
b75a7d8f 1999
b75a7d8f
A
2000static const UConverterImpl _SCSUImpl={
2001 UCNV_SCSU,
2002
2003 NULL,
2004 NULL,
2005
2006 _SCSUOpen,
2007 _SCSUClose,
2008 _SCSUReset,
2009
2010 _SCSUToUnicode,
2011 _SCSUToUnicodeWithOffsets,
2012 _SCSUFromUnicode,
2013 _SCSUFromUnicodeWithOffsets,
374ca955 2014 NULL,
b75a7d8f
A
2015
2016 NULL,
2017 _SCSUGetName,
73c04bcf 2018 NULL,
b75a7d8f 2019 _SCSUSafeClone,
f3c0d7a5
A
2020 ucnv_getCompleteUnicodeSet,
2021 NULL,
2022 NULL
b75a7d8f
A
2023};
2024
2025static const UConverterStaticData _SCSUStaticData={
2026 sizeof(UConverterStaticData),
2027 "SCSU",
73c04bcf 2028 1212, /* CCSID for SCSU */
b75a7d8f
A
2029 UCNV_IBM, UCNV_SCSU,
2030 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
374ca955 2031 /*
73c04bcf
A
2032 * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
2033 * substitution string.
374ca955
A
2034 */
2035 { 0x0e, 0xff, 0xfd, 0 }, 3,
b75a7d8f
A
2036 FALSE, FALSE,
2037 0,
2038 0,
2039 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
2040};
2041
2ca993e8
A
2042const UConverterSharedData _SCSUData=
2043 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_SCSUStaticData, &_SCSUImpl);
b75a7d8f 2044
374ca955 2045#endif