/*
******************************************************************************
*
*   Copyright (C) 2002-2004, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*   file name:  ucnvbocu.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002mar27
*   created by: Markus W. Scherer
*
*   This is an implementation of the Binary Ordered Compression for Unicode,
*   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
*/
19
20#include "unicode/utypes.h"
374ca955
A
21
22#if !UCONFIG_NO_CONVERSION
23
b75a7d8f
A
24#include "unicode/ucnv.h"
25#include "unicode/ucnv_cb.h"
26#include "ucnv_bld.h"
27#include "ucnv_cnv.h"
28
/* BOCU-1 constants and macros ---------------------------------------------- */

/*
 * BOCU-1 encodes the code points of a Unicode string as
 * a sequence of byte-encoded differences (slope detection),
 * preserving lexical order.
 *
 * Optimize the difference-taking for runs of Unicode text within
 * small scripts:
 *
 * Most small scripts are allocated within aligned 128-blocks of Unicode
 * code points. Lexical order is preserved if the "previous code point" state
 * is always moved into the middle of such a block.
 *
 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
 * areas into the middle of those areas.
 *
 * C0 control codes and space are encoded with their US-ASCII bytes.
 * "prev" is reset for C0 controls but not for space.
 */

/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/* bounding byte values for differences */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe
#define BOCU1_MAX_TRAIL         0xff
#define BOCU1_RESET             0xff

/* number of lead bytes */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* adjust trail byte counts for the use of some C0 control byte values */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE            64

/* number of lead bytes for positive and negative 2/3/4-byte sequences */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* The difference value range for single-byters. */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* The difference value range for double-byters. */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters. */
#define BOCU1_REACH_POS_3   \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* The lead byte start values. */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */

/* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/* The length of a byte sequence, according to its packed form. */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)

/*
 * 12 commonly used C0 control codes (and space) are only used to encode
 * themselves directly,
 * which makes BOCU-1 MIME-usable and reasonably safe for
 * ASCII-oriented software.
 *
 * These controls are
 *  0   NUL
 *
 *  7   BEL
 *  8   BS
 *
 *  9   TAB
 *  a   LF
 *  b   VT
 *  c   FF
 *  d   CR
 *
 *  e   SO
 *  f   SI
 *
 * 1a   SUB
 * 1b   ESC
 *
 * The other 20 C0 controls are also encoded directly (to preserve order)
 * but are also used as trail bytes in difference encoding
 * (for better compression).
 */
#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])

/*
 * Byte value map for control codes,
 * from external byte values 0x00..0x20
 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
 * External byte values that are illegal as trail bytes are mapped to -1.
 */
static const int8_t
bocu1ByteToTrail[BOCU1_MIN]={
/*  0     1     2     3     4     5     6     7    */
    -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,

/*  8     9     a     b     c     d     e     f    */
    -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,

/*  10    11    12    13    14    15    16    17   */
    0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,

/*  18    19    1a    1b    1c    1d    1e    1f   */
    0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,

/*  20   */
    -1
};

/*
 * Byte value map for control codes,
 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
 * to external byte values 0x00..0x20.
 */
static const int8_t
bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
/*  0     1     2     3     4     5     6     7    */
    0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,

/*  8     9     a     b     c     d     e     f    */
    0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,

/*  10    11    12    13   */
    0x1c, 0x1d, 0x1e, 0x1f
};
186
/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always FALSE.
 *
 * The expansion is wrapped in do { } while(0) so that the macro behaves like
 * a single statement (usable in an unbraced if/else); invoke it with a
 * trailing semicolon.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
208
209/* BOCU-1 implementation functions ------------------------------------------ */
210
211#define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
212
213/**
214 * Compute the next "previous" value for differencing
215 * from the current code point.
216 *
217 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
218 * @return "previous code point" state value
219 */
220static U_INLINE int32_t
221bocu1Prev(int32_t c) {
222 /* compute new prev */
223 if(/* 0x3040<=c && */ c<=0x309f) {
224 /* Hiragana is not 128-aligned */
225 return 0x3070;
226 } else if(0x4e00<=c && c<=0x9fa5) {
227 /* CJK Unihan */
228 return 0x4e00-BOCU1_REACH_NEG_2;
229 } else if(0xac00<=c /* && c<=0xd7a3 */) {
230 /* Korean Hangul */
231 return (0xd7a3+0xac00)/2;
232 } else {
233 /* mostly small scripts */
234 return BOCU1_SIMPLE_PREV(c);
235 }
236}
237
238/** Fast version of bocu1Prev() for most scripts. */
239#define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
240
/*
 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
 * The UConverter fields are used as follows:
 *
 * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
 * fromUChar32          encoder's pending lead surrogate (0 if there is none)
 *
 * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
 * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
 */
250
251/* BOCU-1-from-Unicode conversion functions --------------------------------- */
252
253/**
254 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
255 * and return a packed integer with them.
256 *
257 * The encoding favors small absolut differences with short encodings
258 * to compress runs of same-script characters.
259 *
260 * Optimized version with unrolled loops and fewer floating-point operations
261 * than the standard packDiff().
262 *
263 * @param diff difference value -0x10ffff..0x10ffff
264 * @return
265 * 0x010000zz for 1-byte sequence zz
266 * 0x0200yyzz for 2-byte sequence yy zz
267 * 0x03xxyyzz for 3-byte sequence xx yy zz
268 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
269 */
270static int32_t
271packDiff(int32_t diff) {
272 int32_t result, m;
273
274 if(diff>=BOCU1_REACH_NEG_1) {
275 /* mostly positive differences, and single-byte negative ones */
276#if 0 /* single-byte case handled in macros, see below */
277 if(diff<=BOCU1_REACH_POS_1) {
278 /* single byte */
279 return 0x01000000|(BOCU1_MIDDLE+diff);
280 } else
281#endif
282 if(diff<=BOCU1_REACH_POS_2) {
283 /* two bytes */
284 diff-=BOCU1_REACH_POS_1+1;
285 result=0x02000000;
286
287 m=diff%BOCU1_TRAIL_COUNT;
288 diff/=BOCU1_TRAIL_COUNT;
289 result|=BOCU1_TRAIL_TO_BYTE(m);
290
291 result|=(BOCU1_START_POS_2+diff)<<8;
292 } else if(diff<=BOCU1_REACH_POS_3) {
293 /* three bytes */
294 diff-=BOCU1_REACH_POS_2+1;
295 result=0x03000000;
296
297 m=diff%BOCU1_TRAIL_COUNT;
298 diff/=BOCU1_TRAIL_COUNT;
299 result|=BOCU1_TRAIL_TO_BYTE(m);
300
301 m=diff%BOCU1_TRAIL_COUNT;
302 diff/=BOCU1_TRAIL_COUNT;
303 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
304
305 result|=(BOCU1_START_POS_3+diff)<<16;
306 } else {
307 /* four bytes */
308 diff-=BOCU1_REACH_POS_3+1;
309
310 m=diff%BOCU1_TRAIL_COUNT;
311 diff/=BOCU1_TRAIL_COUNT;
312 result=BOCU1_TRAIL_TO_BYTE(m);
313
314 m=diff%BOCU1_TRAIL_COUNT;
315 diff/=BOCU1_TRAIL_COUNT;
316 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
317
318 /*
319 * We know that / and % would deliver quotient 0 and rest=diff.
320 * Avoid division and modulo for performance.
321 */
322 result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
323
324 result|=((uint32_t)BOCU1_START_POS_4)<<24;
325 }
326 } else {
327 /* two- to four-byte negative differences */
328 if(diff>=BOCU1_REACH_NEG_2) {
329 /* two bytes */
330 diff-=BOCU1_REACH_NEG_1;
331 result=0x02000000;
332
333 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
334 result|=BOCU1_TRAIL_TO_BYTE(m);
335
336 result|=(BOCU1_START_NEG_2+diff)<<8;
337 } else if(diff>=BOCU1_REACH_NEG_3) {
338 /* three bytes */
339 diff-=BOCU1_REACH_NEG_2;
340 result=0x03000000;
341
342 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
343 result|=BOCU1_TRAIL_TO_BYTE(m);
344
345 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
346 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
347
348 result|=(BOCU1_START_NEG_3+diff)<<16;
349 } else {
350 /* four bytes */
351 diff-=BOCU1_REACH_NEG_3;
352
353 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
354 result=BOCU1_TRAIL_TO_BYTE(m);
355
356 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
357 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
358
359 /*
360 * We know that NEGDIVMOD would deliver
361 * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
362 * Avoid division and modulo for performance.
363 */
364 m=diff+BOCU1_TRAIL_COUNT;
365 result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
366
367 result|=BOCU1_MIN<<24;
368 }
369 }
370 return result;
371}
372
/* Faster versions of packDiff() for single-byte-encoded diff values. */

/** Is a diff value encodable in a single byte? */
#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)

/** Encode a diff value in a single byte. */
#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))

/** Is a diff value encodable in two bytes? */
#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
383
384static void
385_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
386 UErrorCode *pErrorCode) {
387 UConverter *cnv;
388 const UChar *source, *sourceLimit;
389 uint8_t *target;
390 int32_t targetCapacity;
391 int32_t *offsets;
392
393 int32_t prev, c, diff;
394
395 int32_t sourceIndex, nextSourceIndex;
396
397U_ALIGN_CODE(16)
398
399 /* set up the local pointers */
400 cnv=pArgs->converter;
401 source=pArgs->source;
402 sourceLimit=pArgs->sourceLimit;
403 target=(uint8_t *)pArgs->target;
404 targetCapacity=pArgs->targetLimit-pArgs->target;
405 offsets=pArgs->offsets;
406
407 /* get the converter state from UConverter */
374ca955 408 c=cnv->fromUChar32;
b75a7d8f
A
409 prev=(int32_t)cnv->fromUnicodeStatus;
410 if(prev==0) {
411 prev=BOCU1_ASCII_PREV;
412 }
413
414 /* sourceIndex=-1 if the current character began in the previous buffer */
415 sourceIndex= c==0 ? 0 : -1;
416 nextSourceIndex=0;
417
418 /* conversion loop */
419 if(c!=0 && targetCapacity>0) {
420 goto getTrail;
421 }
422
423fastSingle:
424 /* fast loop for single-byte differences */
425 /* use only one loop counter variable, targetCapacity, not also source */
426 diff=sourceLimit-source;
427 if(targetCapacity>diff) {
428 targetCapacity=diff;
429 }
374ca955
A
430 while(targetCapacity>0 && (c=*source)<0x3000) {
431 if(c<=0x20) {
432 if(c!=0x20) {
433 prev=BOCU1_ASCII_PREV;
b75a7d8f 434 }
374ca955
A
435 *target++=(uint8_t)c;
436 *offsets++=nextSourceIndex++;
b75a7d8f
A
437 ++source;
438 --targetCapacity;
374ca955
A
439 } else {
440 diff=c-prev;
441 if(DIFF_IS_SINGLE(diff)) {
442 prev=BOCU1_SIMPLE_PREV(c);
443 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
b75a7d8f
A
444 *offsets++=nextSourceIndex++;
445 ++source;
446 --targetCapacity;
447 } else {
374ca955 448 break;
b75a7d8f
A
449 }
450 }
451 }
452 /* restore real values */
453 targetCapacity=(const uint8_t *)pArgs->targetLimit-target;
454 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
455
456 /* regular loop for all cases */
457 while(source<sourceLimit) {
458 if(targetCapacity>0) {
459 c=*source++;
460 ++nextSourceIndex;
461
462 if(c<=0x20) {
463 /*
464 * ISO C0 control & space:
465 * Encode directly for MIME compatibility,
466 * and reset state except for space, to not disrupt compression.
467 */
468 if(c!=0x20) {
469 prev=BOCU1_ASCII_PREV;
470 }
471 *target++=(uint8_t)c;
374ca955 472 *offsets++=sourceIndex;
b75a7d8f
A
473 --targetCapacity;
474
475 sourceIndex=nextSourceIndex;
476 continue;
477 }
478
479 if(UTF_IS_LEAD(c)) {
480getTrail:
481 if(source<sourceLimit) {
482 /* test the following code unit */
483 UChar trail=*source;
484 if(UTF_IS_SECOND_SURROGATE(trail)) {
485 ++source;
486 ++nextSourceIndex;
487 c=UTF16_GET_PAIR_VALUE(c, trail);
488 }
489 } else {
490 /* no more input */
491 c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
492 break;
493 }
494 }
495
496 /*
497 * all other Unicode code points c==U+0021..U+10ffff
498 * are encoded with the difference c-prev
499 *
500 * a new prev is computed from c,
501 * placed in the middle of a 0x80-block (for most small scripts) or
502 * in the middle of the Unihan and Hangul blocks
503 * to statistically minimize the following difference
504 */
505 diff=c-prev;
506 prev=BOCU1_PREV(c);
507 if(DIFF_IS_SINGLE(diff)) {
508 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
374ca955 509 *offsets++=sourceIndex;
b75a7d8f
A
510 --targetCapacity;
511 sourceIndex=nextSourceIndex;
512 if(c<0x3000) {
513 goto fastSingle;
514 }
515 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
516 /* optimize 2-byte case */
517 int32_t m;
518
519 if(diff>=0) {
520 diff-=BOCU1_REACH_POS_1+1;
521 m=diff%BOCU1_TRAIL_COUNT;
522 diff/=BOCU1_TRAIL_COUNT;
523 diff+=BOCU1_START_POS_2;
524 } else {
525 diff-=BOCU1_REACH_NEG_1;
526 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
527 diff+=BOCU1_START_NEG_2;
528 }
529 *target++=(uint8_t)diff;
530 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
374ca955
A
531 *offsets++=sourceIndex;
532 *offsets++=sourceIndex;
b75a7d8f
A
533 targetCapacity-=2;
534 sourceIndex=nextSourceIndex;
535 } else {
536 int32_t length; /* will be 2..4 */
537
538 diff=packDiff(diff);
539 length=BOCU1_LENGTH_FROM_PACKED(diff);
540
541 /* write the output character bytes from diff and length */
542 /* from the first if in the loop we know that targetCapacity>0 */
543 if(length<=targetCapacity) {
374ca955
A
544 switch(length) {
545 /* each branch falls through to the next one */
546 case 4:
547 *target++=(uint8_t)(diff>>24);
548 *offsets++=sourceIndex;
549 case 3:
550 *target++=(uint8_t)(diff>>16);
551 *offsets++=sourceIndex;
552 case 2:
553 *target++=(uint8_t)(diff>>8);
554 *offsets++=sourceIndex;
555 /* case 1: handled above */
556 *target++=(uint8_t)diff;
557 *offsets++=sourceIndex;
558 default:
559 /* will never occur */
560 break;
b75a7d8f
A
561 }
562 targetCapacity-=length;
563 sourceIndex=nextSourceIndex;
564 } else {
565 uint8_t *charErrorBuffer;
566
567 /*
568 * We actually do this backwards here:
569 * In order to save an intermediate variable, we output
570 * first to the overflow buffer what does not fit into the
571 * regular target.
572 */
573 /* we know that 1<=targetCapacity<length<=4 */
574 length-=targetCapacity;
575 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
576 switch(length) {
577 /* each branch falls through to the next one */
578 case 3:
579 *charErrorBuffer++=(uint8_t)(diff>>16);
580 case 2:
581 *charErrorBuffer++=(uint8_t)(diff>>8);
582 case 1:
583 *charErrorBuffer=(uint8_t)diff;
584 default:
585 /* will never occur */
586 break;
587 }
588 cnv->charErrorBufferLength=(int8_t)length;
589
590 /* now output what fits into the regular target */
591 diff>>=8*length; /* length was reduced by targetCapacity */
592 switch(targetCapacity) {
593 /* each branch falls through to the next one */
594 case 3:
595 *target++=(uint8_t)(diff>>16);
374ca955 596 *offsets++=sourceIndex;
b75a7d8f
A
597 case 2:
598 *target++=(uint8_t)(diff>>8);
374ca955 599 *offsets++=sourceIndex;
b75a7d8f
A
600 case 1:
601 *target++=(uint8_t)diff;
374ca955 602 *offsets++=sourceIndex;
b75a7d8f
A
603 default:
604 /* will never occur */
605 break;
606 }
607
608 /* target overflow */
609 targetCapacity=0;
610 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
611 break;
612 }
613 }
614 } else {
615 /* target is full */
616 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
617 break;
618 }
619 }
620
374ca955
A
621 /* set the converter state back into UConverter */
622 cnv->fromUChar32= c<0 ? -c : 0;
623 cnv->fromUnicodeStatus=(uint32_t)prev;
b75a7d8f
A
624
625 /* write back the updated pointers */
626 pArgs->source=source;
627 pArgs->target=(char *)target;
628 pArgs->offsets=offsets;
629}
630
631/*
632 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
633 * If a change is made in the original function, then either
634 * change this function the same way or
635 * re-copy the original function and remove the variables
636 * offsets, sourceIndex, and nextSourceIndex.
637 */
638static void
639_Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
640 UErrorCode *pErrorCode) {
641 UConverter *cnv;
642 const UChar *source, *sourceLimit;
643 uint8_t *target;
644 int32_t targetCapacity;
645
646 int32_t prev, c, diff;
647
648 /* set up the local pointers */
649 cnv=pArgs->converter;
650 source=pArgs->source;
651 sourceLimit=pArgs->sourceLimit;
652 target=(uint8_t *)pArgs->target;
653 targetCapacity=pArgs->targetLimit-pArgs->target;
654
655 /* get the converter state from UConverter */
374ca955 656 c=cnv->fromUChar32;
b75a7d8f
A
657 prev=(int32_t)cnv->fromUnicodeStatus;
658 if(prev==0) {
659 prev=BOCU1_ASCII_PREV;
660 }
661
662 /* conversion loop */
663 if(c!=0 && targetCapacity>0) {
664 goto getTrail;
665 }
666
667fastSingle:
668 /* fast loop for single-byte differences */
669 /* use only one loop counter variable, targetCapacity, not also source */
670 diff=sourceLimit-source;
671 if(targetCapacity>diff) {
672 targetCapacity=diff;
673 }
674 while(targetCapacity>0 && (c=*source)<0x3000) {
675 if(c<=0x20) {
676 if(c!=0x20) {
677 prev=BOCU1_ASCII_PREV;
678 }
679 *target++=(uint8_t)c;
680 } else {
681 diff=c-prev;
682 if(DIFF_IS_SINGLE(diff)) {
683 prev=BOCU1_SIMPLE_PREV(c);
684 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
685 } else {
686 break;
687 }
688 }
689 ++source;
690 --targetCapacity;
691 }
692 /* restore real values */
693 targetCapacity=(const uint8_t *)pArgs->targetLimit-target;
694
695 /* regular loop for all cases */
696 while(source<sourceLimit) {
697 if(targetCapacity>0) {
698 c=*source++;
699
700 if(c<=0x20) {
701 /*
702 * ISO C0 control & space:
703 * Encode directly for MIME compatibility,
704 * and reset state except for space, to not disrupt compression.
705 */
706 if(c!=0x20) {
707 prev=BOCU1_ASCII_PREV;
708 }
709 *target++=(uint8_t)c;
710 --targetCapacity;
711 continue;
712 }
713
714 if(UTF_IS_LEAD(c)) {
715getTrail:
716 if(source<sourceLimit) {
717 /* test the following code unit */
718 UChar trail=*source;
719 if(UTF_IS_SECOND_SURROGATE(trail)) {
720 ++source;
721 c=UTF16_GET_PAIR_VALUE(c, trail);
722 }
723 } else {
724 /* no more input */
725 c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
726 break;
727 }
728 }
729
730 /*
731 * all other Unicode code points c==U+0021..U+10ffff
732 * are encoded with the difference c-prev
733 *
734 * a new prev is computed from c,
735 * placed in the middle of a 0x80-block (for most small scripts) or
736 * in the middle of the Unihan and Hangul blocks
737 * to statistically minimize the following difference
738 */
739 diff=c-prev;
740 prev=BOCU1_PREV(c);
741 if(DIFF_IS_SINGLE(diff)) {
742 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
743 --targetCapacity;
744 if(c<0x3000) {
745 goto fastSingle;
746 }
747 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
748 /* optimize 2-byte case */
749 int32_t m;
750
751 if(diff>=0) {
752 diff-=BOCU1_REACH_POS_1+1;
753 m=diff%BOCU1_TRAIL_COUNT;
754 diff/=BOCU1_TRAIL_COUNT;
755 diff+=BOCU1_START_POS_2;
756 } else {
757 diff-=BOCU1_REACH_NEG_1;
758 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
759 diff+=BOCU1_START_NEG_2;
760 }
761 *target++=(uint8_t)diff;
762 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
763 targetCapacity-=2;
764 } else {
765 int32_t length; /* will be 2..4 */
766
767 diff=packDiff(diff);
768 length=BOCU1_LENGTH_FROM_PACKED(diff);
769
770 /* write the output character bytes from diff and length */
771 /* from the first if in the loop we know that targetCapacity>0 */
772 if(length<=targetCapacity) {
773 switch(length) {
774 /* each branch falls through to the next one */
775 case 4:
776 *target++=(uint8_t)(diff>>24);
777 case 3:
778 *target++=(uint8_t)(diff>>16);
779 /* case 2: handled above */
780 *target++=(uint8_t)(diff>>8);
781 /* case 1: handled above */
782 *target++=(uint8_t)diff;
783 default:
784 /* will never occur */
785 break;
786 }
787 targetCapacity-=length;
788 } else {
789 uint8_t *charErrorBuffer;
790
791 /*
792 * We actually do this backwards here:
793 * In order to save an intermediate variable, we output
794 * first to the overflow buffer what does not fit into the
795 * regular target.
796 */
797 /* we know that 1<=targetCapacity<length<=4 */
798 length-=targetCapacity;
799 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
800 switch(length) {
801 /* each branch falls through to the next one */
802 case 3:
803 *charErrorBuffer++=(uint8_t)(diff>>16);
804 case 2:
805 *charErrorBuffer++=(uint8_t)(diff>>8);
806 case 1:
807 *charErrorBuffer=(uint8_t)diff;
808 default:
809 /* will never occur */
810 break;
811 }
812 cnv->charErrorBufferLength=(int8_t)length;
813
814 /* now output what fits into the regular target */
815 diff>>=8*length; /* length was reduced by targetCapacity */
816 switch(targetCapacity) {
817 /* each branch falls through to the next one */
818 case 3:
819 *target++=(uint8_t)(diff>>16);
820 case 2:
821 *target++=(uint8_t)(diff>>8);
822 case 1:
823 *target++=(uint8_t)diff;
824 default:
825 /* will never occur */
826 break;
827 }
828
829 /* target overflow */
830 targetCapacity=0;
831 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
832 break;
833 }
834 }
835 } else {
836 /* target is full */
837 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
838 break;
839 }
840 }
841
374ca955
A
842 /* set the converter state back into UConverter */
843 cnv->fromUChar32= c<0 ? -c : 0;
844 cnv->fromUnicodeStatus=(uint32_t)prev;
b75a7d8f
A
845
846 /* write back the updated pointers */
847 pArgs->source=source;
848 pArgs->target=(char *)target;
849}
850
851/* BOCU-1-to-Unicode conversion functions ----------------------------------- */
852
853/**
854 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
855 *
856 * @param b lead byte;
857 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
858 * @return (diff<<2)|count
859 */
860static U_INLINE int32_t
861decodeBocu1LeadByte(int32_t b) {
862 int32_t diff, count;
863
864 if(b>=BOCU1_START_NEG_2) {
865 /* positive difference */
866 if(b<BOCU1_START_POS_3) {
867 /* two bytes */
868 diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
869 count=1;
870 } else if(b<BOCU1_START_POS_4) {
871 /* three bytes */
872 diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
873 count=2;
874 } else {
875 /* four bytes */
876 diff=BOCU1_REACH_POS_3+1;
877 count=3;
878 }
879 } else {
880 /* negative difference */
881 if(b>=BOCU1_START_NEG_3) {
882 /* two bytes */
883 diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
884 count=1;
885 } else if(b>BOCU1_MIN) {
886 /* three bytes */
887 diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
888 count=2;
889 } else {
890 /* four bytes */
891 diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
892 count=3;
893 }
894 }
895
896 /* return the state for decoding the trail byte(s) */
897 return (diff<<2)|count;
898}
899
900/**
901 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
902 *
903 * @param count number of remaining trail bytes including this one
904 * @param b trail byte
905 * @return new delta for diff including b - <0 indicates an error
906 *
907 * @see decodeBocu1
908 */
909static U_INLINE int32_t
910decodeBocu1TrailByte(int32_t count, int32_t b) {
911 if(b<=0x20) {
912 /* skip some C0 controls and make the trail byte range contiguous */
913 b=bocu1ByteToTrail[b];
914 /* b<0 for an illegal trail byte value will result in return<0 below */
915#if BOCU1_MAX_TRAIL<0xff
916 } else if(b>BOCU1_MAX_TRAIL) {
917 return -99;
918#endif
919 } else {
920 b-=BOCU1_TRAIL_BYTE_OFFSET;
921 }
922
923 /* add trail byte into difference and decrement count */
924 if(count==1) {
925 return b;
926 } else if(count==2) {
927 return b*BOCU1_TRAIL_COUNT;
928 } else /* count==3 */ {
929 return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
930 }
931}
932
933static void
934_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
935 UErrorCode *pErrorCode) {
936 UConverter *cnv;
937 const uint8_t *source, *sourceLimit;
938 UChar *target;
939 const UChar *targetLimit;
940 int32_t *offsets;
941
942 int32_t prev, count, diff, c;
943
944 int8_t byteIndex;
945 uint8_t *bytes;
946
947 int32_t sourceIndex, nextSourceIndex;
948
949 /* set up the local pointers */
950 cnv=pArgs->converter;
951 source=(const uint8_t *)pArgs->source;
952 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
953 target=pArgs->target;
954 targetLimit=pArgs->targetLimit;
955 offsets=pArgs->offsets;
956
957 /* get the converter state from UConverter */
958 prev=(int32_t)cnv->toUnicodeStatus;
959 if(prev==0) {
960 prev=BOCU1_ASCII_PREV;
961 }
962 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
963 count=diff&3;
964 diff>>=2;
965
966 byteIndex=cnv->toULength;
967 bytes=cnv->toUBytes;
968
969 /* sourceIndex=-1 if the current character began in the previous buffer */
970 sourceIndex=byteIndex==0 ? 0 : -1;
971 nextSourceIndex=0;
972
973 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
b75a7d8f
A
974 if(count>0 && byteIndex>0 && target<targetLimit) {
975 goto getTrail;
976 }
977
978fastSingle:
979 /* fast loop for single-byte differences */
980 /* use count as the only loop counter variable */
981 diff=sourceLimit-source;
982 count=pArgs->targetLimit-target;
983 if(count>diff) {
984 count=diff;
985 }
374ca955
A
986 while(count>0) {
987 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
988 c=prev+(c-BOCU1_MIDDLE);
989 if(c<0x3000) {
b75a7d8f
A
990 *target++=(UChar)c;
991 *offsets++=nextSourceIndex++;
374ca955 992 prev=BOCU1_SIMPLE_PREV(c);
b75a7d8f
A
993 } else {
994 break;
995 }
374ca955
A
996 } else if(c<=0x20) {
997 if(c!=0x20) {
998 prev=BOCU1_ASCII_PREV;
999 }
1000 *target++=(UChar)c;
1001 *offsets++=nextSourceIndex++;
1002 } else {
1003 break;
b75a7d8f 1004 }
374ca955
A
1005 ++source;
1006 --count;
b75a7d8f 1007 }
374ca955 1008 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
b75a7d8f
A
1009
1010 /* decode a sequence of single and lead bytes */
1011 while(source<sourceLimit) {
1012 if(target>=targetLimit) {
1013 /* target is full */
1014 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1015 break;
1016 }
1017
1018 ++nextSourceIndex;
1019 c=*source++;
1020 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1021 /* Write a code point directly from a single-byte difference. */
1022 c=prev+(c-BOCU1_MIDDLE);
1023 if(c<0x3000) {
1024 *target++=(UChar)c;
374ca955 1025 *offsets++=sourceIndex;
b75a7d8f
A
1026 prev=BOCU1_SIMPLE_PREV(c);
1027 sourceIndex=nextSourceIndex;
1028 goto fastSingle;
1029 }
1030 } else if(c<=0x20) {
1031 /*
1032 * Direct-encoded C0 control code or space.
1033 * Reset prev for C0 control codes but not for space.
1034 */
1035 if(c!=0x20) {
1036 prev=BOCU1_ASCII_PREV;
1037 }
1038 *target++=(UChar)c;
374ca955 1039 *offsets++=sourceIndex;
b75a7d8f
A
1040 sourceIndex=nextSourceIndex;
1041 continue;
1042 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1043 /* Optimize two-byte case. */
1044 if(c>=BOCU1_MIDDLE) {
1045 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1046 } else {
1047 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1048 }
1049
1050 /* trail byte */
1051 ++nextSourceIndex;
1052 c=decodeBocu1TrailByte(1, *source++);
1053 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1054 bytes[0]=source[-2];
1055 bytes[1]=source[-1];
1056 byteIndex=2;
374ca955
A
1057 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1058 break;
b75a7d8f
A
1059 }
1060 } else if(c==BOCU1_RESET) {
1061 /* only reset the state, no code point */
1062 prev=BOCU1_ASCII_PREV;
1063 sourceIndex=nextSourceIndex;
1064 continue;
1065 } else {
1066 /*
1067 * For multi-byte difference lead bytes, set the decoder state
1068 * with the partial difference value from the lead byte and
1069 * with the number of trail bytes.
1070 */
1071 bytes[0]=(uint8_t)c;
1072 byteIndex=1;
1073
1074 diff=decodeBocu1LeadByte(c);
1075 count=diff&3;
1076 diff>>=2;
1077getTrail:
1078 for(;;) {
1079 if(source>=sourceLimit) {
1080 goto endloop;
1081 }
1082 ++nextSourceIndex;
1083 c=bytes[byteIndex++]=*source++;
1084
1085 /* trail byte in any position */
1086 c=decodeBocu1TrailByte(count, c);
1087 if(c<0) {
374ca955
A
1088 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1089 goto endloop;
b75a7d8f
A
1090 }
1091
1092 diff+=c;
1093 if(--count==0) {
1094 /* final trail byte, deliver a code point */
1095 byteIndex=0;
1096 c=prev+diff;
1097 if((uint32_t)c>0x10ffff) {
374ca955
A
1098 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1099 goto endloop;
b75a7d8f
A
1100 }
1101 break;
1102 }
1103 }
1104 }
1105
1106 /* calculate the next prev and output c */
1107 prev=BOCU1_PREV(c);
1108 if(c<=0xffff) {
1109 *target++=(UChar)c;
374ca955 1110 *offsets++=sourceIndex;
b75a7d8f
A
1111 } else {
1112 /* output surrogate pair */
1113 *target++=UTF16_LEAD(c);
1114 if(target<targetLimit) {
1115 *target++=UTF16_TRAIL(c);
374ca955
A
1116 *offsets++=sourceIndex;
1117 *offsets++=sourceIndex;
b75a7d8f
A
1118 } else {
1119 /* target overflow */
374ca955 1120 *offsets++=sourceIndex;
b75a7d8f
A
1121 cnv->UCharErrorBuffer[0]=UTF16_TRAIL(c);
1122 cnv->UCharErrorBufferLength=1;
1123 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1124 break;
1125 }
1126 }
1127 sourceIndex=nextSourceIndex;
1128 }
1129endloop:
1130
374ca955
A
1131 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1132 /* set the converter state in UConverter to deal with the next character */
b75a7d8f
A
1133 cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1134 cnv->mode=0;
b75a7d8f
A
1135 } else {
1136 /* set the converter state back into UConverter */
1137 cnv->toUnicodeStatus=(uint32_t)prev;
1138 cnv->mode=(diff<<2)|count;
b75a7d8f 1139 }
374ca955 1140 cnv->toULength=byteIndex;
b75a7d8f 1141
b75a7d8f
A
1142 /* write back the updated pointers */
1143 pArgs->source=(const char *)source;
1144 pArgs->target=target;
1145 pArgs->offsets=offsets;
1146 return;
b75a7d8f
A
1147}
1148
1149/*
1150 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1151 * If a change is made in the original function, then either
1152 * change this function the same way or
1153 * re-copy the original function and remove the variables
1154 * offsets, sourceIndex, and nextSourceIndex.
1155 */
1156static void
1157_Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
1158 UErrorCode *pErrorCode) {
1159 UConverter *cnv;
1160 const uint8_t *source, *sourceLimit;
1161 UChar *target;
1162 const UChar *targetLimit;
1163
1164 int32_t prev, count, diff, c;
1165
1166 int8_t byteIndex;
1167 uint8_t *bytes;
1168
1169U_ALIGN_CODE(16)
1170
1171 /* set up the local pointers */
1172 cnv=pArgs->converter;
1173 source=(const uint8_t *)pArgs->source;
1174 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1175 target=pArgs->target;
1176 targetLimit=pArgs->targetLimit;
1177
1178 /* get the converter state from UConverter */
1179 prev=(int32_t)cnv->toUnicodeStatus;
1180 if(prev==0) {
1181 prev=BOCU1_ASCII_PREV;
1182 }
1183 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1184 count=diff&3;
1185 diff>>=2;
1186
1187 byteIndex=cnv->toULength;
1188 bytes=cnv->toUBytes;
1189
1190 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
b75a7d8f
A
1191 if(count>0 && byteIndex>0 && target<targetLimit) {
1192 goto getTrail;
1193 }
1194
1195fastSingle:
1196 /* fast loop for single-byte differences */
1197 /* use count as the only loop counter variable */
1198 diff=sourceLimit-source;
1199 count=pArgs->targetLimit-target;
1200 if(count>diff) {
1201 count=diff;
1202 }
1203 while(count>0) {
1204 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1205 c=prev+(c-BOCU1_MIDDLE);
1206 if(c<0x3000) {
1207 *target++=(UChar)c;
1208 prev=BOCU1_SIMPLE_PREV(c);
1209 } else {
1210 break;
1211 }
1212 } else if(c<=0x20) {
1213 if(c!=0x20) {
1214 prev=BOCU1_ASCII_PREV;
1215 }
1216 *target++=(UChar)c;
1217 } else {
1218 break;
1219 }
1220 ++source;
1221 --count;
1222 }
1223
1224 /* decode a sequence of single and lead bytes */
1225 while(source<sourceLimit) {
1226 if(target>=targetLimit) {
1227 /* target is full */
1228 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1229 break;
1230 }
1231
1232 c=*source++;
1233 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1234 /* Write a code point directly from a single-byte difference. */
1235 c=prev+(c-BOCU1_MIDDLE);
1236 if(c<0x3000) {
1237 *target++=(UChar)c;
1238 prev=BOCU1_SIMPLE_PREV(c);
1239 goto fastSingle;
1240 }
1241 } else if(c<=0x20) {
1242 /*
1243 * Direct-encoded C0 control code or space.
1244 * Reset prev for C0 control codes but not for space.
1245 */
1246 if(c!=0x20) {
1247 prev=BOCU1_ASCII_PREV;
1248 }
1249 *target++=(UChar)c;
1250 continue;
1251 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1252 /* Optimize two-byte case. */
1253 if(c>=BOCU1_MIDDLE) {
1254 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1255 } else {
1256 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1257 }
1258
1259 /* trail byte */
1260 c=decodeBocu1TrailByte(1, *source++);
1261 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1262 bytes[0]=source[-2];
1263 bytes[1]=source[-1];
1264 byteIndex=2;
374ca955
A
1265 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1266 break;
b75a7d8f
A
1267 }
1268 } else if(c==BOCU1_RESET) {
1269 /* only reset the state, no code point */
1270 prev=BOCU1_ASCII_PREV;
1271 continue;
1272 } else {
1273 /*
1274 * For multi-byte difference lead bytes, set the decoder state
1275 * with the partial difference value from the lead byte and
1276 * with the number of trail bytes.
1277 */
1278 bytes[0]=(uint8_t)c;
1279 byteIndex=1;
1280
1281 diff=decodeBocu1LeadByte(c);
1282 count=diff&3;
1283 diff>>=2;
1284getTrail:
1285 for(;;) {
1286 if(source>=sourceLimit) {
1287 goto endloop;
1288 }
1289 c=bytes[byteIndex++]=*source++;
1290
1291 /* trail byte in any position */
1292 c=decodeBocu1TrailByte(count, c);
1293 if(c<0) {
374ca955
A
1294 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1295 goto endloop;
b75a7d8f
A
1296 }
1297
1298 diff+=c;
1299 if(--count==0) {
1300 /* final trail byte, deliver a code point */
1301 byteIndex=0;
1302 c=prev+diff;
1303 if((uint32_t)c>0x10ffff) {
374ca955
A
1304 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1305 goto endloop;
b75a7d8f
A
1306 }
1307 break;
1308 }
1309 }
1310 }
1311
1312 /* calculate the next prev and output c */
1313 prev=BOCU1_PREV(c);
1314 if(c<=0xffff) {
1315 *target++=(UChar)c;
1316 } else {
1317 /* output surrogate pair */
1318 *target++=UTF16_LEAD(c);
1319 if(target<targetLimit) {
1320 *target++=UTF16_TRAIL(c);
1321 } else {
1322 /* target overflow */
1323 cnv->UCharErrorBuffer[0]=UTF16_TRAIL(c);
1324 cnv->UCharErrorBufferLength=1;
1325 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1326 break;
1327 }
1328 }
1329 }
1330endloop:
1331
374ca955
A
1332 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1333 /* set the converter state in UConverter to deal with the next character */
b75a7d8f
A
1334 cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1335 cnv->mode=0;
b75a7d8f
A
1336 } else {
1337 /* set the converter state back into UConverter */
1338 cnv->toUnicodeStatus=(uint32_t)prev;
1339 cnv->mode=(diff<<2)|count;
b75a7d8f 1340 }
374ca955 1341 cnv->toULength=byteIndex;
b75a7d8f 1342
b75a7d8f
A
1343 /* write back the updated pointers */
1344 pArgs->source=(const char *)source;
1345 pArgs->target=target;
1346 return;
b75a7d8f
A
1347}
1348
1349/* miscellaneous ------------------------------------------------------------ */
1350
1351static const UConverterImpl _Bocu1Impl={
1352 UCNV_BOCU1,
1353
1354 NULL,
1355 NULL,
1356
1357 NULL,
1358 NULL,
1359 NULL,
1360
1361 _Bocu1ToUnicode,
1362 _Bocu1ToUnicodeWithOffsets,
1363 _Bocu1FromUnicode,
1364 _Bocu1FromUnicodeWithOffsets,
1365 NULL,
1366
1367 NULL,
1368 NULL,
1369 NULL,
1370 NULL,
1371 ucnv_getCompleteUnicodeSet
1372};
1373
1374static const UConverterStaticData _Bocu1StaticData={
1375 sizeof(UConverterStaticData),
1376 "BOCU-1",
1377 0, /* CCSID for BOCU-1 */
1378 UCNV_IBM, UCNV_BOCU1,
1379 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
1380 { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
1381 FALSE, FALSE,
1382 0,
1383 0,
1384 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1385};
1386
1387const UConverterSharedData _Bocu1Data={
1388 sizeof(UConverterSharedData), ~((uint32_t)0),
1389 NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl,
1390 0
1391};
374ca955
A
1392
1393#endif