]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucnvbocu.cpp
ICU-57131.0.1.tar.gz
[apple/icu.git] / icuSources / common / ucnvbocu.cpp
1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2002-2016, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: ucnvbocu.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002mar27
14 * created by: Markus W. Scherer
15 *
16 * This is an implementation of the Binary Ordered Compression for Unicode,
17 * in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
18 */
19
20 #include "unicode/utypes.h"
21
22 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
23
24 #include "unicode/ucnv.h"
25 #include "unicode/ucnv_cb.h"
26 #include "unicode/utf16.h"
27 #include "putilimp.h"
28 #include "ucnv_bld.h"
29 #include "ucnv_cnv.h"
30 #include "uassert.h"
31
32 /* BOCU-1 constants and macros ---------------------------------------------- */
33
34 /*
35 * BOCU-1 encodes the code points of a Unicode string as
36 * a sequence of byte-encoded differences (slope detection),
37 * preserving lexical order.
38 *
39 * Optimize the difference-taking for runs of Unicode text within
40 * small scripts:
41 *
42 * Most small scripts are allocated within aligned 128-blocks of Unicode
43 * code points. Lexical order is preserved if the "previous code point" state
44 * is always moved into the middle of such a block.
45 *
46 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
47 * areas into the middle of those areas.
48 *
49 * C0 control codes and space are encoded with their US-ASCII bytes.
50 * "prev" is reset for C0 controls but not for space.
51 */
52
/* "prev" starts out at (and is reset to) the middle of the ASCII range */
#define BOCU1_ASCII_PREV            0x40

/* byte values that bound the difference encoding */
#define BOCU1_MIN                   0x21
#define BOCU1_MIDDLE                0x90
#define BOCU1_MAX_LEAD              0xfe
#define BOCU1_MAX_TRAIL             0xff
#define BOCU1_RESET                 0xff

/* how many byte values can serve as lead bytes */
#define BOCU1_COUNT                 ((BOCU1_MAX_LEAD)-(BOCU1_MIN)+1)

/* some C0 control byte values double as trail bytes; account for them */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     ((BOCU1_MIN)-(BOCU1_TRAIL_CONTROLS_COUNT))

/* how many byte values can serve as trail bytes */
#define BOCU1_TRAIL_COUNT \
    (((BOCU1_MAX_TRAIL)-(BOCU1_MIN)+1)+(BOCU1_TRAIL_CONTROLS_COUNT))

/*
 * Count of single-byte codes on each side of the middle
 * (the zero difference, BOCU1_MIDDLE, is counted as positive).
 */
#define BOCU1_SINGLE                64

/* lead byte counts, per sign, for 2-, 3- and 4-byte sequences */
#define BOCU1_LEAD_2                43
#define BOCU1_LEAD_3                3
#define BOCU1_LEAD_4                1

/* largest/smallest difference that fits into one byte */
#define BOCU1_REACH_POS_1           ((BOCU1_SINGLE)-1)
#define BOCU1_REACH_NEG_1           (-(BOCU1_SINGLE))

/* largest/smallest difference that fits into two bytes */
#define BOCU1_REACH_POS_2 \
    ((BOCU1_REACH_POS_1)+(BOCU1_LEAD_2)*(BOCU1_TRAIL_COUNT))
#define BOCU1_REACH_NEG_2 \
    ((BOCU1_REACH_NEG_1)-(BOCU1_LEAD_2)*(BOCU1_TRAIL_COUNT))

/* largest/smallest difference that fits into three bytes */
#define BOCU1_REACH_POS_3 \
    ((BOCU1_REACH_POS_2)+(BOCU1_LEAD_3)*(BOCU1_TRAIL_COUNT)*(BOCU1_TRAIL_COUNT))
#define BOCU1_REACH_NEG_3 \
    ((BOCU1_REACH_NEG_2)-(BOCU1_LEAD_3)*(BOCU1_TRAIL_COUNT)*(BOCU1_TRAIL_COUNT))

/* first lead byte of each positive multi-byte range */
#define BOCU1_START_POS_2           ((BOCU1_MIDDLE)+(BOCU1_REACH_POS_1)+1)
#define BOCU1_START_POS_3           ((BOCU1_START_POS_2)+(BOCU1_LEAD_2))
#define BOCU1_START_POS_4           ((BOCU1_START_POS_3)+(BOCU1_LEAD_3))
     /* BOCU1_START_POS_4==BOCU1_MAX_LEAD */

/* lower boundary lead byte of each negative range */
#define BOCU1_START_NEG_2           ((BOCU1_MIDDLE)+(BOCU1_REACH_NEG_1))
#define BOCU1_START_NEG_3           ((BOCU1_START_NEG_2)-(BOCU1_LEAD_2))
#define BOCU1_START_NEG_4           ((BOCU1_START_NEG_3)-(BOCU1_LEAD_3))
     /* BOCU1_START_NEG_4==BOCU1_MIN+1 */

/* Sequence length (1..4) implied by a lead byte other than BOCU1_RESET. */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * Sequence length stored in a packed integer as returned by packDiff():
 * the top byte holds the length 1..3, or else it is the first byte
 * of a 4-byte sequence.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
118
119 /*
120 * 12 commonly used C0 control codes (and space) are only used to encode
121 * themselves directly,
122 * which makes BOCU-1 MIME-usable and reasonably safe for
123 * ASCII-oriented software.
124 *
125 * These controls are
126 * 0 NUL
127 *
128 * 7 BEL
129 * 8 BS
130 *
131 * 9 TAB
132 * a LF
133 * b VT
134 * c FF
135 * d CR
136 *
137 * e SO
138 * f SI
139 *
140 * 1a SUB
141 * 1b ESC
142 *
143 * The other 20 C0 controls are also encoded directly (to preserve order)
144 * but are also used as trail bytes in difference encoding
145 * (for better compression).
146 */
147 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
148
149 /*
150 * Byte value map for control codes,
151 * from external byte values 0x00..0x20
152 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
153 * External byte values that are illegal as trail bytes are mapped to -1.
154 */
155 static const int8_t
156 bocu1ByteToTrail[BOCU1_MIN]={
157 /* 0 1 2 3 4 5 6 7 */
158 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
159
160 /* 8 9 a b c d e f */
161 -1, -1, -1, -1, -1, -1, -1, -1,
162
163 /* 10 11 12 13 14 15 16 17 */
164 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
165
166 /* 18 19 1a 1b 1c 1d 1e 1f */
167 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
168
169 /* 20 */
170 -1
171 };
172
173 /*
174 * Byte value map for control codes,
175 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
176 * to external byte values 0x00..0x20.
177 */
178 static const int8_t
179 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
180 /* 0 1 2 3 4 5 6 7 */
181 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
182
183 /* 8 9 a b c d e f */
184 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
185
186 /* 10 11 12 13 */
187 0x1c, 0x1d, 0x1e, 0x1f
188 };
189
/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always FALSE.
 *
 * The body is wrapped in do { } while(0) so that the macro expands to exactly
 * one statement and can be followed by a semicolon anywhere a statement is
 * allowed, including as the body of an unbraced if/else.
 *
 * Note that each argument is evaluated more than once;
 * pass only plain lvalues/constants without side effects.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
211
/*
 * Range tests and a fast encoder for short differences,
 * so that callers can bypass packDiff() for the common cases.
 */

/** TRUE if diff fits into a single byte. */
#define DIFF_IS_SINGLE(diff) \
    (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)

/** The one byte value that encodes a single-byte diff. */
#define PACK_SINGLE_DIFF(diff) \
    (BOCU1_MIDDLE+(diff))

/** TRUE if diff fits into at most two bytes. */
#define DIFF_IS_DOUBLE(diff) \
    (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
222
223 /* BOCU-1 implementation functions ------------------------------------------ */
224
225 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
226
227 /**
228 * Compute the next "previous" value for differencing
229 * from the current code point.
230 *
231 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
232 * @return "previous code point" state value
233 */
234 static inline int32_t
235 bocu1Prev(int32_t c) {
236 /* compute new prev */
237 if(/* 0x3040<=c && */ c<=0x309f) {
238 /* Hiragana is not 128-aligned */
239 return 0x3070;
240 } else if(0x4e00<=c && c<=0x9fa5) {
241 /* CJK Unihan */
242 return 0x4e00-BOCU1_REACH_NEG_2;
243 } else if(0xac00<=c /* && c<=0xd7a3 */) {
244 /* Korean Hangul */
245 return (0xd7a3+0xac00)/2;
246 } else {
247 /* mostly small scripts */
248 return BOCU1_SIMPLE_PREV(c);
249 }
250 }
251
252 /** Fast version of bocu1Prev() for most scripts. */
253 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
254
255 /*
256 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
257 * The UConverter fields are used as follows:
258 *
259 * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
260 *
261 * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
262 * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0)
263 */
264
265 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
266
267 /**
268 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
269 * and return a packed integer with them.
270 *
271 * The encoding favors small absolute differences with short encodings
272 * to compress runs of same-script characters.
273 *
274 * Optimized version with unrolled loops and fewer floating-point operations
275 * than the standard packDiff().
276 *
277 * @param diff difference value -0x10ffff..0x10ffff
278 * @return
279 * 0x010000zz for 1-byte sequence zz
280 * 0x0200yyzz for 2-byte sequence yy zz
281 * 0x03xxyyzz for 3-byte sequence xx yy zz
282 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
283 */
284 static int32_t
285 packDiff(int32_t diff) {
286 int32_t result, m;
287
288 U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
289 if(diff>=BOCU1_REACH_NEG_1) {
290 /* mostly positive differences, and single-byte negative ones */
291 #if 0 /* single-byte case handled in macros, see below */
292 if(diff<=BOCU1_REACH_POS_1) {
293 /* single byte */
294 return 0x01000000|(BOCU1_MIDDLE+diff);
295 } else
296 #endif
297 if(diff<=BOCU1_REACH_POS_2) {
298 /* two bytes */
299 diff-=BOCU1_REACH_POS_1+1;
300 result=0x02000000;
301
302 m=diff%BOCU1_TRAIL_COUNT;
303 diff/=BOCU1_TRAIL_COUNT;
304 result|=BOCU1_TRAIL_TO_BYTE(m);
305
306 result|=(BOCU1_START_POS_2+diff)<<8;
307 } else if(diff<=BOCU1_REACH_POS_3) {
308 /* three bytes */
309 diff-=BOCU1_REACH_POS_2+1;
310 result=0x03000000;
311
312 m=diff%BOCU1_TRAIL_COUNT;
313 diff/=BOCU1_TRAIL_COUNT;
314 result|=BOCU1_TRAIL_TO_BYTE(m);
315
316 m=diff%BOCU1_TRAIL_COUNT;
317 diff/=BOCU1_TRAIL_COUNT;
318 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
319
320 result|=(BOCU1_START_POS_3+diff)<<16;
321 } else {
322 /* four bytes */
323 diff-=BOCU1_REACH_POS_3+1;
324
325 m=diff%BOCU1_TRAIL_COUNT;
326 diff/=BOCU1_TRAIL_COUNT;
327 result=BOCU1_TRAIL_TO_BYTE(m);
328
329 m=diff%BOCU1_TRAIL_COUNT;
330 diff/=BOCU1_TRAIL_COUNT;
331 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
332
333 /*
334 * We know that / and % would deliver quotient 0 and rest=diff.
335 * Avoid division and modulo for performance.
336 */
337 result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
338
339 result|=((uint32_t)BOCU1_START_POS_4)<<24;
340 }
341 } else {
342 /* two- to four-byte negative differences */
343 if(diff>=BOCU1_REACH_NEG_2) {
344 /* two bytes */
345 diff-=BOCU1_REACH_NEG_1;
346 result=0x02000000;
347
348 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
349 result|=BOCU1_TRAIL_TO_BYTE(m);
350
351 result|=(BOCU1_START_NEG_2+diff)<<8;
352 } else if(diff>=BOCU1_REACH_NEG_3) {
353 /* three bytes */
354 diff-=BOCU1_REACH_NEG_2;
355 result=0x03000000;
356
357 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
358 result|=BOCU1_TRAIL_TO_BYTE(m);
359
360 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
361 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
362
363 result|=(BOCU1_START_NEG_3+diff)<<16;
364 } else {
365 /* four bytes */
366 diff-=BOCU1_REACH_NEG_3;
367
368 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
369 result=BOCU1_TRAIL_TO_BYTE(m);
370
371 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
372 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
373
374 /*
375 * We know that NEGDIVMOD would deliver
376 * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
377 * Avoid division and modulo for performance.
378 */
379 m=diff+BOCU1_TRAIL_COUNT;
380 result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
381
382 result|=BOCU1_MIN<<24;
383 }
384 }
385 return result;
386 }
387
388
389 static void
390 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
391 UErrorCode *pErrorCode) {
392 UConverter *cnv;
393 const UChar *source, *sourceLimit;
394 uint8_t *target;
395 int32_t targetCapacity;
396 int32_t *offsets;
397
398 int32_t prev, c, diff;
399
400 int32_t sourceIndex, nextSourceIndex;
401
402 /* set up the local pointers */
403 cnv=pArgs->converter;
404 source=pArgs->source;
405 sourceLimit=pArgs->sourceLimit;
406 target=(uint8_t *)pArgs->target;
407 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
408 offsets=pArgs->offsets;
409
410 /* get the converter state from UConverter */
411 c=cnv->fromUChar32;
412 prev=(int32_t)cnv->fromUnicodeStatus;
413 if(prev==0) {
414 prev=BOCU1_ASCII_PREV;
415 }
416
417 /* sourceIndex=-1 if the current character began in the previous buffer */
418 sourceIndex= c==0 ? 0 : -1;
419 nextSourceIndex=0;
420
421 /* conversion loop */
422 if(c!=0 && targetCapacity>0) {
423 goto getTrail;
424 }
425
426 fastSingle:
427 /* fast loop for single-byte differences */
428 /* use only one loop counter variable, targetCapacity, not also source */
429 diff=(int32_t)(sourceLimit-source);
430 if(targetCapacity>diff) {
431 targetCapacity=diff;
432 }
433 while(targetCapacity>0 && (c=*source)<0x3000) {
434 if(c<=0x20) {
435 if(c!=0x20) {
436 prev=BOCU1_ASCII_PREV;
437 }
438 *target++=(uint8_t)c;
439 *offsets++=nextSourceIndex++;
440 ++source;
441 --targetCapacity;
442 } else {
443 diff=c-prev;
444 if(DIFF_IS_SINGLE(diff)) {
445 prev=BOCU1_SIMPLE_PREV(c);
446 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
447 *offsets++=nextSourceIndex++;
448 ++source;
449 --targetCapacity;
450 } else {
451 break;
452 }
453 }
454 }
455 /* restore real values */
456 targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
457 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
458
459 /* regular loop for all cases */
460 while(source<sourceLimit) {
461 if(targetCapacity>0) {
462 c=*source++;
463 ++nextSourceIndex;
464
465 if(c<=0x20) {
466 /*
467 * ISO C0 control & space:
468 * Encode directly for MIME compatibility,
469 * and reset state except for space, to not disrupt compression.
470 */
471 if(c!=0x20) {
472 prev=BOCU1_ASCII_PREV;
473 }
474 *target++=(uint8_t)c;
475 *offsets++=sourceIndex;
476 --targetCapacity;
477
478 sourceIndex=nextSourceIndex;
479 continue;
480 }
481
482 if(U16_IS_LEAD(c)) {
483 getTrail:
484 if(source<sourceLimit) {
485 /* test the following code unit */
486 UChar trail=*source;
487 if(U16_IS_TRAIL(trail)) {
488 ++source;
489 ++nextSourceIndex;
490 c=U16_GET_SUPPLEMENTARY(c, trail);
491 }
492 } else {
493 /* no more input */
494 c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
495 break;
496 }
497 }
498
499 /*
500 * all other Unicode code points c==U+0021..U+10ffff
501 * are encoded with the difference c-prev
502 *
503 * a new prev is computed from c,
504 * placed in the middle of a 0x80-block (for most small scripts) or
505 * in the middle of the Unihan and Hangul blocks
506 * to statistically minimize the following difference
507 */
508 diff=c-prev;
509 prev=BOCU1_PREV(c);
510 if(DIFF_IS_SINGLE(diff)) {
511 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
512 *offsets++=sourceIndex;
513 --targetCapacity;
514 sourceIndex=nextSourceIndex;
515 if(c<0x3000) {
516 goto fastSingle;
517 }
518 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
519 /* optimize 2-byte case */
520 int32_t m;
521
522 if(diff>=0) {
523 diff-=BOCU1_REACH_POS_1+1;
524 m=diff%BOCU1_TRAIL_COUNT;
525 diff/=BOCU1_TRAIL_COUNT;
526 diff+=BOCU1_START_POS_2;
527 } else {
528 diff-=BOCU1_REACH_NEG_1;
529 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
530 diff+=BOCU1_START_NEG_2;
531 }
532 *target++=(uint8_t)diff;
533 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
534 *offsets++=sourceIndex;
535 *offsets++=sourceIndex;
536 targetCapacity-=2;
537 sourceIndex=nextSourceIndex;
538 } else {
539 int32_t length; /* will be 2..4 */
540
541 diff=packDiff(diff);
542 length=BOCU1_LENGTH_FROM_PACKED(diff);
543
544 /* write the output character bytes from diff and length */
545 /* from the first if in the loop we know that targetCapacity>0 */
546 if(length<=targetCapacity) {
547 switch(length) {
548 /* each branch falls through to the next one */
549 case 4:
550 *target++=(uint8_t)(diff>>24);
551 *offsets++=sourceIndex;
552 U_FALLTHROUGH;
553 case 3:
554 *target++=(uint8_t)(diff>>16);
555 *offsets++=sourceIndex;
556 U_FALLTHROUGH;
557 case 2:
558 *target++=(uint8_t)(diff>>8);
559 *offsets++=sourceIndex;
560 /* case 1: handled above */
561 *target++=(uint8_t)diff;
562 *offsets++=sourceIndex;
563 U_FALLTHROUGH;
564 default:
565 /* will never occur */
566 break;
567 }
568 targetCapacity-=length;
569 sourceIndex=nextSourceIndex;
570 } else {
571 uint8_t *charErrorBuffer;
572
573 /*
574 * We actually do this backwards here:
575 * In order to save an intermediate variable, we output
576 * first to the overflow buffer what does not fit into the
577 * regular target.
578 */
579 /* we know that 1<=targetCapacity<length<=4 */
580 length-=targetCapacity;
581 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
582 switch(length) {
583 /* each branch falls through to the next one */
584 case 3:
585 *charErrorBuffer++=(uint8_t)(diff>>16);
586 U_FALLTHROUGH;
587 case 2:
588 *charErrorBuffer++=(uint8_t)(diff>>8);
589 U_FALLTHROUGH;
590 case 1:
591 *charErrorBuffer=(uint8_t)diff;
592 U_FALLTHROUGH;
593 default:
594 /* will never occur */
595 break;
596 }
597 cnv->charErrorBufferLength=(int8_t)length;
598
599 /* now output what fits into the regular target */
600 diff>>=8*length; /* length was reduced by targetCapacity */
601 switch(targetCapacity) {
602 /* each branch falls through to the next one */
603 case 3:
604 *target++=(uint8_t)(diff>>16);
605 *offsets++=sourceIndex;
606 U_FALLTHROUGH;
607 case 2:
608 *target++=(uint8_t)(diff>>8);
609 *offsets++=sourceIndex;
610 U_FALLTHROUGH;
611 case 1:
612 *target++=(uint8_t)diff;
613 *offsets++=sourceIndex;
614 U_FALLTHROUGH;
615 default:
616 /* will never occur */
617 break;
618 }
619
620 /* target overflow */
621 targetCapacity=0;
622 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
623 break;
624 }
625 }
626 } else {
627 /* target is full */
628 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
629 break;
630 }
631 }
632
633 /* set the converter state back into UConverter */
634 cnv->fromUChar32= c<0 ? -c : 0;
635 cnv->fromUnicodeStatus=(uint32_t)prev;
636
637 /* write back the updated pointers */
638 pArgs->source=source;
639 pArgs->target=(char *)target;
640 pArgs->offsets=offsets;
641 }
642
643 /*
644 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
645 * If a change is made in the original function, then either
646 * change this function the same way or
647 * re-copy the original function and remove the variables
648 * offsets, sourceIndex, and nextSourceIndex.
649 */
650 static void
651 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
652 UErrorCode *pErrorCode) {
653 UConverter *cnv;
654 const UChar *source, *sourceLimit;
655 uint8_t *target;
656 int32_t targetCapacity;
657
658 int32_t prev, c, diff;
659
660 /* set up the local pointers */
661 cnv=pArgs->converter;
662 source=pArgs->source;
663 sourceLimit=pArgs->sourceLimit;
664 target=(uint8_t *)pArgs->target;
665 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
666
667 /* get the converter state from UConverter */
668 c=cnv->fromUChar32;
669 prev=(int32_t)cnv->fromUnicodeStatus;
670 if(prev==0) {
671 prev=BOCU1_ASCII_PREV;
672 }
673
674 /* conversion loop */
675 if(c!=0 && targetCapacity>0) {
676 goto getTrail;
677 }
678
679 fastSingle:
680 /* fast loop for single-byte differences */
681 /* use only one loop counter variable, targetCapacity, not also source */
682 diff=(int32_t)(sourceLimit-source);
683 if(targetCapacity>diff) {
684 targetCapacity=diff;
685 }
686 while(targetCapacity>0 && (c=*source)<0x3000) {
687 if(c<=0x20) {
688 if(c!=0x20) {
689 prev=BOCU1_ASCII_PREV;
690 }
691 *target++=(uint8_t)c;
692 } else {
693 diff=c-prev;
694 if(DIFF_IS_SINGLE(diff)) {
695 prev=BOCU1_SIMPLE_PREV(c);
696 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
697 } else {
698 break;
699 }
700 }
701 ++source;
702 --targetCapacity;
703 }
704 /* restore real values */
705 targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
706
707 /* regular loop for all cases */
708 while(source<sourceLimit) {
709 if(targetCapacity>0) {
710 c=*source++;
711
712 if(c<=0x20) {
713 /*
714 * ISO C0 control & space:
715 * Encode directly for MIME compatibility,
716 * and reset state except for space, to not disrupt compression.
717 */
718 if(c!=0x20) {
719 prev=BOCU1_ASCII_PREV;
720 }
721 *target++=(uint8_t)c;
722 --targetCapacity;
723 continue;
724 }
725
726 if(U16_IS_LEAD(c)) {
727 getTrail:
728 if(source<sourceLimit) {
729 /* test the following code unit */
730 UChar trail=*source;
731 if(U16_IS_TRAIL(trail)) {
732 ++source;
733 c=U16_GET_SUPPLEMENTARY(c, trail);
734 }
735 } else {
736 /* no more input */
737 c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
738 break;
739 }
740 }
741
742 /*
743 * all other Unicode code points c==U+0021..U+10ffff
744 * are encoded with the difference c-prev
745 *
746 * a new prev is computed from c,
747 * placed in the middle of a 0x80-block (for most small scripts) or
748 * in the middle of the Unihan and Hangul blocks
749 * to statistically minimize the following difference
750 */
751 diff=c-prev;
752 prev=BOCU1_PREV(c);
753 if(DIFF_IS_SINGLE(diff)) {
754 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
755 --targetCapacity;
756 if(c<0x3000) {
757 goto fastSingle;
758 }
759 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
760 /* optimize 2-byte case */
761 int32_t m;
762
763 if(diff>=0) {
764 diff-=BOCU1_REACH_POS_1+1;
765 m=diff%BOCU1_TRAIL_COUNT;
766 diff/=BOCU1_TRAIL_COUNT;
767 diff+=BOCU1_START_POS_2;
768 } else {
769 diff-=BOCU1_REACH_NEG_1;
770 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
771 diff+=BOCU1_START_NEG_2;
772 }
773 *target++=(uint8_t)diff;
774 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
775 targetCapacity-=2;
776 } else {
777 int32_t length; /* will be 2..4 */
778
779 diff=packDiff(diff);
780 length=BOCU1_LENGTH_FROM_PACKED(diff);
781
782 /* write the output character bytes from diff and length */
783 /* from the first if in the loop we know that targetCapacity>0 */
784 if(length<=targetCapacity) {
785 switch(length) {
786 /* each branch falls through to the next one */
787 case 4:
788 *target++=(uint8_t)(diff>>24);
789 U_FALLTHROUGH;
790 case 3:
791 *target++=(uint8_t)(diff>>16);
792 /* case 2: handled above */
793 *target++=(uint8_t)(diff>>8);
794 /* case 1: handled above */
795 *target++=(uint8_t)diff;
796 U_FALLTHROUGH;
797 default:
798 /* will never occur */
799 break;
800 }
801 targetCapacity-=length;
802 } else {
803 uint8_t *charErrorBuffer;
804
805 /*
806 * We actually do this backwards here:
807 * In order to save an intermediate variable, we output
808 * first to the overflow buffer what does not fit into the
809 * regular target.
810 */
811 /* we know that 1<=targetCapacity<length<=4 */
812 length-=targetCapacity;
813 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
814 switch(length) {
815 /* each branch falls through to the next one */
816 case 3:
817 *charErrorBuffer++=(uint8_t)(diff>>16);
818 U_FALLTHROUGH;
819 case 2:
820 *charErrorBuffer++=(uint8_t)(diff>>8);
821 U_FALLTHROUGH;
822 case 1:
823 *charErrorBuffer=(uint8_t)diff;
824 U_FALLTHROUGH;
825 default:
826 /* will never occur */
827 break;
828 }
829 cnv->charErrorBufferLength=(int8_t)length;
830
831 /* now output what fits into the regular target */
832 diff>>=8*length; /* length was reduced by targetCapacity */
833 switch(targetCapacity) {
834 /* each branch falls through to the next one */
835 case 3:
836 *target++=(uint8_t)(diff>>16);
837 U_FALLTHROUGH;
838 case 2:
839 *target++=(uint8_t)(diff>>8);
840 U_FALLTHROUGH;
841 case 1:
842 *target++=(uint8_t)diff;
843 U_FALLTHROUGH;
844 default:
845 /* will never occur */
846 break;
847 }
848
849 /* target overflow */
850 targetCapacity=0;
851 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
852 break;
853 }
854 }
855 } else {
856 /* target is full */
857 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
858 break;
859 }
860 }
861
862 /* set the converter state back into UConverter */
863 cnv->fromUChar32= c<0 ? -c : 0;
864 cnv->fromUnicodeStatus=(uint32_t)prev;
865
866 /* write back the updated pointers */
867 pArgs->source=source;
868 pArgs->target=(char *)target;
869 }
870
871 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
872
873 /**
874 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
875 *
876 * @param b lead byte;
877 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
878 * @return (diff<<2)|count
879 */
880 static inline int32_t
881 decodeBocu1LeadByte(int32_t b) {
882 int32_t diff, count;
883
884 if(b>=BOCU1_START_NEG_2) {
885 /* positive difference */
886 if(b<BOCU1_START_POS_3) {
887 /* two bytes */
888 diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
889 count=1;
890 } else if(b<BOCU1_START_POS_4) {
891 /* three bytes */
892 diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
893 count=2;
894 } else {
895 /* four bytes */
896 diff=BOCU1_REACH_POS_3+1;
897 count=3;
898 }
899 } else {
900 /* negative difference */
901 if(b>=BOCU1_START_NEG_3) {
902 /* two bytes */
903 diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
904 count=1;
905 } else if(b>BOCU1_MIN) {
906 /* three bytes */
907 diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
908 count=2;
909 } else {
910 /* four bytes */
911 diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
912 count=3;
913 }
914 }
915
916 /* return the state for decoding the trail byte(s) */
917 return (diff<<2)|count;
918 }
919
920 /**
921 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
922 *
923 * @param count number of remaining trail bytes including this one
924 * @param b trail byte
925 * @return new delta for diff including b - <0 indicates an error
926 *
927 * @see decodeBocu1
928 */
929 static inline int32_t
930 decodeBocu1TrailByte(int32_t count, int32_t b) {
931 if(b<=0x20) {
932 /* skip some C0 controls and make the trail byte range contiguous */
933 b=bocu1ByteToTrail[b];
934 /* b<0 for an illegal trail byte value will result in return<0 below */
935 #if BOCU1_MAX_TRAIL<0xff
936 } else if(b>BOCU1_MAX_TRAIL) {
937 return -99;
938 #endif
939 } else {
940 b-=BOCU1_TRAIL_BYTE_OFFSET;
941 }
942
943 /* add trail byte into difference and decrement count */
944 if(count==1) {
945 return b;
946 } else if(count==2) {
947 return b*BOCU1_TRAIL_COUNT;
948 } else /* count==3 */ {
949 return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
950 }
951 }
952
953 static void
954 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
955 UErrorCode *pErrorCode) {
956 UConverter *cnv;
957 const uint8_t *source, *sourceLimit;
958 UChar *target;
959 const UChar *targetLimit;
960 int32_t *offsets;
961
962 int32_t prev, count, diff, c;
963
964 int8_t byteIndex;
965 uint8_t *bytes;
966
967 int32_t sourceIndex, nextSourceIndex;
968
969 /* set up the local pointers */
970 cnv=pArgs->converter;
971 source=(const uint8_t *)pArgs->source;
972 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
973 target=pArgs->target;
974 targetLimit=pArgs->targetLimit;
975 offsets=pArgs->offsets;
976
977 /* get the converter state from UConverter */
978 prev=(int32_t)cnv->toUnicodeStatus;
979 if(prev==0) {
980 prev=BOCU1_ASCII_PREV;
981 }
982 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
983 count=diff&3;
984 diff>>=2;
985
986 byteIndex=cnv->toULength;
987 bytes=cnv->toUBytes;
988
989 /* sourceIndex=-1 if the current character began in the previous buffer */
990 sourceIndex=byteIndex==0 ? 0 : -1;
991 nextSourceIndex=0;
992
993 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
994 if(count>0 && byteIndex>0 && target<targetLimit) {
995 goto getTrail;
996 }
997
998 fastSingle:
999 /* fast loop for single-byte differences */
1000 /* use count as the only loop counter variable */
1001 diff=(int32_t)(sourceLimit-source);
1002 count=(int32_t)(pArgs->targetLimit-target);
1003 if(count>diff) {
1004 count=diff;
1005 }
1006 while(count>0) {
1007 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1008 c=prev+(c-BOCU1_MIDDLE);
1009 if(c<0x3000) {
1010 *target++=(UChar)c;
1011 *offsets++=nextSourceIndex++;
1012 prev=BOCU1_SIMPLE_PREV(c);
1013 } else {
1014 break;
1015 }
1016 } else if(c<=0x20) {
1017 if(c!=0x20) {
1018 prev=BOCU1_ASCII_PREV;
1019 }
1020 *target++=(UChar)c;
1021 *offsets++=nextSourceIndex++;
1022 } else {
1023 break;
1024 }
1025 ++source;
1026 --count;
1027 }
1028 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
1029
1030 /* decode a sequence of single and lead bytes */
1031 while(source<sourceLimit) {
1032 if(target>=targetLimit) {
1033 /* target is full */
1034 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1035 break;
1036 }
1037
1038 ++nextSourceIndex;
1039 c=*source++;
1040 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1041 /* Write a code point directly from a single-byte difference. */
1042 c=prev+(c-BOCU1_MIDDLE);
1043 if(c<0x3000) {
1044 *target++=(UChar)c;
1045 *offsets++=sourceIndex;
1046 prev=BOCU1_SIMPLE_PREV(c);
1047 sourceIndex=nextSourceIndex;
1048 goto fastSingle;
1049 }
1050 } else if(c<=0x20) {
1051 /*
1052 * Direct-encoded C0 control code or space.
1053 * Reset prev for C0 control codes but not for space.
1054 */
1055 if(c!=0x20) {
1056 prev=BOCU1_ASCII_PREV;
1057 }
1058 *target++=(UChar)c;
1059 *offsets++=sourceIndex;
1060 sourceIndex=nextSourceIndex;
1061 continue;
1062 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1063 /* Optimize two-byte case. */
1064 if(c>=BOCU1_MIDDLE) {
1065 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1066 } else {
1067 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1068 }
1069
1070 /* trail byte */
1071 ++nextSourceIndex;
1072 c=decodeBocu1TrailByte(1, *source++);
1073 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1074 bytes[0]=source[-2];
1075 bytes[1]=source[-1];
1076 byteIndex=2;
1077 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1078 break;
1079 }
1080 } else if(c==BOCU1_RESET) {
1081 /* only reset the state, no code point */
1082 prev=BOCU1_ASCII_PREV;
1083 sourceIndex=nextSourceIndex;
1084 continue;
1085 } else {
1086 /*
1087 * For multi-byte difference lead bytes, set the decoder state
1088 * with the partial difference value from the lead byte and
1089 * with the number of trail bytes.
1090 */
1091 bytes[0]=(uint8_t)c;
1092 byteIndex=1;
1093
1094 diff=decodeBocu1LeadByte(c);
1095 count=diff&3;
1096 diff>>=2;
1097 getTrail:
1098 for(;;) {
1099 if(source>=sourceLimit) {
1100 goto endloop;
1101 }
1102 ++nextSourceIndex;
1103 c=bytes[byteIndex++]=*source++;
1104
1105 /* trail byte in any position */
1106 c=decodeBocu1TrailByte(count, c);
1107 if(c<0) {
1108 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1109 goto endloop;
1110 }
1111
1112 diff+=c;
1113 if(--count==0) {
1114 /* final trail byte, deliver a code point */
1115 byteIndex=0;
1116 c=prev+diff;
1117 if((uint32_t)c>0x10ffff) {
1118 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1119 goto endloop;
1120 }
1121 break;
1122 }
1123 }
1124 }
1125
1126 /* calculate the next prev and output c */
1127 prev=BOCU1_PREV(c);
1128 if(c<=0xffff) {
1129 *target++=(UChar)c;
1130 *offsets++=sourceIndex;
1131 } else {
1132 /* output surrogate pair */
1133 *target++=U16_LEAD(c);
1134 if(target<targetLimit) {
1135 *target++=U16_TRAIL(c);
1136 *offsets++=sourceIndex;
1137 *offsets++=sourceIndex;
1138 } else {
1139 /* target overflow */
1140 *offsets++=sourceIndex;
1141 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1142 cnv->UCharErrorBufferLength=1;
1143 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1144 break;
1145 }
1146 }
1147 sourceIndex=nextSourceIndex;
1148 }
1149 endloop:
1150
1151 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1152 /* set the converter state in UConverter to deal with the next character */
1153 cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1154 cnv->mode=0;
1155 } else {
1156 /* set the converter state back into UConverter */
1157 cnv->toUnicodeStatus=(uint32_t)prev;
1158 cnv->mode=(diff<<2)|count;
1159 }
1160 cnv->toULength=byteIndex;
1161
1162 /* write back the updated pointers */
1163 pArgs->source=(const char *)source;
1164 pArgs->target=target;
1165 pArgs->offsets=offsets;
1166 return;
1167 }
1168
1169 /*
1170 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1171 * If a change is made in the original function, then either
1172 * change this function the same way or
1173 * re-copy the original function and remove the variables
1174 * offsets, sourceIndex, and nextSourceIndex.
1175 */
1176 static void
1177 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
1178 UErrorCode *pErrorCode) {
1179 UConverter *cnv;
1180 const uint8_t *source, *sourceLimit;
1181 UChar *target;
1182 const UChar *targetLimit;
1183
1184 int32_t prev, count, diff, c;
1185
1186 int8_t byteIndex;
1187 uint8_t *bytes;
1188
1189 /* set up the local pointers */
1190 cnv=pArgs->converter;
1191 source=(const uint8_t *)pArgs->source;
1192 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1193 target=pArgs->target;
1194 targetLimit=pArgs->targetLimit;
1195
1196 /* get the converter state from UConverter */
1197 prev=(int32_t)cnv->toUnicodeStatus;
1198 if(prev==0) {
1199 prev=BOCU1_ASCII_PREV;
1200 }
1201 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1202 count=diff&3;
1203 diff>>=2;
1204
1205 byteIndex=cnv->toULength;
1206 bytes=cnv->toUBytes;
1207
1208 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
1209 if(count>0 && byteIndex>0 && target<targetLimit) {
1210 goto getTrail;
1211 }
1212
1213 fastSingle:
1214 /* fast loop for single-byte differences */
1215 /* use count as the only loop counter variable */
1216 diff=(int32_t)(sourceLimit-source);
1217 count=(int32_t)(pArgs->targetLimit-target);
1218 if(count>diff) {
1219 count=diff;
1220 }
1221 while(count>0) {
1222 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1223 c=prev+(c-BOCU1_MIDDLE);
1224 if(c<0x3000) {
1225 *target++=(UChar)c;
1226 prev=BOCU1_SIMPLE_PREV(c);
1227 } else {
1228 break;
1229 }
1230 } else if(c<=0x20) {
1231 if(c!=0x20) {
1232 prev=BOCU1_ASCII_PREV;
1233 }
1234 *target++=(UChar)c;
1235 } else {
1236 break;
1237 }
1238 ++source;
1239 --count;
1240 }
1241
1242 /* decode a sequence of single and lead bytes */
1243 while(source<sourceLimit) {
1244 if(target>=targetLimit) {
1245 /* target is full */
1246 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1247 break;
1248 }
1249
1250 c=*source++;
1251 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1252 /* Write a code point directly from a single-byte difference. */
1253 c=prev+(c-BOCU1_MIDDLE);
1254 if(c<0x3000) {
1255 *target++=(UChar)c;
1256 prev=BOCU1_SIMPLE_PREV(c);
1257 goto fastSingle;
1258 }
1259 } else if(c<=0x20) {
1260 /*
1261 * Direct-encoded C0 control code or space.
1262 * Reset prev for C0 control codes but not for space.
1263 */
1264 if(c!=0x20) {
1265 prev=BOCU1_ASCII_PREV;
1266 }
1267 *target++=(UChar)c;
1268 continue;
1269 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1270 /* Optimize two-byte case. */
1271 if(c>=BOCU1_MIDDLE) {
1272 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1273 } else {
1274 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1275 }
1276
1277 /* trail byte */
1278 c=decodeBocu1TrailByte(1, *source++);
1279 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1280 bytes[0]=source[-2];
1281 bytes[1]=source[-1];
1282 byteIndex=2;
1283 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1284 break;
1285 }
1286 } else if(c==BOCU1_RESET) {
1287 /* only reset the state, no code point */
1288 prev=BOCU1_ASCII_PREV;
1289 continue;
1290 } else {
1291 /*
1292 * For multi-byte difference lead bytes, set the decoder state
1293 * with the partial difference value from the lead byte and
1294 * with the number of trail bytes.
1295 */
1296 bytes[0]=(uint8_t)c;
1297 byteIndex=1;
1298
1299 diff=decodeBocu1LeadByte(c);
1300 count=diff&3;
1301 diff>>=2;
1302 getTrail:
1303 for(;;) {
1304 if(source>=sourceLimit) {
1305 goto endloop;
1306 }
1307 c=bytes[byteIndex++]=*source++;
1308
1309 /* trail byte in any position */
1310 c=decodeBocu1TrailByte(count, c);
1311 if(c<0) {
1312 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1313 goto endloop;
1314 }
1315
1316 diff+=c;
1317 if(--count==0) {
1318 /* final trail byte, deliver a code point */
1319 byteIndex=0;
1320 c=prev+diff;
1321 if((uint32_t)c>0x10ffff) {
1322 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1323 goto endloop;
1324 }
1325 break;
1326 }
1327 }
1328 }
1329
1330 /* calculate the next prev and output c */
1331 prev=BOCU1_PREV(c);
1332 if(c<=0xffff) {
1333 *target++=(UChar)c;
1334 } else {
1335 /* output surrogate pair */
1336 *target++=U16_LEAD(c);
1337 if(target<targetLimit) {
1338 *target++=U16_TRAIL(c);
1339 } else {
1340 /* target overflow */
1341 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1342 cnv->UCharErrorBufferLength=1;
1343 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1344 break;
1345 }
1346 }
1347 }
1348 endloop:
1349
1350 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1351 /* set the converter state in UConverter to deal with the next character */
1352 cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1353 cnv->mode=0;
1354 } else {
1355 /* set the converter state back into UConverter */
1356 cnv->toUnicodeStatus=(uint32_t)prev;
1357 cnv->mode=(diff<<2)|count;
1358 }
1359 cnv->toULength=byteIndex;
1360
1361 /* write back the updated pointers */
1362 pArgs->source=(const char *)source;
1363 pArgs->target=target;
1364 return;
1365 }
1366
1367 /* miscellaneous ------------------------------------------------------------ */
1368
1369 static const UConverterImpl _Bocu1Impl={
1370 UCNV_BOCU1,
1371
1372 NULL,
1373 NULL,
1374
1375 NULL,
1376 NULL,
1377 NULL,
1378
1379 _Bocu1ToUnicode,
1380 _Bocu1ToUnicodeWithOffsets,
1381 _Bocu1FromUnicode,
1382 _Bocu1FromUnicodeWithOffsets,
1383 NULL,
1384
1385 NULL,
1386 NULL,
1387 NULL,
1388 NULL,
1389 ucnv_getCompleteUnicodeSet,
1390
1391 NULL,
1392 NULL
1393 };
1394
1395 static const UConverterStaticData _Bocu1StaticData={
1396 sizeof(UConverterStaticData),
1397 "BOCU-1",
1398 1214, /* CCSID for BOCU-1 */
1399 UCNV_IBM, UCNV_BOCU1,
1400 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
1401 { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
1402 FALSE, FALSE,
1403 0,
1404 0,
1405 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1406 };
1407
1408 const UConverterSharedData _Bocu1Data=
1409 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl);
1410
1411 #endif