]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ****************************************************************************** | |
3 | * | |
374ca955 | 4 | * Copyright (C) 2002-2004, International Business Machines |
b75a7d8f A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ****************************************************************************** | |
8 | * file name: ucnvbocu.c | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2002mar27 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * This is an implementation of the Binary Ordered Compression for Unicode, | |
374ca955 | 17 | * in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/ |
b75a7d8f A |
18 | */ |
19 | ||
20 | #include "unicode/utypes.h" | |
374ca955 A |
21 | |
22 | #if !UCONFIG_NO_CONVERSION | |
23 | ||
b75a7d8f A |
24 | #include "unicode/ucnv.h" |
25 | #include "unicode/ucnv_cb.h" | |
26 | #include "ucnv_bld.h" | |
27 | #include "ucnv_cnv.h" | |
28 | ||
29 | /* BOCU-1 constants and macros ---------------------------------------------- */ | |
30 | ||
31 | /* | |
32 | * BOCU-1 encodes the code points of a Unicode string as | |
33 | * a sequence of byte-encoded differences (slope detection), | |
34 | * preserving lexical order. | |
35 | * | |
36 | * Optimize the difference-taking for runs of Unicode text within | |
37 | * small scripts: | |
38 | * | |
39 | * Most small scripts are allocated within aligned 128-blocks of Unicode | |
40 | * code points. Lexical order is preserved if the "previous code point" state | |
41 | * is always moved into the middle of such a block. | |
42 | * | |
43 | * Additionally, "prev" is moved from anywhere in the Unihan and Hangul | |
44 | * areas into the middle of those areas. | |
45 | * | |
46 | * C0 control codes and space are encoded with their US-ASCII bytes. | |
47 | * "prev" is reset for C0 controls but not for space. | |
48 | */ | |
49 | ||
/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/* bounding byte values for differences */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe
#define BOCU1_MAX_TRAIL         0xff
#define BOCU1_RESET             0xff

/* number of lead bytes (0xfe-0x21+1 == 222) */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* adjust trail byte counts for the use of some C0 control byte values */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes (223 bytes 0x21..0xff plus 20 controls == 243) */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE            64

/* number of lead bytes for positive and negative 2/3/4-byte sequences */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* The difference value range for single-byters: -64..63. */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* The difference value range for double-byters: -10513..10512. */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters: -187660..187659. */
#define BOCU1_REACH_POS_3   \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* The lead byte start values. */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */

/*
 * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
 * The argument is evaluated several times; pass an expression without
 * side effects.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * The length of a byte sequence, according to its packed form:
 * packed values below 0x04000000 carry the length (1..3) in the top byte,
 * everything else is a 4-byte sequence whose top byte is the lead byte.
 * The argument is evaluated twice.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
115 | ||
/*
 * 12 commonly used C0 control codes (and space) are only used to encode
 * themselves directly,
 * which makes BOCU-1 MIME-usable and reasonably safe for
 * ASCII-oriented software.
 *
 * These controls are
 *  0   NUL
 *
 *  7   BEL
 *  8   BS
 *
 *  9   TAB
 *  a   LF
 *  b   VT
 *  c   FF
 *  d   CR
 *
 *  e   SO
 *  f   SI
 *
 * 1a   SUB
 * 1b   ESC
 *
 * The other 20 C0 controls are also encoded directly (to preserve order)
 * but are also used as trail bytes in difference encoding
 * (for better compression).
 *
 * BOCU1_TRAIL_TO_BYTE(t) converts a trail value t to its external byte:
 * values below BOCU1_TRAIL_CONTROLS_COUNT are looked up in bocu1TrailToByte[],
 * all others are shifted by BOCU1_TRAIL_BYTE_OFFSET.
 * t is evaluated more than once and is used as an array index, so it must be
 * a non-negative expression without side effects.
 */
#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
145 | ||
146 | /* | |
147 | * Byte value map for control codes, | |
148 | * from external byte values 0x00..0x20 | |
149 | * to trail byte values 0..19 (0..0x13) as used in the difference calculation. | |
150 | * External byte values that are illegal as trail bytes are mapped to -1. | |
151 | */ | |
152 | static const int8_t | |
153 | bocu1ByteToTrail[BOCU1_MIN]={ | |
154 | /* 0 1 2 3 4 5 6 7 */ | |
155 | -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, | |
156 | ||
157 | /* 8 9 a b c d e f */ | |
158 | -1, -1, -1, -1, -1, -1, -1, -1, | |
159 | ||
160 | /* 10 11 12 13 14 15 16 17 */ | |
161 | 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, | |
162 | ||
163 | /* 18 19 1a 1b 1c 1d 1e 1f */ | |
164 | 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13, | |
165 | ||
166 | /* 20 */ | |
167 | -1 | |
168 | }; | |
169 | ||
170 | /* | |
171 | * Byte value map for control codes, | |
172 | * from trail byte values 0..19 (0..0x13) as used in the difference calculation | |
173 | * to external byte values 0x00..0x20. | |
174 | */ | |
175 | static const int8_t | |
176 | bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={ | |
177 | /* 0 1 2 3 4 5 6 7 */ | |
178 | 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, | |
179 | ||
180 | /* 8 9 a b c d e f */ | |
181 | 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, | |
182 | ||
183 | /* 10 11 12 13 */ | |
184 | 0x1c, 0x1d, 0x1e, 0x1f | |
185 | }; | |
186 | ||
/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always FALSE.
 *
 * The body is wrapped in do { } while(0) so that the macro, followed by a
 * semicolon, behaves as a single statement (for example in an unbraced
 * if/else). n and m are evaluated more than once and must be lvalues.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
208 | ||
209 | /* BOCU-1 implementation functions ------------------------------------------ */ | |
210 | ||
211 | #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV) | |
212 | ||
213 | /** | |
214 | * Compute the next "previous" value for differencing | |
215 | * from the current code point. | |
216 | * | |
217 | * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below) | |
218 | * @return "previous code point" state value | |
219 | */ | |
220 | static U_INLINE int32_t | |
221 | bocu1Prev(int32_t c) { | |
222 | /* compute new prev */ | |
223 | if(/* 0x3040<=c && */ c<=0x309f) { | |
224 | /* Hiragana is not 128-aligned */ | |
225 | return 0x3070; | |
226 | } else if(0x4e00<=c && c<=0x9fa5) { | |
227 | /* CJK Unihan */ | |
228 | return 0x4e00-BOCU1_REACH_NEG_2; | |
229 | } else if(0xac00<=c /* && c<=0xd7a3 */) { | |
230 | /* Korean Hangul */ | |
231 | return (0xd7a3+0xac00)/2; | |
232 | } else { | |
233 | /* mostly small scripts */ | |
234 | return BOCU1_SIMPLE_PREV(c); | |
235 | } | |
236 | } | |
237 | ||
238 | /** Fast version of bocu1Prev() for most scripts. */ | |
239 | #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c)) | |
240 | ||
241 | /* | |
242 | * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c. | |
243 | * The UConverter fields are used as follows: | |
244 | * | |
245 | * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) | |
246 | * | |
247 | * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) | |
248 | * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0) | |
249 | */ | |
250 | ||
251 | /* BOCU-1-from-Unicode conversion functions --------------------------------- */ | |
252 | ||
253 | /** | |
254 | * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes | |
255 | * and return a packed integer with them. | |
256 | * | |
257 | * The encoding favors small absolut differences with short encodings | |
258 | * to compress runs of same-script characters. | |
259 | * | |
260 | * Optimized version with unrolled loops and fewer floating-point operations | |
261 | * than the standard packDiff(). | |
262 | * | |
263 | * @param diff difference value -0x10ffff..0x10ffff | |
264 | * @return | |
265 | * 0x010000zz for 1-byte sequence zz | |
266 | * 0x0200yyzz for 2-byte sequence yy zz | |
267 | * 0x03xxyyzz for 3-byte sequence xx yy zz | |
268 | * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03) | |
269 | */ | |
270 | static int32_t | |
271 | packDiff(int32_t diff) { | |
272 | int32_t result, m; | |
273 | ||
274 | if(diff>=BOCU1_REACH_NEG_1) { | |
275 | /* mostly positive differences, and single-byte negative ones */ | |
276 | #if 0 /* single-byte case handled in macros, see below */ | |
277 | if(diff<=BOCU1_REACH_POS_1) { | |
278 | /* single byte */ | |
279 | return 0x01000000|(BOCU1_MIDDLE+diff); | |
280 | } else | |
281 | #endif | |
282 | if(diff<=BOCU1_REACH_POS_2) { | |
283 | /* two bytes */ | |
284 | diff-=BOCU1_REACH_POS_1+1; | |
285 | result=0x02000000; | |
286 | ||
287 | m=diff%BOCU1_TRAIL_COUNT; | |
288 | diff/=BOCU1_TRAIL_COUNT; | |
289 | result|=BOCU1_TRAIL_TO_BYTE(m); | |
290 | ||
291 | result|=(BOCU1_START_POS_2+diff)<<8; | |
292 | } else if(diff<=BOCU1_REACH_POS_3) { | |
293 | /* three bytes */ | |
294 | diff-=BOCU1_REACH_POS_2+1; | |
295 | result=0x03000000; | |
296 | ||
297 | m=diff%BOCU1_TRAIL_COUNT; | |
298 | diff/=BOCU1_TRAIL_COUNT; | |
299 | result|=BOCU1_TRAIL_TO_BYTE(m); | |
300 | ||
301 | m=diff%BOCU1_TRAIL_COUNT; | |
302 | diff/=BOCU1_TRAIL_COUNT; | |
303 | result|=BOCU1_TRAIL_TO_BYTE(m)<<8; | |
304 | ||
305 | result|=(BOCU1_START_POS_3+diff)<<16; | |
306 | } else { | |
307 | /* four bytes */ | |
308 | diff-=BOCU1_REACH_POS_3+1; | |
309 | ||
310 | m=diff%BOCU1_TRAIL_COUNT; | |
311 | diff/=BOCU1_TRAIL_COUNT; | |
312 | result=BOCU1_TRAIL_TO_BYTE(m); | |
313 | ||
314 | m=diff%BOCU1_TRAIL_COUNT; | |
315 | diff/=BOCU1_TRAIL_COUNT; | |
316 | result|=BOCU1_TRAIL_TO_BYTE(m)<<8; | |
317 | ||
318 | /* | |
319 | * We know that / and % would deliver quotient 0 and rest=diff. | |
320 | * Avoid division and modulo for performance. | |
321 | */ | |
322 | result|=BOCU1_TRAIL_TO_BYTE(diff)<<16; | |
323 | ||
324 | result|=((uint32_t)BOCU1_START_POS_4)<<24; | |
325 | } | |
326 | } else { | |
327 | /* two- to four-byte negative differences */ | |
328 | if(diff>=BOCU1_REACH_NEG_2) { | |
329 | /* two bytes */ | |
330 | diff-=BOCU1_REACH_NEG_1; | |
331 | result=0x02000000; | |
332 | ||
333 | NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); | |
334 | result|=BOCU1_TRAIL_TO_BYTE(m); | |
335 | ||
336 | result|=(BOCU1_START_NEG_2+diff)<<8; | |
337 | } else if(diff>=BOCU1_REACH_NEG_3) { | |
338 | /* three bytes */ | |
339 | diff-=BOCU1_REACH_NEG_2; | |
340 | result=0x03000000; | |
341 | ||
342 | NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); | |
343 | result|=BOCU1_TRAIL_TO_BYTE(m); | |
344 | ||
345 | NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); | |
346 | result|=BOCU1_TRAIL_TO_BYTE(m)<<8; | |
347 | ||
348 | result|=(BOCU1_START_NEG_3+diff)<<16; | |
349 | } else { | |
350 | /* four bytes */ | |
351 | diff-=BOCU1_REACH_NEG_3; | |
352 | ||
353 | NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); | |
354 | result=BOCU1_TRAIL_TO_BYTE(m); | |
355 | ||
356 | NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); | |
357 | result|=BOCU1_TRAIL_TO_BYTE(m)<<8; | |
358 | ||
359 | /* | |
360 | * We know that NEGDIVMOD would deliver | |
361 | * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT. | |
362 | * Avoid division and modulo for performance. | |
363 | */ | |
364 | m=diff+BOCU1_TRAIL_COUNT; | |
365 | result|=BOCU1_TRAIL_TO_BYTE(m)<<16; | |
366 | ||
367 | result|=BOCU1_MIN<<24; | |
368 | } | |
369 | } | |
370 | return result; | |
371 | } | |
372 | ||
/* Faster versions of packDiff() for single-byte-encoded diff values. */

/** Is a diff value encodable in a single byte? (diff is evaluated twice) */
#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)

/** Encode a diff value in a single byte: the byte is BOCU1_MIDDLE plus diff. */
#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))

/** Is a diff value encodable in two bytes? (diff is evaluated twice) */
#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
383 | ||
384 | static void | |
385 | _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, | |
386 | UErrorCode *pErrorCode) { | |
387 | UConverter *cnv; | |
388 | const UChar *source, *sourceLimit; | |
389 | uint8_t *target; | |
390 | int32_t targetCapacity; | |
391 | int32_t *offsets; | |
392 | ||
393 | int32_t prev, c, diff; | |
394 | ||
395 | int32_t sourceIndex, nextSourceIndex; | |
396 | ||
397 | U_ALIGN_CODE(16) | |
398 | ||
399 | /* set up the local pointers */ | |
400 | cnv=pArgs->converter; | |
401 | source=pArgs->source; | |
402 | sourceLimit=pArgs->sourceLimit; | |
403 | target=(uint8_t *)pArgs->target; | |
404 | targetCapacity=pArgs->targetLimit-pArgs->target; | |
405 | offsets=pArgs->offsets; | |
406 | ||
407 | /* get the converter state from UConverter */ | |
374ca955 | 408 | c=cnv->fromUChar32; |
b75a7d8f A |
409 | prev=(int32_t)cnv->fromUnicodeStatus; |
410 | if(prev==0) { | |
411 | prev=BOCU1_ASCII_PREV; | |
412 | } | |
413 | ||
414 | /* sourceIndex=-1 if the current character began in the previous buffer */ | |
415 | sourceIndex= c==0 ? 0 : -1; | |
416 | nextSourceIndex=0; | |
417 | ||
418 | /* conversion loop */ | |
419 | if(c!=0 && targetCapacity>0) { | |
420 | goto getTrail; | |
421 | } | |
422 | ||
423 | fastSingle: | |
424 | /* fast loop for single-byte differences */ | |
425 | /* use only one loop counter variable, targetCapacity, not also source */ | |
426 | diff=sourceLimit-source; | |
427 | if(targetCapacity>diff) { | |
428 | targetCapacity=diff; | |
429 | } | |
374ca955 A |
430 | while(targetCapacity>0 && (c=*source)<0x3000) { |
431 | if(c<=0x20) { | |
432 | if(c!=0x20) { | |
433 | prev=BOCU1_ASCII_PREV; | |
b75a7d8f | 434 | } |
374ca955 A |
435 | *target++=(uint8_t)c; |
436 | *offsets++=nextSourceIndex++; | |
b75a7d8f A |
437 | ++source; |
438 | --targetCapacity; | |
374ca955 A |
439 | } else { |
440 | diff=c-prev; | |
441 | if(DIFF_IS_SINGLE(diff)) { | |
442 | prev=BOCU1_SIMPLE_PREV(c); | |
443 | *target++=(uint8_t)PACK_SINGLE_DIFF(diff); | |
b75a7d8f A |
444 | *offsets++=nextSourceIndex++; |
445 | ++source; | |
446 | --targetCapacity; | |
447 | } else { | |
374ca955 | 448 | break; |
b75a7d8f A |
449 | } |
450 | } | |
451 | } | |
452 | /* restore real values */ | |
453 | targetCapacity=(const uint8_t *)pArgs->targetLimit-target; | |
454 | sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ | |
455 | ||
456 | /* regular loop for all cases */ | |
457 | while(source<sourceLimit) { | |
458 | if(targetCapacity>0) { | |
459 | c=*source++; | |
460 | ++nextSourceIndex; | |
461 | ||
462 | if(c<=0x20) { | |
463 | /* | |
464 | * ISO C0 control & space: | |
465 | * Encode directly for MIME compatibility, | |
466 | * and reset state except for space, to not disrupt compression. | |
467 | */ | |
468 | if(c!=0x20) { | |
469 | prev=BOCU1_ASCII_PREV; | |
470 | } | |
471 | *target++=(uint8_t)c; | |
374ca955 | 472 | *offsets++=sourceIndex; |
b75a7d8f A |
473 | --targetCapacity; |
474 | ||
475 | sourceIndex=nextSourceIndex; | |
476 | continue; | |
477 | } | |
478 | ||
479 | if(UTF_IS_LEAD(c)) { | |
480 | getTrail: | |
481 | if(source<sourceLimit) { | |
482 | /* test the following code unit */ | |
483 | UChar trail=*source; | |
484 | if(UTF_IS_SECOND_SURROGATE(trail)) { | |
485 | ++source; | |
486 | ++nextSourceIndex; | |
487 | c=UTF16_GET_PAIR_VALUE(c, trail); | |
488 | } | |
489 | } else { | |
490 | /* no more input */ | |
491 | c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ | |
492 | break; | |
493 | } | |
494 | } | |
495 | ||
496 | /* | |
497 | * all other Unicode code points c==U+0021..U+10ffff | |
498 | * are encoded with the difference c-prev | |
499 | * | |
500 | * a new prev is computed from c, | |
501 | * placed in the middle of a 0x80-block (for most small scripts) or | |
502 | * in the middle of the Unihan and Hangul blocks | |
503 | * to statistically minimize the following difference | |
504 | */ | |
505 | diff=c-prev; | |
506 | prev=BOCU1_PREV(c); | |
507 | if(DIFF_IS_SINGLE(diff)) { | |
508 | *target++=(uint8_t)PACK_SINGLE_DIFF(diff); | |
374ca955 | 509 | *offsets++=sourceIndex; |
b75a7d8f A |
510 | --targetCapacity; |
511 | sourceIndex=nextSourceIndex; | |
512 | if(c<0x3000) { | |
513 | goto fastSingle; | |
514 | } | |
515 | } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { | |
516 | /* optimize 2-byte case */ | |
517 | int32_t m; | |
518 | ||
519 | if(diff>=0) { | |
520 | diff-=BOCU1_REACH_POS_1+1; | |
521 | m=diff%BOCU1_TRAIL_COUNT; | |
522 | diff/=BOCU1_TRAIL_COUNT; | |
523 | diff+=BOCU1_START_POS_2; | |
524 | } else { | |
525 | diff-=BOCU1_REACH_NEG_1; | |
526 | NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); | |
527 | diff+=BOCU1_START_NEG_2; | |
528 | } | |
529 | *target++=(uint8_t)diff; | |
530 | *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); | |
374ca955 A |
531 | *offsets++=sourceIndex; |
532 | *offsets++=sourceIndex; | |
b75a7d8f A |
533 | targetCapacity-=2; |
534 | sourceIndex=nextSourceIndex; | |
535 | } else { | |
536 | int32_t length; /* will be 2..4 */ | |
537 | ||
538 | diff=packDiff(diff); | |
539 | length=BOCU1_LENGTH_FROM_PACKED(diff); | |
540 | ||
541 | /* write the output character bytes from diff and length */ | |
542 | /* from the first if in the loop we know that targetCapacity>0 */ | |
543 | if(length<=targetCapacity) { | |
374ca955 A |
544 | switch(length) { |
545 | /* each branch falls through to the next one */ | |
546 | case 4: | |
547 | *target++=(uint8_t)(diff>>24); | |
548 | *offsets++=sourceIndex; | |
549 | case 3: | |
550 | *target++=(uint8_t)(diff>>16); | |
551 | *offsets++=sourceIndex; | |
552 | case 2: | |
553 | *target++=(uint8_t)(diff>>8); | |
554 | *offsets++=sourceIndex; | |
555 | /* case 1: handled above */ | |
556 | *target++=(uint8_t)diff; | |
557 | *offsets++=sourceIndex; | |
558 | default: | |
559 | /* will never occur */ | |
560 | break; | |
b75a7d8f A |
561 | } |
562 | targetCapacity-=length; | |
563 | sourceIndex=nextSourceIndex; | |
564 | } else { | |
565 | uint8_t *charErrorBuffer; | |
566 | ||
567 | /* | |
568 | * We actually do this backwards here: | |
569 | * In order to save an intermediate variable, we output | |
570 | * first to the overflow buffer what does not fit into the | |
571 | * regular target. | |
572 | */ | |
573 | /* we know that 1<=targetCapacity<length<=4 */ | |
574 | length-=targetCapacity; | |
575 | charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; | |
576 | switch(length) { | |
577 | /* each branch falls through to the next one */ | |
578 | case 3: | |
579 | *charErrorBuffer++=(uint8_t)(diff>>16); | |
580 | case 2: | |
581 | *charErrorBuffer++=(uint8_t)(diff>>8); | |
582 | case 1: | |
583 | *charErrorBuffer=(uint8_t)diff; | |
584 | default: | |
585 | /* will never occur */ | |
586 | break; | |
587 | } | |
588 | cnv->charErrorBufferLength=(int8_t)length; | |
589 | ||
590 | /* now output what fits into the regular target */ | |
591 | diff>>=8*length; /* length was reduced by targetCapacity */ | |
592 | switch(targetCapacity) { | |
593 | /* each branch falls through to the next one */ | |
594 | case 3: | |
595 | *target++=(uint8_t)(diff>>16); | |
374ca955 | 596 | *offsets++=sourceIndex; |
b75a7d8f A |
597 | case 2: |
598 | *target++=(uint8_t)(diff>>8); | |
374ca955 | 599 | *offsets++=sourceIndex; |
b75a7d8f A |
600 | case 1: |
601 | *target++=(uint8_t)diff; | |
374ca955 | 602 | *offsets++=sourceIndex; |
b75a7d8f A |
603 | default: |
604 | /* will never occur */ | |
605 | break; | |
606 | } | |
607 | ||
608 | /* target overflow */ | |
609 | targetCapacity=0; | |
610 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
611 | break; | |
612 | } | |
613 | } | |
614 | } else { | |
615 | /* target is full */ | |
616 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
617 | break; | |
618 | } | |
619 | } | |
620 | ||
374ca955 A |
621 | /* set the converter state back into UConverter */ |
622 | cnv->fromUChar32= c<0 ? -c : 0; | |
623 | cnv->fromUnicodeStatus=(uint32_t)prev; | |
b75a7d8f A |
624 | |
625 | /* write back the updated pointers */ | |
626 | pArgs->source=source; | |
627 | pArgs->target=(char *)target; | |
628 | pArgs->offsets=offsets; | |
629 | } | |
630 | ||
631 | /* | |
632 | * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling. | |
633 | * If a change is made in the original function, then either | |
634 | * change this function the same way or | |
635 | * re-copy the original function and remove the variables | |
636 | * offsets, sourceIndex, and nextSourceIndex. | |
637 | */ | |
638 | static void | |
639 | _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs, | |
640 | UErrorCode *pErrorCode) { | |
641 | UConverter *cnv; | |
642 | const UChar *source, *sourceLimit; | |
643 | uint8_t *target; | |
644 | int32_t targetCapacity; | |
645 | ||
646 | int32_t prev, c, diff; | |
647 | ||
648 | /* set up the local pointers */ | |
649 | cnv=pArgs->converter; | |
650 | source=pArgs->source; | |
651 | sourceLimit=pArgs->sourceLimit; | |
652 | target=(uint8_t *)pArgs->target; | |
653 | targetCapacity=pArgs->targetLimit-pArgs->target; | |
654 | ||
655 | /* get the converter state from UConverter */ | |
374ca955 | 656 | c=cnv->fromUChar32; |
b75a7d8f A |
657 | prev=(int32_t)cnv->fromUnicodeStatus; |
658 | if(prev==0) { | |
659 | prev=BOCU1_ASCII_PREV; | |
660 | } | |
661 | ||
662 | /* conversion loop */ | |
663 | if(c!=0 && targetCapacity>0) { | |
664 | goto getTrail; | |
665 | } | |
666 | ||
667 | fastSingle: | |
668 | /* fast loop for single-byte differences */ | |
669 | /* use only one loop counter variable, targetCapacity, not also source */ | |
670 | diff=sourceLimit-source; | |
671 | if(targetCapacity>diff) { | |
672 | targetCapacity=diff; | |
673 | } | |
674 | while(targetCapacity>0 && (c=*source)<0x3000) { | |
675 | if(c<=0x20) { | |
676 | if(c!=0x20) { | |
677 | prev=BOCU1_ASCII_PREV; | |
678 | } | |
679 | *target++=(uint8_t)c; | |
680 | } else { | |
681 | diff=c-prev; | |
682 | if(DIFF_IS_SINGLE(diff)) { | |
683 | prev=BOCU1_SIMPLE_PREV(c); | |
684 | *target++=(uint8_t)PACK_SINGLE_DIFF(diff); | |
685 | } else { | |
686 | break; | |
687 | } | |
688 | } | |
689 | ++source; | |
690 | --targetCapacity; | |
691 | } | |
692 | /* restore real values */ | |
693 | targetCapacity=(const uint8_t *)pArgs->targetLimit-target; | |
694 | ||
695 | /* regular loop for all cases */ | |
696 | while(source<sourceLimit) { | |
697 | if(targetCapacity>0) { | |
698 | c=*source++; | |
699 | ||
700 | if(c<=0x20) { | |
701 | /* | |
702 | * ISO C0 control & space: | |
703 | * Encode directly for MIME compatibility, | |
704 | * and reset state except for space, to not disrupt compression. | |
705 | */ | |
706 | if(c!=0x20) { | |
707 | prev=BOCU1_ASCII_PREV; | |
708 | } | |
709 | *target++=(uint8_t)c; | |
710 | --targetCapacity; | |
711 | continue; | |
712 | } | |
713 | ||
714 | if(UTF_IS_LEAD(c)) { | |
715 | getTrail: | |
716 | if(source<sourceLimit) { | |
717 | /* test the following code unit */ | |
718 | UChar trail=*source; | |
719 | if(UTF_IS_SECOND_SURROGATE(trail)) { | |
720 | ++source; | |
721 | c=UTF16_GET_PAIR_VALUE(c, trail); | |
722 | } | |
723 | } else { | |
724 | /* no more input */ | |
725 | c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ | |
726 | break; | |
727 | } | |
728 | } | |
729 | ||
730 | /* | |
731 | * all other Unicode code points c==U+0021..U+10ffff | |
732 | * are encoded with the difference c-prev | |
733 | * | |
734 | * a new prev is computed from c, | |
735 | * placed in the middle of a 0x80-block (for most small scripts) or | |
736 | * in the middle of the Unihan and Hangul blocks | |
737 | * to statistically minimize the following difference | |
738 | */ | |
739 | diff=c-prev; | |
740 | prev=BOCU1_PREV(c); | |
741 | if(DIFF_IS_SINGLE(diff)) { | |
742 | *target++=(uint8_t)PACK_SINGLE_DIFF(diff); | |
743 | --targetCapacity; | |
744 | if(c<0x3000) { | |
745 | goto fastSingle; | |
746 | } | |
747 | } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { | |
748 | /* optimize 2-byte case */ | |
749 | int32_t m; | |
750 | ||
751 | if(diff>=0) { | |
752 | diff-=BOCU1_REACH_POS_1+1; | |
753 | m=diff%BOCU1_TRAIL_COUNT; | |
754 | diff/=BOCU1_TRAIL_COUNT; | |
755 | diff+=BOCU1_START_POS_2; | |
756 | } else { | |
757 | diff-=BOCU1_REACH_NEG_1; | |
758 | NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); | |
759 | diff+=BOCU1_START_NEG_2; | |
760 | } | |
761 | *target++=(uint8_t)diff; | |
762 | *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); | |
763 | targetCapacity-=2; | |
764 | } else { | |
765 | int32_t length; /* will be 2..4 */ | |
766 | ||
767 | diff=packDiff(diff); | |
768 | length=BOCU1_LENGTH_FROM_PACKED(diff); | |
769 | ||
770 | /* write the output character bytes from diff and length */ | |
771 | /* from the first if in the loop we know that targetCapacity>0 */ | |
772 | if(length<=targetCapacity) { | |
773 | switch(length) { | |
774 | /* each branch falls through to the next one */ | |
775 | case 4: | |
776 | *target++=(uint8_t)(diff>>24); | |
777 | case 3: | |
778 | *target++=(uint8_t)(diff>>16); | |
779 | /* case 2: handled above */ | |
780 | *target++=(uint8_t)(diff>>8); | |
781 | /* case 1: handled above */ | |
782 | *target++=(uint8_t)diff; | |
783 | default: | |
784 | /* will never occur */ | |
785 | break; | |
786 | } | |
787 | targetCapacity-=length; | |
788 | } else { | |
789 | uint8_t *charErrorBuffer; | |
790 | ||
791 | /* | |
792 | * We actually do this backwards here: | |
793 | * In order to save an intermediate variable, we output | |
794 | * first to the overflow buffer what does not fit into the | |
795 | * regular target. | |
796 | */ | |
797 | /* we know that 1<=targetCapacity<length<=4 */ | |
798 | length-=targetCapacity; | |
799 | charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; | |
800 | switch(length) { | |
801 | /* each branch falls through to the next one */ | |
802 | case 3: | |
803 | *charErrorBuffer++=(uint8_t)(diff>>16); | |
804 | case 2: | |
805 | *charErrorBuffer++=(uint8_t)(diff>>8); | |
806 | case 1: | |
807 | *charErrorBuffer=(uint8_t)diff; | |
808 | default: | |
809 | /* will never occur */ | |
810 | break; | |
811 | } | |
812 | cnv->charErrorBufferLength=(int8_t)length; | |
813 | ||
814 | /* now output what fits into the regular target */ | |
815 | diff>>=8*length; /* length was reduced by targetCapacity */ | |
816 | switch(targetCapacity) { | |
817 | /* each branch falls through to the next one */ | |
818 | case 3: | |
819 | *target++=(uint8_t)(diff>>16); | |
820 | case 2: | |
821 | *target++=(uint8_t)(diff>>8); | |
822 | case 1: | |
823 | *target++=(uint8_t)diff; | |
824 | default: | |
825 | /* will never occur */ | |
826 | break; | |
827 | } | |
828 | ||
829 | /* target overflow */ | |
830 | targetCapacity=0; | |
831 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
832 | break; | |
833 | } | |
834 | } | |
835 | } else { | |
836 | /* target is full */ | |
837 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
838 | break; | |
839 | } | |
840 | } | |
841 | ||
374ca955 A |
842 | /* set the converter state back into UConverter */ |
843 | cnv->fromUChar32= c<0 ? -c : 0; | |
844 | cnv->fromUnicodeStatus=(uint32_t)prev; | |
b75a7d8f A |
845 | |
846 | /* write back the updated pointers */ | |
847 | pArgs->source=source; | |
848 | pArgs->target=(char *)target; | |
849 | } | |
850 | ||
851 | /* BOCU-1-to-Unicode conversion functions ----------------------------------- */ | |
852 | ||
853 | /** | |
854 | * Function for BOCU-1 decoder; handles multi-byte lead bytes. | |
855 | * | |
856 | * @param b lead byte; | |
857 | * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD | |
858 | * @return (diff<<2)|count | |
859 | */ | |
860 | static U_INLINE int32_t | |
861 | decodeBocu1LeadByte(int32_t b) { | |
862 | int32_t diff, count; | |
863 | ||
864 | if(b>=BOCU1_START_NEG_2) { | |
865 | /* positive difference */ | |
866 | if(b<BOCU1_START_POS_3) { | |
867 | /* two bytes */ | |
868 | diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; | |
869 | count=1; | |
870 | } else if(b<BOCU1_START_POS_4) { | |
871 | /* three bytes */ | |
872 | diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1; | |
873 | count=2; | |
874 | } else { | |
875 | /* four bytes */ | |
876 | diff=BOCU1_REACH_POS_3+1; | |
877 | count=3; | |
878 | } | |
879 | } else { | |
880 | /* negative difference */ | |
881 | if(b>=BOCU1_START_NEG_3) { | |
882 | /* two bytes */ | |
883 | diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; | |
884 | count=1; | |
885 | } else if(b>BOCU1_MIN) { | |
886 | /* three bytes */ | |
887 | diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2; | |
888 | count=2; | |
889 | } else { | |
890 | /* four bytes */ | |
891 | diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3; | |
892 | count=3; | |
893 | } | |
894 | } | |
895 | ||
896 | /* return the state for decoding the trail byte(s) */ | |
897 | return (diff<<2)|count; | |
898 | } | |
899 | ||
900 | /** | |
901 | * Function for BOCU-1 decoder; handles multi-byte trail bytes. | |
902 | * | |
903 | * @param count number of remaining trail bytes including this one | |
904 | * @param b trail byte | |
905 | * @return new delta for diff including b - <0 indicates an error | |
906 | * | |
907 | * @see decodeBocu1 | |
908 | */ | |
909 | static U_INLINE int32_t | |
910 | decodeBocu1TrailByte(int32_t count, int32_t b) { | |
911 | if(b<=0x20) { | |
912 | /* skip some C0 controls and make the trail byte range contiguous */ | |
913 | b=bocu1ByteToTrail[b]; | |
914 | /* b<0 for an illegal trail byte value will result in return<0 below */ | |
915 | #if BOCU1_MAX_TRAIL<0xff | |
916 | } else if(b>BOCU1_MAX_TRAIL) { | |
917 | return -99; | |
918 | #endif | |
919 | } else { | |
920 | b-=BOCU1_TRAIL_BYTE_OFFSET; | |
921 | } | |
922 | ||
923 | /* add trail byte into difference and decrement count */ | |
924 | if(count==1) { | |
925 | return b; | |
926 | } else if(count==2) { | |
927 | return b*BOCU1_TRAIL_COUNT; | |
928 | } else /* count==3 */ { | |
929 | return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT); | |
930 | } | |
931 | } | |
932 | ||
933 | static void | |
934 | _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, | |
935 | UErrorCode *pErrorCode) { | |
936 | UConverter *cnv; | |
937 | const uint8_t *source, *sourceLimit; | |
938 | UChar *target; | |
939 | const UChar *targetLimit; | |
940 | int32_t *offsets; | |
941 | ||
942 | int32_t prev, count, diff, c; | |
943 | ||
944 | int8_t byteIndex; | |
945 | uint8_t *bytes; | |
946 | ||
947 | int32_t sourceIndex, nextSourceIndex; | |
948 | ||
949 | /* set up the local pointers */ | |
950 | cnv=pArgs->converter; | |
951 | source=(const uint8_t *)pArgs->source; | |
952 | sourceLimit=(const uint8_t *)pArgs->sourceLimit; | |
953 | target=pArgs->target; | |
954 | targetLimit=pArgs->targetLimit; | |
955 | offsets=pArgs->offsets; | |
956 | ||
957 | /* get the converter state from UConverter */ | |
958 | prev=(int32_t)cnv->toUnicodeStatus; | |
959 | if(prev==0) { | |
960 | prev=BOCU1_ASCII_PREV; | |
961 | } | |
962 | diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ | |
963 | count=diff&3; | |
964 | diff>>=2; | |
965 | ||
966 | byteIndex=cnv->toULength; | |
967 | bytes=cnv->toUBytes; | |
968 | ||
969 | /* sourceIndex=-1 if the current character began in the previous buffer */ | |
970 | sourceIndex=byteIndex==0 ? 0 : -1; | |
971 | nextSourceIndex=0; | |
972 | ||
973 | /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ | |
b75a7d8f A |
974 | if(count>0 && byteIndex>0 && target<targetLimit) { |
975 | goto getTrail; | |
976 | } | |
977 | ||
978 | fastSingle: | |
979 | /* fast loop for single-byte differences */ | |
980 | /* use count as the only loop counter variable */ | |
981 | diff=sourceLimit-source; | |
982 | count=pArgs->targetLimit-target; | |
983 | if(count>diff) { | |
984 | count=diff; | |
985 | } | |
374ca955 A |
986 | while(count>0) { |
987 | if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { | |
988 | c=prev+(c-BOCU1_MIDDLE); | |
989 | if(c<0x3000) { | |
b75a7d8f A |
990 | *target++=(UChar)c; |
991 | *offsets++=nextSourceIndex++; | |
374ca955 | 992 | prev=BOCU1_SIMPLE_PREV(c); |
b75a7d8f A |
993 | } else { |
994 | break; | |
995 | } | |
374ca955 A |
996 | } else if(c<=0x20) { |
997 | if(c!=0x20) { | |
998 | prev=BOCU1_ASCII_PREV; | |
999 | } | |
1000 | *target++=(UChar)c; | |
1001 | *offsets++=nextSourceIndex++; | |
1002 | } else { | |
1003 | break; | |
b75a7d8f | 1004 | } |
374ca955 A |
1005 | ++source; |
1006 | --count; | |
b75a7d8f | 1007 | } |
374ca955 | 1008 | sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ |
b75a7d8f A |
1009 | |
1010 | /* decode a sequence of single and lead bytes */ | |
1011 | while(source<sourceLimit) { | |
1012 | if(target>=targetLimit) { | |
1013 | /* target is full */ | |
1014 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
1015 | break; | |
1016 | } | |
1017 | ||
1018 | ++nextSourceIndex; | |
1019 | c=*source++; | |
1020 | if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { | |
1021 | /* Write a code point directly from a single-byte difference. */ | |
1022 | c=prev+(c-BOCU1_MIDDLE); | |
1023 | if(c<0x3000) { | |
1024 | *target++=(UChar)c; | |
374ca955 | 1025 | *offsets++=sourceIndex; |
b75a7d8f A |
1026 | prev=BOCU1_SIMPLE_PREV(c); |
1027 | sourceIndex=nextSourceIndex; | |
1028 | goto fastSingle; | |
1029 | } | |
1030 | } else if(c<=0x20) { | |
1031 | /* | |
1032 | * Direct-encoded C0 control code or space. | |
1033 | * Reset prev for C0 control codes but not for space. | |
1034 | */ | |
1035 | if(c!=0x20) { | |
1036 | prev=BOCU1_ASCII_PREV; | |
1037 | } | |
1038 | *target++=(UChar)c; | |
374ca955 | 1039 | *offsets++=sourceIndex; |
b75a7d8f A |
1040 | sourceIndex=nextSourceIndex; |
1041 | continue; | |
1042 | } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { | |
1043 | /* Optimize two-byte case. */ | |
1044 | if(c>=BOCU1_MIDDLE) { | |
1045 | diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; | |
1046 | } else { | |
1047 | diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; | |
1048 | } | |
1049 | ||
1050 | /* trail byte */ | |
1051 | ++nextSourceIndex; | |
1052 | c=decodeBocu1TrailByte(1, *source++); | |
1053 | if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { | |
1054 | bytes[0]=source[-2]; | |
1055 | bytes[1]=source[-1]; | |
1056 | byteIndex=2; | |
374ca955 A |
1057 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
1058 | break; | |
b75a7d8f A |
1059 | } |
1060 | } else if(c==BOCU1_RESET) { | |
1061 | /* only reset the state, no code point */ | |
1062 | prev=BOCU1_ASCII_PREV; | |
1063 | sourceIndex=nextSourceIndex; | |
1064 | continue; | |
1065 | } else { | |
1066 | /* | |
1067 | * For multi-byte difference lead bytes, set the decoder state | |
1068 | * with the partial difference value from the lead byte and | |
1069 | * with the number of trail bytes. | |
1070 | */ | |
1071 | bytes[0]=(uint8_t)c; | |
1072 | byteIndex=1; | |
1073 | ||
1074 | diff=decodeBocu1LeadByte(c); | |
1075 | count=diff&3; | |
1076 | diff>>=2; | |
1077 | getTrail: | |
1078 | for(;;) { | |
1079 | if(source>=sourceLimit) { | |
1080 | goto endloop; | |
1081 | } | |
1082 | ++nextSourceIndex; | |
1083 | c=bytes[byteIndex++]=*source++; | |
1084 | ||
1085 | /* trail byte in any position */ | |
1086 | c=decodeBocu1TrailByte(count, c); | |
1087 | if(c<0) { | |
374ca955 A |
1088 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
1089 | goto endloop; | |
b75a7d8f A |
1090 | } |
1091 | ||
1092 | diff+=c; | |
1093 | if(--count==0) { | |
1094 | /* final trail byte, deliver a code point */ | |
1095 | byteIndex=0; | |
1096 | c=prev+diff; | |
1097 | if((uint32_t)c>0x10ffff) { | |
374ca955 A |
1098 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
1099 | goto endloop; | |
b75a7d8f A |
1100 | } |
1101 | break; | |
1102 | } | |
1103 | } | |
1104 | } | |
1105 | ||
1106 | /* calculate the next prev and output c */ | |
1107 | prev=BOCU1_PREV(c); | |
1108 | if(c<=0xffff) { | |
1109 | *target++=(UChar)c; | |
374ca955 | 1110 | *offsets++=sourceIndex; |
b75a7d8f A |
1111 | } else { |
1112 | /* output surrogate pair */ | |
1113 | *target++=UTF16_LEAD(c); | |
1114 | if(target<targetLimit) { | |
1115 | *target++=UTF16_TRAIL(c); | |
374ca955 A |
1116 | *offsets++=sourceIndex; |
1117 | *offsets++=sourceIndex; | |
b75a7d8f A |
1118 | } else { |
1119 | /* target overflow */ | |
374ca955 | 1120 | *offsets++=sourceIndex; |
b75a7d8f A |
1121 | cnv->UCharErrorBuffer[0]=UTF16_TRAIL(c); |
1122 | cnv->UCharErrorBufferLength=1; | |
1123 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
1124 | break; | |
1125 | } | |
1126 | } | |
1127 | sourceIndex=nextSourceIndex; | |
1128 | } | |
1129 | endloop: | |
1130 | ||
374ca955 A |
1131 | if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { |
1132 | /* set the converter state in UConverter to deal with the next character */ | |
b75a7d8f A |
1133 | cnv->toUnicodeStatus=BOCU1_ASCII_PREV; |
1134 | cnv->mode=0; | |
b75a7d8f A |
1135 | } else { |
1136 | /* set the converter state back into UConverter */ | |
1137 | cnv->toUnicodeStatus=(uint32_t)prev; | |
1138 | cnv->mode=(diff<<2)|count; | |
b75a7d8f | 1139 | } |
374ca955 | 1140 | cnv->toULength=byteIndex; |
b75a7d8f | 1141 | |
b75a7d8f A |
1142 | /* write back the updated pointers */ |
1143 | pArgs->source=(const char *)source; | |
1144 | pArgs->target=target; | |
1145 | pArgs->offsets=offsets; | |
1146 | return; | |
b75a7d8f A |
1147 | } |
1148 | ||
1149 | /* | |
1150 | * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling. | |
1151 | * If a change is made in the original function, then either | |
1152 | * change this function the same way or | |
1153 | * re-copy the original function and remove the variables | |
1154 | * offsets, sourceIndex, and nextSourceIndex. | |
1155 | */ | |
1156 | static void | |
1157 | _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs, | |
1158 | UErrorCode *pErrorCode) { | |
1159 | UConverter *cnv; | |
1160 | const uint8_t *source, *sourceLimit; | |
1161 | UChar *target; | |
1162 | const UChar *targetLimit; | |
1163 | ||
1164 | int32_t prev, count, diff, c; | |
1165 | ||
1166 | int8_t byteIndex; | |
1167 | uint8_t *bytes; | |
1168 | ||
1169 | U_ALIGN_CODE(16) | |
1170 | ||
1171 | /* set up the local pointers */ | |
1172 | cnv=pArgs->converter; | |
1173 | source=(const uint8_t *)pArgs->source; | |
1174 | sourceLimit=(const uint8_t *)pArgs->sourceLimit; | |
1175 | target=pArgs->target; | |
1176 | targetLimit=pArgs->targetLimit; | |
1177 | ||
1178 | /* get the converter state from UConverter */ | |
1179 | prev=(int32_t)cnv->toUnicodeStatus; | |
1180 | if(prev==0) { | |
1181 | prev=BOCU1_ASCII_PREV; | |
1182 | } | |
1183 | diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ | |
1184 | count=diff&3; | |
1185 | diff>>=2; | |
1186 | ||
1187 | byteIndex=cnv->toULength; | |
1188 | bytes=cnv->toUBytes; | |
1189 | ||
1190 | /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ | |
b75a7d8f A |
1191 | if(count>0 && byteIndex>0 && target<targetLimit) { |
1192 | goto getTrail; | |
1193 | } | |
1194 | ||
1195 | fastSingle: | |
1196 | /* fast loop for single-byte differences */ | |
1197 | /* use count as the only loop counter variable */ | |
1198 | diff=sourceLimit-source; | |
1199 | count=pArgs->targetLimit-target; | |
1200 | if(count>diff) { | |
1201 | count=diff; | |
1202 | } | |
1203 | while(count>0) { | |
1204 | if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { | |
1205 | c=prev+(c-BOCU1_MIDDLE); | |
1206 | if(c<0x3000) { | |
1207 | *target++=(UChar)c; | |
1208 | prev=BOCU1_SIMPLE_PREV(c); | |
1209 | } else { | |
1210 | break; | |
1211 | } | |
1212 | } else if(c<=0x20) { | |
1213 | if(c!=0x20) { | |
1214 | prev=BOCU1_ASCII_PREV; | |
1215 | } | |
1216 | *target++=(UChar)c; | |
1217 | } else { | |
1218 | break; | |
1219 | } | |
1220 | ++source; | |
1221 | --count; | |
1222 | } | |
1223 | ||
1224 | /* decode a sequence of single and lead bytes */ | |
1225 | while(source<sourceLimit) { | |
1226 | if(target>=targetLimit) { | |
1227 | /* target is full */ | |
1228 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
1229 | break; | |
1230 | } | |
1231 | ||
1232 | c=*source++; | |
1233 | if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { | |
1234 | /* Write a code point directly from a single-byte difference. */ | |
1235 | c=prev+(c-BOCU1_MIDDLE); | |
1236 | if(c<0x3000) { | |
1237 | *target++=(UChar)c; | |
1238 | prev=BOCU1_SIMPLE_PREV(c); | |
1239 | goto fastSingle; | |
1240 | } | |
1241 | } else if(c<=0x20) { | |
1242 | /* | |
1243 | * Direct-encoded C0 control code or space. | |
1244 | * Reset prev for C0 control codes but not for space. | |
1245 | */ | |
1246 | if(c!=0x20) { | |
1247 | prev=BOCU1_ASCII_PREV; | |
1248 | } | |
1249 | *target++=(UChar)c; | |
1250 | continue; | |
1251 | } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { | |
1252 | /* Optimize two-byte case. */ | |
1253 | if(c>=BOCU1_MIDDLE) { | |
1254 | diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; | |
1255 | } else { | |
1256 | diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; | |
1257 | } | |
1258 | ||
1259 | /* trail byte */ | |
1260 | c=decodeBocu1TrailByte(1, *source++); | |
1261 | if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { | |
1262 | bytes[0]=source[-2]; | |
1263 | bytes[1]=source[-1]; | |
1264 | byteIndex=2; | |
374ca955 A |
1265 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
1266 | break; | |
b75a7d8f A |
1267 | } |
1268 | } else if(c==BOCU1_RESET) { | |
1269 | /* only reset the state, no code point */ | |
1270 | prev=BOCU1_ASCII_PREV; | |
1271 | continue; | |
1272 | } else { | |
1273 | /* | |
1274 | * For multi-byte difference lead bytes, set the decoder state | |
1275 | * with the partial difference value from the lead byte and | |
1276 | * with the number of trail bytes. | |
1277 | */ | |
1278 | bytes[0]=(uint8_t)c; | |
1279 | byteIndex=1; | |
1280 | ||
1281 | diff=decodeBocu1LeadByte(c); | |
1282 | count=diff&3; | |
1283 | diff>>=2; | |
1284 | getTrail: | |
1285 | for(;;) { | |
1286 | if(source>=sourceLimit) { | |
1287 | goto endloop; | |
1288 | } | |
1289 | c=bytes[byteIndex++]=*source++; | |
1290 | ||
1291 | /* trail byte in any position */ | |
1292 | c=decodeBocu1TrailByte(count, c); | |
1293 | if(c<0) { | |
374ca955 A |
1294 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
1295 | goto endloop; | |
b75a7d8f A |
1296 | } |
1297 | ||
1298 | diff+=c; | |
1299 | if(--count==0) { | |
1300 | /* final trail byte, deliver a code point */ | |
1301 | byteIndex=0; | |
1302 | c=prev+diff; | |
1303 | if((uint32_t)c>0x10ffff) { | |
374ca955 A |
1304 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
1305 | goto endloop; | |
b75a7d8f A |
1306 | } |
1307 | break; | |
1308 | } | |
1309 | } | |
1310 | } | |
1311 | ||
1312 | /* calculate the next prev and output c */ | |
1313 | prev=BOCU1_PREV(c); | |
1314 | if(c<=0xffff) { | |
1315 | *target++=(UChar)c; | |
1316 | } else { | |
1317 | /* output surrogate pair */ | |
1318 | *target++=UTF16_LEAD(c); | |
1319 | if(target<targetLimit) { | |
1320 | *target++=UTF16_TRAIL(c); | |
1321 | } else { | |
1322 | /* target overflow */ | |
1323 | cnv->UCharErrorBuffer[0]=UTF16_TRAIL(c); | |
1324 | cnv->UCharErrorBufferLength=1; | |
1325 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
1326 | break; | |
1327 | } | |
1328 | } | |
1329 | } | |
1330 | endloop: | |
1331 | ||
374ca955 A |
1332 | if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { |
1333 | /* set the converter state in UConverter to deal with the next character */ | |
b75a7d8f A |
1334 | cnv->toUnicodeStatus=BOCU1_ASCII_PREV; |
1335 | cnv->mode=0; | |
b75a7d8f A |
1336 | } else { |
1337 | /* set the converter state back into UConverter */ | |
1338 | cnv->toUnicodeStatus=(uint32_t)prev; | |
1339 | cnv->mode=(diff<<2)|count; | |
b75a7d8f | 1340 | } |
374ca955 | 1341 | cnv->toULength=byteIndex; |
b75a7d8f | 1342 | |
b75a7d8f A |
1343 | /* write back the updated pointers */ |
1344 | pArgs->source=(const char *)source; | |
1345 | pArgs->target=target; | |
1346 | return; | |
b75a7d8f A |
1347 | } |
1348 | ||
1349 | /* miscellaneous ------------------------------------------------------------ */ | |
1350 | ||
1351 | static const UConverterImpl _Bocu1Impl={ | |
1352 | UCNV_BOCU1, | |
1353 | ||
1354 | NULL, | |
1355 | NULL, | |
1356 | ||
1357 | NULL, | |
1358 | NULL, | |
1359 | NULL, | |
1360 | ||
1361 | _Bocu1ToUnicode, | |
1362 | _Bocu1ToUnicodeWithOffsets, | |
1363 | _Bocu1FromUnicode, | |
1364 | _Bocu1FromUnicodeWithOffsets, | |
1365 | NULL, | |
1366 | ||
1367 | NULL, | |
1368 | NULL, | |
1369 | NULL, | |
1370 | NULL, | |
1371 | ucnv_getCompleteUnicodeSet | |
1372 | }; | |
1373 | ||
1374 | static const UConverterStaticData _Bocu1StaticData={ | |
1375 | sizeof(UConverterStaticData), | |
1376 | "BOCU-1", | |
1377 | 0, /* CCSID for BOCU-1 */ | |
1378 | UCNV_IBM, UCNV_BOCU1, | |
1379 | 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */ | |
1380 | { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */ | |
1381 | FALSE, FALSE, | |
1382 | 0, | |
1383 | 0, | |
1384 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ | |
1385 | }; | |
1386 | ||
1387 | const UConverterSharedData _Bocu1Data={ | |
1388 | sizeof(UConverterSharedData), ~((uint32_t)0), | |
1389 | NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl, | |
1390 | 0 | |
1391 | }; | |
374ca955 A |
1392 | |
1393 | #endif |