/*
 * Source: git.saurik.com Git - apple/icu.git, blob icuSources/common/ucnv_u7.c
 * (ICU-6.2.4.tar.gz)
 */
1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u7.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
13 *
14 * UTF-7 converter implementation. Used to be in ucnv_utf.c.
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_CONVERSION
20
21 #include "unicode/ucnv.h"
22 #include "ucnv_bld.h"
23 #include "ucnv_cnv.h"
24
25 /* UTF-7 -------------------------------------------------------------------- */
26
27 /*
28 * UTF-7 is a stateful encoding of Unicode.
29 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
30 * It was intended for use in Internet email systems, using in its bytewise
31 * encoding only a subset of 7-bit US-ASCII.
32 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
33 * occasionally used.
34 *
35 * For converting Unicode to UTF-7, the RFC allows to encode some US-ASCII
36 * characters directly or in base64. Especially, the characters in set O
37 * as defined in the RFC (see below) may be encoded directly but are not
38 * allowed in, e.g., email headers.
39 * By default, the ICU UTF-7 converter encodes set O directly.
40 * By choosing the option "version=1", set O will be escaped instead.
41 * For example:
42 * utf7Converter=ucnv_open("UTF-7,version=1");
43 *
44 * For details about email headers see RFC 2047.
45 */
46
47 /*
48 * Tests for US-ASCII characters belonging to character classes
49 * defined in UTF-7.
50 *
51 * Set D (directly encoded characters) consists of the following
52 * characters: the upper and lower case letters A through Z
53 * and a through z, the 10 digits 0-9, and the following nine special
54 * characters (note that "+" and "=" are omitted):
55 * '(),-./:?
56 *
57 * Set O (optional direct characters) consists of the following
58 * characters (note that "\" and "~" are omitted):
59 * !"#$%&*;<=>@[]^_`{|}
60 *
61 * According to the rules in RFC 2152, the byte values for the following
62 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
63 * - all C0 control codes except for CR LF TAB
64 * - BACKSLASH
65 * - TILDE
66 * - DEL
67 * - all codes beyond US-ASCII, i.e. all >127
68 */
/*
 * inSetD(c): nonzero if c is a member of Set D (directly encoded characters):
 * A-Z, a-z, 0-9 and the nine specials '(),-./:?
 *
 * Code points are written as numbers, not character literals.
 * Each range test subtracts the range start and truncates to uint8_t so that
 * values below the range wrap to large numbers; c is therefore expected to be
 * a byte value (0..255). The argument is evaluated several times, so it must
 * not have side effects.
 */
#define inSetD(c) \
    ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \
     (uint8_t)((c)-48)<10 ||    /* digits */ \
     (uint8_t)((c)-39)<3 ||     /* '() */ \
     (uint8_t)((c)-44)<4 ||     /* ,-./ */ \
     (c)==58 || (c)==63         /* :? */ \
    )
76
/*
 * inSetO(c): nonzero if c is a member of Set O (optional direct characters):
 * !"#$%&*;<=>@[]^_`{|}
 *
 * Backslash (92) and tilde (126) are deliberately not members.
 * Same conventions as inSetD: numeric code points, uint8_t wrap-around range
 * tests (c is expected to be a byte value), and multiple evaluation of c.
 */
#define inSetO(c) \
    ((uint8_t)((c)-33)<6 ||         /* !"#$%& */ \
     (uint8_t)((c)-59)<4 ||         /* ;<=> */ \
     (uint8_t)((c)-93)<4 ||         /* ]^_` */ \
     (uint8_t)((c)-123)<3 ||        /* {|} */ \
     (c)==42 || (c)==64 || (c)==91  /* *@[ */ \
    )
84
/* whitespace classes: CR (13), LF (10), TAB (9), optionally SPACE (32) */
#define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9)
#define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9)

/* US-ASCII code points with a special role in UTF-7 */
#define PLUS 43
#define MINUS 45
#define BACKSLASH 92
#define TILDE 126

/*
 * legal byte values: CR LF TAB, and the US-ASCII characters from space (32)
 * up to but not including tilde (i.e. 32..125), except for backslash.
 * The uint8_t truncation makes values below 32 wrap to large numbers, so c is
 * expected to be a byte value; c is evaluated more than once.
 */
#define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
95
96 /* encode directly sets D and O and CR LF SP TAB */
97 static const UBool encodeDirectlyMaximum[128]={
98 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
99 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101
102 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
103 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
104
105 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
106 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
107
108 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
109 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
110 };
111
112 /* encode directly set D and CR LF SP TAB but not set O */
113 static const UBool encodeDirectlyRestricted[128]={
114 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
115 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
117
118 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
119 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
120
121 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
122 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
123
124 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
125 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
126 };
127
/*
 * Maps a 6-bit value (0..63) to the US-ASCII code point of its base64 digit:
 * A-Z, a-z, 0-9, '+', '/'.
 */
static const uint8_t
toBase64[64]={
    /* A-Z */
    65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
    78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
    /* a-z */
    97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
    110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
    /* 0-9 */
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
    /* +/ */
    43, 47
};
141
/*
 * Classifies a US-ASCII byte (0..127) for the toUnicode state machines:
 *   0..63  the byte is a base64 digit with this value
 *   -1     legal character that is not a base64 digit (ends a base64 run)
 *   -2     the minus sign, which terminates a base64 run and is absorbed
 *   -3     byte value that is illegal in UTF-7
 */
static const int8_t
fromBase64[128]={
    /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,

    /* general punctuation with + and / and a special value (-2) for - */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
    /* digits */
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,

    /* A-Z */
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,

    /* a-z */
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
};
161
162 /*
163 * converter status values:
164 *
165 * toUnicodeStatus:
166 * 24 inDirectMode (boolean)
167 * 23..16 base64Counter (-1..7)
168 * 15..0 bits (up to 14 bits incoming base64)
169 *
170 * fromUnicodeStatus:
171 * 31..28 version (0: set O direct 1: set O escaped)
172 * 24 inDirectMode (boolean)
173 * 23..16 base64Counter (0..2)
174 * 7..0 bits (6 bits outgoing base64)
175 *
176 */
177
178 static void
179 _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
180 if(choice<=UCNV_RESET_TO_UNICODE) {
181 /* reset toUnicode */
182 cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
183 cnv->toULength=0;
184 }
185 if(choice!=UCNV_RESET_TO_UNICODE) {
186 /* reset fromUnicode */
187 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
188 }
189 }
190
191 static void
192 _UTF7Open(UConverter *cnv,
193 const char *name,
194 const char *locale,
195 uint32_t options,
196 UErrorCode *pErrorCode) {
197 if((options&0xf)<=1) {
198 cnv->fromUnicodeStatus=(options&0xf)<<28;
199 _UTF7Reset(cnv, UCNV_RESET_BOTH);
200 } else {
201 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
202 }
203 }
204
205 static void
206 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
207 UErrorCode *pErrorCode) {
208 UConverter *cnv;
209 const uint8_t *source, *sourceLimit;
210 UChar *target;
211 const UChar *targetLimit;
212 int32_t *offsets;
213
214 uint8_t *bytes;
215 uint8_t byteIndex;
216
217 int32_t length, targetCapacity;
218
219 /* UTF-7 state */
220 uint16_t bits;
221 int8_t base64Counter;
222 UBool inDirectMode;
223
224 int8_t base64Value;
225
226 int32_t sourceIndex, nextSourceIndex;
227
228 uint8_t b;
229 /* set up the local pointers */
230 cnv=pArgs->converter;
231
232 source=(const uint8_t *)pArgs->source;
233 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
234 target=pArgs->target;
235 targetLimit=pArgs->targetLimit;
236 offsets=pArgs->offsets;
237 /* get the state machine state */
238 {
239 uint32_t status=cnv->toUnicodeStatus;
240 inDirectMode=(UBool)((status>>24)&1);
241 base64Counter=(int8_t)(status>>16);
242 bits=(uint16_t)status;
243 }
244 bytes=cnv->toUBytes;
245 byteIndex=cnv->toULength;
246
247 /* sourceIndex=-1 if the current character began in the previous buffer */
248 sourceIndex=byteIndex==0 ? 0 : -1;
249 nextSourceIndex=0;
250
251 if(inDirectMode) {
252 directMode:
253 /*
254 * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
255 * with their US-ASCII byte values.
256 * Backslash and Tilde and most control characters are not allowed in UTF-7.
257 * A plus sign starts Unicode (or "escape") Mode.
258 *
259 * In Direct Mode, only the sourceIndex is used.
260 */
261 byteIndex=0;
262 length=sourceLimit-source;
263 targetCapacity=targetLimit-target;
264 if(length>targetCapacity) {
265 length=targetCapacity;
266 }
267 while(length>0) {
268 b=*source++;
269 if(!isLegalUTF7(b)) {
270 /* illegal */
271 bytes[0]=b;
272 byteIndex=1;
273 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
274 break;
275 } else if(b!=PLUS) {
276 /* write directly encoded character */
277 *target++=b;
278 if(offsets!=NULL) {
279 *offsets++=sourceIndex++;
280 }
281 } else /* PLUS */ {
282 /* switch to Unicode mode */
283 nextSourceIndex=++sourceIndex;
284 inDirectMode=FALSE;
285 byteIndex=0;
286 bits=0;
287 base64Counter=-1;
288 goto unicodeMode;
289 }
290 --length;
291 }
292 if(source<sourceLimit && target>=targetLimit) {
293 /* target is full */
294 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
295 }
296 } else {
297 unicodeMode:
298 /*
299 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
300 * The base64 sequence ends with any character that is not in the base64 alphabet.
301 * A terminating minus sign is consumed.
302 *
303 * In Unicode Mode, the sourceIndex has the index to the start of the current
304 * base64 bytes, while nextSourceIndex is precisely parallel to source,
305 * keeping the index to the following byte.
306 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
307 */
308 while(source<sourceLimit) {
309 if(target<targetLimit) {
310 bytes[byteIndex++]=b=*source++;
311 ++nextSourceIndex;
312 if(b>=126) {
313 /* illegal - test other illegal US-ASCII values by base64Value==-3 */
314 inDirectMode=TRUE;
315 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
316 break;
317 } else if((base64Value=fromBase64[b])>=0) {
318 /* collect base64 bytes into UChars */
319 switch(base64Counter) {
320 case -1: /* -1 is immediately after the + */
321 case 0:
322 bits=base64Value;
323 base64Counter=1;
324 break;
325 case 1:
326 case 3:
327 case 4:
328 case 6:
329 bits=(uint16_t)((bits<<6)|base64Value);
330 ++base64Counter;
331 break;
332 case 2:
333 *target++=(UChar)((bits<<4)|(base64Value>>2));
334 if(offsets!=NULL) {
335 *offsets++=sourceIndex;
336 sourceIndex=nextSourceIndex-1;
337 }
338 bytes[0]=b; /* keep this byte in case an error occurs */
339 byteIndex=1;
340 bits=(uint16_t)(base64Value&3);
341 base64Counter=3;
342 break;
343 case 5:
344 *target++=(UChar)((bits<<2)|(base64Value>>4));
345 if(offsets!=NULL) {
346 *offsets++=sourceIndex;
347 sourceIndex=nextSourceIndex-1;
348 }
349 bytes[0]=b; /* keep this byte in case an error occurs */
350 byteIndex=1;
351 bits=(uint16_t)(base64Value&15);
352 base64Counter=6;
353 break;
354 case 7:
355 *target++=(UChar)((bits<<6)|base64Value);
356 if(offsets!=NULL) {
357 *offsets++=sourceIndex;
358 sourceIndex=nextSourceIndex;
359 }
360 byteIndex=0;
361 bits=0;
362 base64Counter=0;
363 break;
364 default:
365 /* will never occur */
366 break;
367 }
368 } else if(base64Value==-2) {
369 /* minus sign terminates the base64 sequence */
370 inDirectMode=TRUE;
371 if(base64Counter==-1) {
372 /* +- i.e. a minus immediately following a plus */
373 *target++=PLUS;
374 if(offsets!=NULL) {
375 *offsets++=sourceIndex-1;
376 }
377 } else {
378 /* absorb the minus and leave the Unicode Mode */
379 if(bits!=0) {
380 /* bits are illegally left over, a UChar is incomplete */
381 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
382 break;
383 }
384 }
385 sourceIndex=nextSourceIndex;
386 goto directMode;
387 } else if(base64Value==-1) /* for any legal character except base64 and minus sign */ {
388 /* leave the Unicode Mode */
389 inDirectMode=TRUE;
390 if(base64Counter==-1) {
391 /* illegal: + immediately followed by something other than base64 or minus sign */
392 /* include the plus sign in the reported sequence */
393 --sourceIndex;
394 bytes[0]=PLUS;
395 bytes[1]=b;
396 byteIndex=2;
397 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
398 break;
399 } else if(bits==0) {
400 /* un-read the character in case it is a plus sign */
401 --source;
402 sourceIndex=nextSourceIndex-1;
403 goto directMode;
404 } else {
405 /* bits are illegally left over, a UChar is incomplete */
406 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
407 break;
408 }
409 } else /* base64Value==-3 for illegal characters */ {
410 /* illegal */
411 inDirectMode=TRUE;
412 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
413 break;
414 }
415 } else {
416 /* target is full */
417 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
418 break;
419 }
420 }
421 }
422
423 if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
424 /*
425 * if we are in Unicode mode, then the byteIndex might not be 0,
426 * but that is ok if bits==0
427 * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
428 * (not true for IMAP-mailbox-name where we must end in direct mode)
429 */
430 byteIndex=0;
431 }
432
433 /* set the converter state back into UConverter */
434 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
435 cnv->toULength=byteIndex;
436
437 /* write back the updated pointers */
438 pArgs->source=(const char *)source;
439 pArgs->target=target;
440 pArgs->offsets=offsets;
441 return;
442 }
443
444 static void
445 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
446 UErrorCode *pErrorCode) {
447 UConverter *cnv;
448 const UChar *source, *sourceLimit;
449 uint8_t *target, *targetLimit;
450 int32_t *offsets;
451
452 int32_t length, targetCapacity, sourceIndex;
453 UChar c;
454
455 /* UTF-7 state */
456 const UBool *encodeDirectly;
457 uint8_t bits;
458 int8_t base64Counter;
459 UBool inDirectMode;
460
461 /* set up the local pointers */
462 cnv=pArgs->converter;
463
464 /* set up the local pointers */
465 source=pArgs->source;
466 sourceLimit=pArgs->sourceLimit;
467 target=(uint8_t *)pArgs->target;
468 targetLimit=(uint8_t *)pArgs->targetLimit;
469 offsets=pArgs->offsets;
470
471 /* get the state machine state */
472 {
473 uint32_t status=cnv->fromUnicodeStatus;
474 encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
475 inDirectMode=(UBool)((status>>24)&1);
476 base64Counter=(int8_t)(status>>16);
477 bits=(uint8_t)status;
478 }
479
480 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
481 sourceIndex=0;
482
483 if(inDirectMode) {
484 directMode:
485 length=sourceLimit-source;
486 targetCapacity=targetLimit-target;
487 if(length>targetCapacity) {
488 length=targetCapacity;
489 }
490 while(length>0) {
491 c=*source++;
492 /* currently always encode CR LF SP TAB directly */
493 if(c<=127 && encodeDirectly[c]) {
494 /* encode directly */
495 *target++=(uint8_t)c;
496 if(offsets!=NULL) {
497 *offsets++=sourceIndex++;
498 }
499 } else if(c==PLUS) {
500 /* output +- for + */
501 *target++=PLUS;
502 if(target<targetLimit) {
503 *target++=MINUS;
504 if(offsets!=NULL) {
505 *offsets++=sourceIndex;
506 *offsets++=sourceIndex++;
507 }
508 /* realign length and targetCapacity */
509 goto directMode;
510 } else {
511 if(offsets!=NULL) {
512 *offsets++=sourceIndex++;
513 }
514 cnv->charErrorBuffer[0]=MINUS;
515 cnv->charErrorBufferLength=1;
516 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
517 break;
518 }
519 } else {
520 /* un-read this character and switch to Unicode Mode */
521 --source;
522 *target++=PLUS;
523 if(offsets!=NULL) {
524 *offsets++=sourceIndex;
525 }
526 inDirectMode=FALSE;
527 base64Counter=0;
528 goto unicodeMode;
529 }
530 --length;
531 }
532 if(source<sourceLimit && target>=targetLimit) {
533 /* target is full */
534 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
535 }
536 } else {
537 unicodeMode:
538 while(source<sourceLimit) {
539 if(target<targetLimit) {
540 c=*source++;
541 if(c<=127 && encodeDirectly[c]) {
542 /* encode directly */
543 inDirectMode=TRUE;
544
545 /* trick: back out this character to make this easier */
546 --source;
547
548 /* terminate the base64 sequence */
549 if(base64Counter!=0) {
550 /* write remaining bits for the previous character */
551 *target++=toBase64[bits];
552 if(offsets!=NULL) {
553 *offsets++=sourceIndex-1;
554 }
555 }
556 if(fromBase64[c]!=-1) {
557 /* need to terminate with a minus */
558 if(target<targetLimit) {
559 *target++=MINUS;
560 if(offsets!=NULL) {
561 *offsets++=sourceIndex-1;
562 }
563 } else {
564 cnv->charErrorBuffer[0]=MINUS;
565 cnv->charErrorBufferLength=1;
566 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
567 break;
568 }
569 }
570 goto directMode;
571 } else {
572 /*
573 * base64 this character:
574 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
575 * and the bits of this character, each implicitly in UTF-16BE.
576 *
577 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
578 * character to the next. The actual 2 or 4 bits are shifted to the left edge
579 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
580 */
581 switch(base64Counter) {
582 case 0:
583 *target++=toBase64[c>>10];
584 if(target<targetLimit) {
585 *target++=toBase64[(c>>4)&0x3f];
586 if(offsets!=NULL) {
587 *offsets++=sourceIndex;
588 *offsets++=sourceIndex++;
589 }
590 } else {
591 if(offsets!=NULL) {
592 *offsets++=sourceIndex++;
593 }
594 cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
595 cnv->charErrorBufferLength=1;
596 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
597 }
598 bits=(uint8_t)((c&15)<<2);
599 base64Counter=1;
600 break;
601 case 1:
602 *target++=toBase64[bits|(c>>14)];
603 if(target<targetLimit) {
604 *target++=toBase64[(c>>8)&0x3f];
605 if(target<targetLimit) {
606 *target++=toBase64[(c>>2)&0x3f];
607 if(offsets!=NULL) {
608 *offsets++=sourceIndex;
609 *offsets++=sourceIndex;
610 *offsets++=sourceIndex++;
611 }
612 } else {
613 if(offsets!=NULL) {
614 *offsets++=sourceIndex;
615 *offsets++=sourceIndex++;
616 }
617 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
618 cnv->charErrorBufferLength=1;
619 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
620 }
621 } else {
622 if(offsets!=NULL) {
623 *offsets++=sourceIndex++;
624 }
625 cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
626 cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
627 cnv->charErrorBufferLength=2;
628 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
629 }
630 bits=(uint8_t)((c&3)<<4);
631 base64Counter=2;
632 break;
633 case 2:
634 *target++=toBase64[bits|(c>>12)];
635 if(target<targetLimit) {
636 *target++=toBase64[(c>>6)&0x3f];
637 if(target<targetLimit) {
638 *target++=toBase64[c&0x3f];
639 if(offsets!=NULL) {
640 *offsets++=sourceIndex;
641 *offsets++=sourceIndex;
642 *offsets++=sourceIndex++;
643 }
644 } else {
645 if(offsets!=NULL) {
646 *offsets++=sourceIndex;
647 *offsets++=sourceIndex++;
648 }
649 cnv->charErrorBuffer[0]=toBase64[c&0x3f];
650 cnv->charErrorBufferLength=1;
651 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
652 }
653 } else {
654 if(offsets!=NULL) {
655 *offsets++=sourceIndex++;
656 }
657 cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
658 cnv->charErrorBuffer[1]=toBase64[c&0x3f];
659 cnv->charErrorBufferLength=2;
660 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
661 }
662 bits=0;
663 base64Counter=0;
664 break;
665 default:
666 /* will never occur */
667 break;
668 }
669 }
670 } else {
671 /* target is full */
672 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
673 break;
674 }
675 }
676 }
677
678 if(pArgs->flush && source>=sourceLimit) {
679 /* flush remaining bits to the target */
680 if(!inDirectMode && base64Counter!=0) {
681 if(target<targetLimit) {
682 *target++=toBase64[bits];
683 if(offsets!=NULL) {
684 *offsets++=sourceIndex-1;
685 }
686 } else {
687 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
688 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
689 }
690 }
691 /* reset the state for the next conversion */
692 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
693 } else {
694 /* set the converter state back into UConverter */
695 cnv->fromUnicodeStatus=
696 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
697 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
698 }
699
700 /* write back the updated pointers */
701 pArgs->source=source;
702 pArgs->target=(char *)target;
703 pArgs->offsets=offsets;
704 return;
705 }
706
707 static const char *
708 _UTF7GetName(const UConverter *cnv) {
709 switch(cnv->fromUnicodeStatus>>28) {
710 case 1:
711 return "UTF-7,version=1";
712 default:
713 return "UTF-7";
714 }
715 }
716
717 static const UConverterImpl _UTF7Impl={
718 UCNV_UTF7,
719
720 NULL,
721 NULL,
722
723 _UTF7Open,
724 NULL,
725 _UTF7Reset,
726
727 _UTF7ToUnicodeWithOffsets,
728 _UTF7ToUnicodeWithOffsets,
729 _UTF7FromUnicodeWithOffsets,
730 _UTF7FromUnicodeWithOffsets,
731 NULL,
732
733 NULL,
734 _UTF7GetName,
735 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
736 NULL,
737 ucnv_getCompleteUnicodeSet
738 };
739
740 static const UConverterStaticData _UTF7StaticData={
741 sizeof(UConverterStaticData),
742 "UTF-7",
743 0, /* TODO CCSID for UTF-7 */
744 UCNV_IBM, UCNV_UTF7,
745 1, 4,
746 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
747 FALSE, FALSE,
748 0,
749 0,
750 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
751 };
752
753 const UConverterSharedData _UTF7Data={
754 sizeof(UConverterSharedData), ~((uint32_t)0),
755 NULL, NULL, &_UTF7StaticData, FALSE, &_UTF7Impl,
756 0
757 };
758
759 /* IMAP mailbox name encoding ----------------------------------------------- */
760
761 /*
762 * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
763 * http://www.ietf.org/rfc/rfc2060.txt
764 *
765 * 5.1.3. Mailbox International Naming Convention
766 *
767 * By convention, international mailbox names are specified using a
768 * modified version of the UTF-7 encoding described in [UTF-7]. The
769 * purpose of these modifications is to correct the following problems
770 * with UTF-7:
771 *
772 * 1) UTF-7 uses the "+" character for shifting; this conflicts with
773 * the common use of "+" in mailbox names, in particular USENET
774 * newsgroup names.
775 *
776 * 2) UTF-7's encoding is BASE64 which uses the "/" character; this
777 * conflicts with the use of "/" as a popular hierarchy delimiter.
778 *
779 * 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
780 * the use of "\" as a popular hierarchy delimiter.
781 *
782 * 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
783 * the use of "~" in some servers as a home directory indicator.
784 *
785 * 5) UTF-7 permits multiple alternate forms to represent the same
786 * string; in particular, printable US-ASCII chararacters can be
787 * represented in encoded form.
788 *
789 * In modified UTF-7, printable US-ASCII characters except for "&"
790 * represent themselves; that is, characters with octet values 0x20-0x25
791 * and 0x27-0x7e. The character "&" (0x26) is represented by the two-
792 * octet sequence "&-".
793 *
794 * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
795 * Unicode 16-bit octets) are represented in modified BASE64, with a
796 * further modification from [UTF-7] that "," is used instead of "/".
797 * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
798 * character which can represent itself.
799 *
800 * "&" is used to shift to modified BASE64 and "-" to shift back to US-
801 * ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that
802 * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
803 * ").
804 *
805 * For example, here is a mailbox name which mixes English, Japanese,
806 * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
807 */
808
809 /*
810 * Tests for US-ASCII characters belonging to character classes
811 * defined in UTF-7.
812 *
813 * Set D (directly encoded characters) consists of the following
814 * characters: the upper and lower case letters A through Z
815 * and a through z, the 10 digits 0-9, and the following nine special
816 * characters (note that "+" and "=" are omitted):
817 * '(),-./:?
818 *
819 * Set O (optional direct characters) consists of the following
820 * characters (note that "\" and "~" are omitted):
821 * !"#$%&*;<=>@[]^_`{|}
822 *
823 * According to the rules in RFC 2152, the byte values for the following
824 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
825 * - all C0 control codes except for CR LF TAB
826 * - BACKSLASH
827 * - TILDE
828 * - DEL
829 * - all codes beyond US-ASCII, i.e. all >127
830 */
831
/* uses '&' not '+' to start a base64 sequence */
#define AMPERSAND 0x26
#define COMMA 0x2c
#define SLASH 0x2f

/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/*
 * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
 * (the argument is parenthesized everywhere so that expressions such as a|b
 * are compared as a whole)
 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

/*
 * modified base64 for IMAP: digit 63 is ',' instead of '/'
 * TO_BASE64_IMAP maps a 6-bit value to its digit; FROM_BASE64_IMAP maps a
 * US-ASCII byte to its digit value, treating '/' as a non-digit (-1) and
 * otherwise returning the fromBase64 classification.
 * Both macros evaluate their argument more than once.
 */
#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
845
846 /*
847 * converter status values:
848 *
849 * toUnicodeStatus:
850 * 24 inDirectMode (boolean)
851 * 23..16 base64Counter (-1..7)
852 * 15..0 bits (up to 14 bits incoming base64)
853 *
854 * fromUnicodeStatus:
855 * 24 inDirectMode (boolean)
856 * 23..16 base64Counter (0..2)
857 * 7..0 bits (6 bits outgoing base64)
858 *
859 * ignore bits 31..25
860 */
861
862 static void
863 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
864 UErrorCode *pErrorCode) {
865 UConverter *cnv;
866 const uint8_t *source, *sourceLimit;
867 UChar *target;
868 const UChar *targetLimit;
869 int32_t *offsets;
870
871 uint8_t *bytes;
872 uint8_t byteIndex;
873
874 int32_t length, targetCapacity;
875
876 /* UTF-7 state */
877 uint16_t bits;
878 int8_t base64Counter;
879 UBool inDirectMode;
880
881 int8_t base64Value;
882
883 int32_t sourceIndex, nextSourceIndex;
884
885 UChar c;
886 uint8_t b;
887
888 /* set up the local pointers */
889 cnv=pArgs->converter;
890
891 source=(const uint8_t *)pArgs->source;
892 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
893 target=pArgs->target;
894 targetLimit=pArgs->targetLimit;
895 offsets=pArgs->offsets;
896 /* get the state machine state */
897 {
898 uint32_t status=cnv->toUnicodeStatus;
899 inDirectMode=(UBool)((status>>24)&1);
900 base64Counter=(int8_t)(status>>16);
901 bits=(uint16_t)status;
902 }
903 bytes=cnv->toUBytes;
904 byteIndex=cnv->toULength;
905
906 /* sourceIndex=-1 if the current character began in the previous buffer */
907 sourceIndex=byteIndex==0 ? 0 : -1;
908 nextSourceIndex=0;
909
910 if(inDirectMode) {
911 directMode:
912 /*
913 * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
914 * with their US-ASCII byte values.
915 * An ampersand starts Unicode (or "escape") Mode.
916 *
917 * In Direct Mode, only the sourceIndex is used.
918 */
919 byteIndex=0;
920 length=sourceLimit-source;
921 targetCapacity=targetLimit-target;
922 if(length>targetCapacity) {
923 length=targetCapacity;
924 }
925 while(length>0) {
926 b=*source++;
927 if(!isLegalIMAP(b)) {
928 /* illegal */
929 bytes[0]=b;
930 byteIndex=1;
931 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
932 break;
933 } else if(b!=AMPERSAND) {
934 /* write directly encoded character */
935 *target++=b;
936 if(offsets!=NULL) {
937 *offsets++=sourceIndex++;
938 }
939 } else /* AMPERSAND */ {
940 /* switch to Unicode mode */
941 nextSourceIndex=++sourceIndex;
942 inDirectMode=FALSE;
943 byteIndex=0;
944 bits=0;
945 base64Counter=-1;
946 goto unicodeMode;
947 }
948 --length;
949 }
950 if(source<sourceLimit && target>=targetLimit) {
951 /* target is full */
952 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
953 }
954 } else {
955 unicodeMode:
956 /*
957 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
958 * The base64 sequence ends with any character that is not in the base64 alphabet.
959 * A terminating minus sign is consumed.
960 * US-ASCII must not be base64-ed.
961 *
962 * In Unicode Mode, the sourceIndex has the index to the start of the current
963 * base64 bytes, while nextSourceIndex is precisely parallel to source,
964 * keeping the index to the following byte.
965 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
966 */
967 while(source<sourceLimit) {
968 if(target<targetLimit) {
969 bytes[byteIndex++]=b=*source++;
970 ++nextSourceIndex;
971 if(b>0x7e) {
972 /* illegal - test other illegal US-ASCII values by base64Value==-3 */
973 inDirectMode=TRUE;
974 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
975 break;
976 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
977 /* collect base64 bytes into UChars */
978 switch(base64Counter) {
979 case -1: /* -1 is immediately after the & */
980 case 0:
981 bits=base64Value;
982 base64Counter=1;
983 break;
984 case 1:
985 case 3:
986 case 4:
987 case 6:
988 bits=(uint16_t)((bits<<6)|base64Value);
989 ++base64Counter;
990 break;
991 case 2:
992 c=(UChar)((bits<<4)|(base64Value>>2));
993 if(isLegalIMAP(c)) {
994 /* illegal */
995 inDirectMode=TRUE;
996 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
997 goto endloop;
998 }
999 *target++=c;
1000 if(offsets!=NULL) {
1001 *offsets++=sourceIndex;
1002 sourceIndex=nextSourceIndex-1;
1003 }
1004 bytes[0]=b; /* keep this byte in case an error occurs */
1005 byteIndex=1;
1006 bits=(uint16_t)(base64Value&3);
1007 base64Counter=3;
1008 break;
1009 case 5:
1010 c=(UChar)((bits<<2)|(base64Value>>4));
1011 if(isLegalIMAP(c)) {
1012 /* illegal */
1013 inDirectMode=TRUE;
1014 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1015 goto endloop;
1016 }
1017 *target++=c;
1018 if(offsets!=NULL) {
1019 *offsets++=sourceIndex;
1020 sourceIndex=nextSourceIndex-1;
1021 }
1022 bytes[0]=b; /* keep this byte in case an error occurs */
1023 byteIndex=1;
1024 bits=(uint16_t)(base64Value&15);
1025 base64Counter=6;
1026 break;
1027 case 7:
1028 c=(UChar)((bits<<6)|base64Value);
1029 if(isLegalIMAP(c)) {
1030 /* illegal */
1031 inDirectMode=TRUE;
1032 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1033 goto endloop;
1034 }
1035 *target++=c;
1036 if(offsets!=NULL) {
1037 *offsets++=sourceIndex;
1038 sourceIndex=nextSourceIndex;
1039 }
1040 byteIndex=0;
1041 bits=0;
1042 base64Counter=0;
1043 break;
1044 default:
1045 /* will never occur */
1046 break;
1047 }
1048 } else if(base64Value==-2) {
1049 /* minus sign terminates the base64 sequence */
1050 inDirectMode=TRUE;
1051 if(base64Counter==-1) {
1052 /* &- i.e. a minus immediately following an ampersand */
1053 *target++=AMPERSAND;
1054 if(offsets!=NULL) {
1055 *offsets++=sourceIndex-1;
1056 }
1057 } else {
1058 /* absorb the minus and leave the Unicode Mode */
1059 if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1060 /* bits are illegally left over, a UChar is incomplete */
1061 /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1062 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1063 break;
1064 }
1065 }
1066 sourceIndex=nextSourceIndex;
1067 goto directMode;
1068 } else {
1069 if(base64Counter==-1) {
1070 /* illegal: & immediately followed by something other than base64 or minus sign */
1071 /* include the ampersand in the reported sequence */
1072 --sourceIndex;
1073 bytes[0]=AMPERSAND;
1074 bytes[1]=b;
1075 byteIndex=2;
1076 }
1077 /* base64Value==-1 for characters that are illegal only in Unicode mode */
1078 /* base64Value==-3 for illegal characters */
1079 /* illegal */
1080 inDirectMode=TRUE;
1081 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1082 break;
1083 }
1084 } else {
1085 /* target is full */
1086 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1087 break;
1088 }
1089 }
1090 }
1091 endloop:
1092
1093 /*
1094 * the end of the input stream and detection of truncated input
1095 * are handled by the framework, but here we must check if we are in Unicode
1096 * mode and byteIndex==0 because we must end in direct mode
1097 *
1098 * conditions:
1099 * successful
1100 * in Unicode mode and byteIndex==0
1101 * end of input and no truncated input
1102 */
1103 if( U_SUCCESS(*pErrorCode) &&
1104 !inDirectMode && byteIndex==0 &&
1105 pArgs->flush && source>=sourceLimit
1106 ) {
1107 if(base64Counter==-1) {
1108 /* & at the very end of the input */
1109 /* make the ampersand the reported sequence */
1110 bytes[0]=AMPERSAND;
1111 byteIndex=1;
1112 }
1113 /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1114
1115 inDirectMode=TRUE; /* avoid looping */
1116 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1117 }
1118
1119 /* set the converter state back into UConverter */
1120 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1121 cnv->toULength=byteIndex;
1122
1123 /* write back the updated pointers */
1124 pArgs->source=(const char *)source;
1125 pArgs->target=target;
1126 pArgs->offsets=offsets;
1127 return;
1128 }
1129
1130 static void
1131 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1132 UErrorCode *pErrorCode) {
1133 UConverter *cnv;
1134 const UChar *source, *sourceLimit;
1135 uint8_t *target, *targetLimit;
1136 int32_t *offsets;
1137
1138 int32_t length, targetCapacity, sourceIndex;
1139 UChar c;
1140 uint8_t b;
1141
1142 /* UTF-7 state */
1143 uint8_t bits;
1144 int8_t base64Counter;
1145 UBool inDirectMode;
1146
1147 /* set up the local pointers */
1148 cnv=pArgs->converter;
1149
1150 /* set up the local pointers */
1151 source=pArgs->source;
1152 sourceLimit=pArgs->sourceLimit;
1153 target=(uint8_t *)pArgs->target;
1154 targetLimit=(uint8_t *)pArgs->targetLimit;
1155 offsets=pArgs->offsets;
1156
1157 /* get the state machine state */
1158 {
1159 uint32_t status=cnv->fromUnicodeStatus;
1160 inDirectMode=(UBool)((status>>24)&1);
1161 base64Counter=(int8_t)(status>>16);
1162 bits=(uint8_t)status;
1163 }
1164
1165 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1166 sourceIndex=0;
1167
1168 if(inDirectMode) {
1169 directMode:
1170 length=sourceLimit-source;
1171 targetCapacity=targetLimit-target;
1172 if(length>targetCapacity) {
1173 length=targetCapacity;
1174 }
1175 while(length>0) {
1176 c=*source++;
1177 /* encode 0x20..0x7e except '&' directly */
1178 if(inSetDIMAP(c)) {
1179 /* encode directly */
1180 *target++=(uint8_t)c;
1181 if(offsets!=NULL) {
1182 *offsets++=sourceIndex++;
1183 }
1184 } else if(c==AMPERSAND) {
1185 /* output &- for & */
1186 *target++=AMPERSAND;
1187 if(target<targetLimit) {
1188 *target++=MINUS;
1189 if(offsets!=NULL) {
1190 *offsets++=sourceIndex;
1191 *offsets++=sourceIndex++;
1192 }
1193 /* realign length and targetCapacity */
1194 goto directMode;
1195 } else {
1196 if(offsets!=NULL) {
1197 *offsets++=sourceIndex++;
1198 }
1199 cnv->charErrorBuffer[0]=MINUS;
1200 cnv->charErrorBufferLength=1;
1201 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1202 break;
1203 }
1204 } else {
1205 /* un-read this character and switch to Unicode Mode */
1206 --source;
1207 *target++=AMPERSAND;
1208 if(offsets!=NULL) {
1209 *offsets++=sourceIndex;
1210 }
1211 inDirectMode=FALSE;
1212 base64Counter=0;
1213 goto unicodeMode;
1214 }
1215 --length;
1216 }
1217 if(source<sourceLimit && target>=targetLimit) {
1218 /* target is full */
1219 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1220 }
1221 } else {
1222 unicodeMode:
1223 while(source<sourceLimit) {
1224 if(target<targetLimit) {
1225 c=*source++;
1226 if(isLegalIMAP(c)) {
1227 /* encode directly */
1228 inDirectMode=TRUE;
1229
1230 /* trick: back out this character to make this easier */
1231 --source;
1232
1233 /* terminate the base64 sequence */
1234 if(base64Counter!=0) {
1235 /* write remaining bits for the previous character */
1236 *target++=TO_BASE64_IMAP(bits);
1237 if(offsets!=NULL) {
1238 *offsets++=sourceIndex-1;
1239 }
1240 }
1241 /* need to terminate with a minus */
1242 if(target<targetLimit) {
1243 *target++=MINUS;
1244 if(offsets!=NULL) {
1245 *offsets++=sourceIndex-1;
1246 }
1247 } else {
1248 cnv->charErrorBuffer[0]=MINUS;
1249 cnv->charErrorBufferLength=1;
1250 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1251 break;
1252 }
1253 goto directMode;
1254 } else {
1255 /*
1256 * base64 this character:
1257 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1258 * and the bits of this character, each implicitly in UTF-16BE.
1259 *
1260 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1261 * character to the next. The actual 2 or 4 bits are shifted to the left edge
1262 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1263 */
1264 switch(base64Counter) {
1265 case 0:
1266 b=(uint8_t)(c>>10);
1267 *target++=TO_BASE64_IMAP(b);
1268 if(target<targetLimit) {
1269 b=(uint8_t)((c>>4)&0x3f);
1270 *target++=TO_BASE64_IMAP(b);
1271 if(offsets!=NULL) {
1272 *offsets++=sourceIndex;
1273 *offsets++=sourceIndex++;
1274 }
1275 } else {
1276 if(offsets!=NULL) {
1277 *offsets++=sourceIndex++;
1278 }
1279 b=(uint8_t)((c>>4)&0x3f);
1280 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1281 cnv->charErrorBufferLength=1;
1282 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1283 }
1284 bits=(uint8_t)((c&15)<<2);
1285 base64Counter=1;
1286 break;
1287 case 1:
1288 b=(uint8_t)(bits|(c>>14));
1289 *target++=TO_BASE64_IMAP(b);
1290 if(target<targetLimit) {
1291 b=(uint8_t)((c>>8)&0x3f);
1292 *target++=TO_BASE64_IMAP(b);
1293 if(target<targetLimit) {
1294 b=(uint8_t)((c>>2)&0x3f);
1295 *target++=TO_BASE64_IMAP(b);
1296 if(offsets!=NULL) {
1297 *offsets++=sourceIndex;
1298 *offsets++=sourceIndex;
1299 *offsets++=sourceIndex++;
1300 }
1301 } else {
1302 if(offsets!=NULL) {
1303 *offsets++=sourceIndex;
1304 *offsets++=sourceIndex++;
1305 }
1306 b=(uint8_t)((c>>2)&0x3f);
1307 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1308 cnv->charErrorBufferLength=1;
1309 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1310 }
1311 } else {
1312 if(offsets!=NULL) {
1313 *offsets++=sourceIndex++;
1314 }
1315 b=(uint8_t)((c>>8)&0x3f);
1316 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1317 b=(uint8_t)((c>>2)&0x3f);
1318 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1319 cnv->charErrorBufferLength=2;
1320 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1321 }
1322 bits=(uint8_t)((c&3)<<4);
1323 base64Counter=2;
1324 break;
1325 case 2:
1326 b=(uint8_t)(bits|(c>>12));
1327 *target++=TO_BASE64_IMAP(b);
1328 if(target<targetLimit) {
1329 b=(uint8_t)((c>>6)&0x3f);
1330 *target++=TO_BASE64_IMAP(b);
1331 if(target<targetLimit) {
1332 b=(uint8_t)(c&0x3f);
1333 *target++=TO_BASE64_IMAP(b);
1334 if(offsets!=NULL) {
1335 *offsets++=sourceIndex;
1336 *offsets++=sourceIndex;
1337 *offsets++=sourceIndex++;
1338 }
1339 } else {
1340 if(offsets!=NULL) {
1341 *offsets++=sourceIndex;
1342 *offsets++=sourceIndex++;
1343 }
1344 b=(uint8_t)(c&0x3f);
1345 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1346 cnv->charErrorBufferLength=1;
1347 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1348 }
1349 } else {
1350 if(offsets!=NULL) {
1351 *offsets++=sourceIndex++;
1352 }
1353 b=(uint8_t)((c>>6)&0x3f);
1354 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1355 b=(uint8_t)(c&0x3f);
1356 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1357 cnv->charErrorBufferLength=2;
1358 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1359 }
1360 bits=0;
1361 base64Counter=0;
1362 break;
1363 default:
1364 /* will never occur */
1365 break;
1366 }
1367 }
1368 } else {
1369 /* target is full */
1370 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1371 break;
1372 }
1373 }
1374 }
1375
1376 if(pArgs->flush && source>=sourceLimit) {
1377 /* flush remaining bits to the target */
1378 if(!inDirectMode) {
1379 if(base64Counter!=0) {
1380 if(target<targetLimit) {
1381 *target++=TO_BASE64_IMAP(bits);
1382 if(offsets!=NULL) {
1383 *offsets++=sourceIndex-1;
1384 }
1385 } else {
1386 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1387 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1388 }
1389 }
1390 /* need to terminate with a minus */
1391 if(target<targetLimit) {
1392 *target++=MINUS;
1393 if(offsets!=NULL) {
1394 *offsets++=sourceIndex-1;
1395 }
1396 } else {
1397 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1398 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1399 }
1400 }
1401 /* reset the state for the next conversion */
1402 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1403 } else {
1404 /* set the converter state back into UConverter */
1405 cnv->fromUnicodeStatus=
1406 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
1407 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1408 }
1409
1410 /* write back the updated pointers */
1411 pArgs->source=source;
1412 pArgs->target=(char *)target;
1413 pArgs->offsets=offsets;
1414 return;
1415 }
1416
1417 static const UConverterImpl _IMAPImpl={
1418 UCNV_IMAP_MAILBOX,
1419
1420 NULL,
1421 NULL,
1422
1423 _UTF7Open,
1424 NULL,
1425 _UTF7Reset,
1426
1427 _IMAPToUnicodeWithOffsets,
1428 _IMAPToUnicodeWithOffsets,
1429 _IMAPFromUnicodeWithOffsets,
1430 _IMAPFromUnicodeWithOffsets,
1431 NULL,
1432
1433 NULL,
1434 NULL,
1435 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1436 NULL,
1437 ucnv_getCompleteUnicodeSet
1438 };
1439
1440 static const UConverterStaticData _IMAPStaticData={
1441 sizeof(UConverterStaticData),
1442 "IMAP-mailbox-name",
1443 0, /* TODO CCSID for IMAP-mailbox-name */
1444 UCNV_IBM, UCNV_IMAP_MAILBOX,
1445 1, 4,
1446 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1447 FALSE, FALSE,
1448 0,
1449 0,
1450 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1451 };
1452
1453 const UConverterSharedData _IMAPData={
1454 sizeof(UConverterSharedData), ~((uint32_t)0),
1455 NULL, NULL, &_IMAPStaticData, FALSE, &_IMAPImpl,
1456 0
1457 };
1458
1459 #endif