2 **********************************************************************
3 * Copyright (C) 2002-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 * tab size: 8 (not used)
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
14 * UTF-7 converter implementation. Used to be in ucnv_utf.c.
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_CONVERSION
21 #include "unicode/ucnv.h"
25 /* UTF-7 -------------------------------------------------------------------- */
28 * UTF-7 is a stateful encoding of Unicode.
29 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
30 * It was intended for use in Internet email systems, using in its bytewise
31 * encoding only a subset of 7-bit US-ASCII.
32 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
35 * For converting Unicode to UTF-7, the RFC allows encoding some US-ASCII
36 * characters directly or in base64. In particular, the characters in set O
37 * as defined in the RFC (see below) may be encoded directly but are not
38 * allowed in, e.g., email headers.
39 * By default, the ICU UTF-7 converter encodes set O directly.
40 * By choosing the option "version=1", set O will be escaped instead.
42 * utf7Converter=ucnv_open("UTF-7,version=1");
44 * For details about email headers see RFC 2047.
48 * Tests for US-ASCII characters belonging to character classes
51 * Set D (directly encoded characters) consists of the following
52 * characters: the upper and lower case letters A through Z
53 * and a through z, the 10 digits 0-9, and the following nine special
54 * characters (note that "+" and "=" are omitted):
57 * Set O (optional direct characters) consists of the following
58 * characters (note that "\" and "~" are omitted):
59 * !"#$%&*;<=>@[]^_`{|}
61 * According to the rules in RFC 2152, the byte values for the following
62 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
63 * - all C0 control codes except for CR LF TAB
67 * - all codes beyond US-ASCII, i.e. all >127
70 ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \
71 (uint8_t)((c)-48)<10 || /* digits */ \
72 (uint8_t)((c)-39)<3 || /* '() */ \
73 (uint8_t)((c)-44)<4 || /* ,-./ */ \
74 (c)==58 || (c)==63 /* :? */ \
78 ((uint8_t)((c)-33)<6 || /* !"#$%& */ \
79 (uint8_t)((c)-59)<4 || /* ;<=> */ \
80 (uint8_t)((c)-93)<4 || /* ]^_` */ \
81 (uint8_t)((c)-123)<3 || /* {|} */ \
82 (c)==42 || (c)==64 || (c)==91 /* *@[ */ \
/* true for CR (13), LF (10) and TAB (9) */
85 #define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9)
/* true for SP (32) in addition to CR, LF and TAB */
86 #define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9)
93 /* legal byte values: all US-ASCII graphic characters from space to before tilde except backslash, and CR LF TAB */
/* for byte input, (c)-32 cast to uint8_t wraps for values below 32, so the single <94 test accepts exactly 32..125 */
94 #define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
96 /* encode directly sets D and O and CR LF SP TAB */
97 static const UBool encodeDirectlyMaximum
[128]={
98 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
99 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
102 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
103 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
105 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
106 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
108 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
109 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
112 /* encode directly set D and CR LF SP TAB but not set O */
113 static const UBool encodeDirectlyRestricted
[128]={
114 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
115 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
118 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
119 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
121 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
122 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
124 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
125 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
131 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
132 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
134 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
135 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
137 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
144 /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
145 -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
146 -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
148 /* general punctuation with + and / and a special value (-2) for - */
149 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
151 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
154 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
155 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,
158 -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
159 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
163 * converter status values:
166 * 24 inDirectMode (boolean)
167 * 23..16 base64Counter (-1..7)
168 * 15..0 bits (up to 14 bits incoming base64)
171 * 31..28 version (0: set O direct 1: set O escaped)
172 * 24 inDirectMode (boolean)
173 * 23..16 base64Counter (0..2)
174 * 7..0 bits (6 bits outgoing base64)
179 _UTF7Reset(UConverter
*cnv
, UConverterResetChoice choice
) {
180 if(choice
<=UCNV_RESET_TO_UNICODE
) {
181 /* reset toUnicode */
182 cnv
->toUnicodeStatus
=0x1000000; /* inDirectMode=TRUE */
185 if(choice
!=UCNV_RESET_TO_UNICODE
) {
186 /* reset fromUnicode */
187 cnv
->fromUnicodeStatus
=(cnv
->fromUnicodeStatus
&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
192 _UTF7Open(UConverter
*cnv
,
193 UConverterLoadArgs
*pArgs
,
194 UErrorCode
*pErrorCode
) {
195 if(UCNV_GET_VERSION(cnv
)<=1) {
196 /* TODO(markus): Should just use cnv->options rather than copying the version number. */
197 cnv
->fromUnicodeStatus
=UCNV_GET_VERSION(cnv
)<<28;
198 _UTF7Reset(cnv
, UCNV_RESET_BOTH
);
200 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
205 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
206 UErrorCode
*pErrorCode
) {
208 const uint8_t *source
, *sourceLimit
;
210 const UChar
*targetLimit
;
216 int32_t length
, targetCapacity
;
220 int8_t base64Counter
;
225 int32_t sourceIndex
, nextSourceIndex
;
228 /* set up the local pointers */
229 cnv
=pArgs
->converter
;
231 source
=(const uint8_t *)pArgs
->source
;
232 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
233 target
=pArgs
->target
;
234 targetLimit
=pArgs
->targetLimit
;
235 offsets
=pArgs
->offsets
;
236 /* get the state machine state */
238 uint32_t status
=cnv
->toUnicodeStatus
;
239 inDirectMode
=(UBool
)((status
>>24)&1);
240 base64Counter
=(int8_t)(status
>>16);
241 bits
=(uint16_t)status
;
244 byteIndex
=cnv
->toULength
;
246 /* sourceIndex=-1 if the current character began in the previous buffer */
247 sourceIndex
=byteIndex
==0 ? 0 : -1;
253 * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
254 * with their US-ASCII byte values.
255 * Backslash and Tilde and most control characters are not allowed in UTF-7.
256 * A plus sign starts Unicode (or "escape") Mode.
258 * In Direct Mode, only the sourceIndex is used.
261 length
=(int32_t)(sourceLimit
-source
);
262 targetCapacity
=(int32_t)(targetLimit
-target
);
263 if(length
>targetCapacity
) {
264 length
=targetCapacity
;
268 if(!isLegalUTF7(b
)) {
272 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
275 /* write directly encoded character */
278 *offsets
++=sourceIndex
++;
281 /* switch to Unicode mode */
282 nextSourceIndex
=++sourceIndex
;
291 if(source
<sourceLimit
&& target
>=targetLimit
) {
293 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
298 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
299 * The base64 sequence ends with any character that is not in the base64 alphabet.
300 * A terminating minus sign is consumed.
302 * In Unicode Mode, the sourceIndex has the index to the start of the current
303 * base64 bytes, while nextSourceIndex is precisely parallel to source,
304 * keeping the index to the following byte.
305 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
307 while(source
<sourceLimit
) {
308 if(target
<targetLimit
) {
309 bytes
[byteIndex
++]=b
=*source
++;
311 base64Value
= -3; /* initialize as illegal */
312 if(b
>=126 || (base64Value
=fromBase64
[b
])==-3 || base64Value
==-1) {
314 * base64Value==-1 for any legal character except base64 and minus sign, or
315 * base64Value==-3 for illegal characters:
316 * 1. In either case, leave Unicode mode.
317 * 2.1. If we ended with an incomplete UChar or none after the +, then
318 * generate an error for the preceding erroneous sequence and deal with
319 * the current (possibly illegal) character next time through.
320 * 2.2. Else the current char comes after a complete UChar, which was already
321 * pushed to the output buf, so:
322 * 2.2.1. If the current char is legal, just save it for processing next time.
323 * It may be for example, a plus which we need to deal with in direct mode.
324 * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
327 if(base64Counter
==-1) {
328 /* illegal: + immediately followed by something other than base64 or minus sign */
329 /* include the plus sign in the reported sequence, but not the subsequent char */
333 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
336 /* bits are illegally left over, a UChar is incomplete */
337 /* don't include current char (legal or illegal) in error seq */
340 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
343 /* previous UChar was complete */
344 if (base64Value
==-3) {
345 /* current character is illegal, deal with it here */
346 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
349 /* un-read the current character in case it is a plus sign */
351 sourceIndex
=nextSourceIndex
-1;
355 } else if(base64Value
>=0) {
356 /* collect base64 bytes into UChars */
357 switch(base64Counter
) {
358 case -1: /* -1 is immediately after the + */
367 bits
=(uint16_t)((bits
<<6)|base64Value
);
371 *target
++=(UChar
)((bits
<<4)|(base64Value
>>2));
373 *offsets
++=sourceIndex
;
374 sourceIndex
=nextSourceIndex
-1;
376 bytes
[0]=b
; /* keep this byte in case an error occurs */
378 bits
=(uint16_t)(base64Value
&3);
382 *target
++=(UChar
)((bits
<<2)|(base64Value
>>4));
384 *offsets
++=sourceIndex
;
385 sourceIndex
=nextSourceIndex
-1;
387 bytes
[0]=b
; /* keep this byte in case an error occurs */
389 bits
=(uint16_t)(base64Value
&15);
393 *target
++=(UChar
)((bits
<<6)|base64Value
);
395 *offsets
++=sourceIndex
;
396 sourceIndex
=nextSourceIndex
;
403 /* will never occur */
406 } else /*base64Value==-2*/ {
407 /* minus sign terminates the base64 sequence */
409 if(base64Counter
==-1) {
410 /* +- i.e. a minus immediately following a plus */
413 *offsets
++=sourceIndex
-1;
416 /* absorb the minus and leave the Unicode Mode */
418 /* bits are illegally left over, a UChar is incomplete */
419 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
423 sourceIndex
=nextSourceIndex
;
428 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
434 if(U_SUCCESS(*pErrorCode
) && pArgs
->flush
&& source
==sourceLimit
&& bits
==0) {
436 * if we are in Unicode mode, then the byteIndex might not be 0,
437 * but that is ok if bits==0
438 * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
439 * (not true for IMAP-mailbox-name where we must end in direct mode)
444 /* set the converter state back into UConverter */
445 cnv
->toUnicodeStatus
=((uint32_t)inDirectMode
<<24)|((uint32_t)((uint8_t)base64Counter
)<<16)|(uint32_t)bits
;
446 cnv
->toULength
=byteIndex
;
448 /* write back the updated pointers */
449 pArgs
->source
=(const char *)source
;
450 pArgs
->target
=target
;
451 pArgs
->offsets
=offsets
;
456 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
457 UErrorCode
*pErrorCode
) {
459 const UChar
*source
, *sourceLimit
;
460 uint8_t *target
, *targetLimit
;
463 int32_t length
, targetCapacity
, sourceIndex
;
467 const UBool
*encodeDirectly
;
469 int8_t base64Counter
;
472 /* set up the local pointers */
473 cnv
=pArgs
->converter
;
475 /* set up the local pointers */
476 source
=pArgs
->source
;
477 sourceLimit
=pArgs
->sourceLimit
;
478 target
=(uint8_t *)pArgs
->target
;
479 targetLimit
=(uint8_t *)pArgs
->targetLimit
;
480 offsets
=pArgs
->offsets
;
482 /* get the state machine state */
484 uint32_t status
=cnv
->fromUnicodeStatus
;
485 encodeDirectly
= status
<0x10000000 ? encodeDirectlyMaximum
: encodeDirectlyRestricted
;
486 inDirectMode
=(UBool
)((status
>>24)&1);
487 base64Counter
=(int8_t)(status
>>16);
488 bits
=(uint8_t)status
;
491 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
496 length
=(int32_t)(sourceLimit
-source
);
497 targetCapacity
=(int32_t)(targetLimit
-target
);
498 if(length
>targetCapacity
) {
499 length
=targetCapacity
;
503 /* currently always encode CR LF SP TAB directly */
504 if(c
<=127 && encodeDirectly
[c
]) {
505 /* encode directly */
506 *target
++=(uint8_t)c
;
508 *offsets
++=sourceIndex
++;
511 /* output +- for + */
513 if(target
<targetLimit
) {
516 *offsets
++=sourceIndex
;
517 *offsets
++=sourceIndex
++;
519 /* realign length and targetCapacity */
523 *offsets
++=sourceIndex
++;
525 cnv
->charErrorBuffer
[0]=MINUS
;
526 cnv
->charErrorBufferLength
=1;
527 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
531 /* un-read this character and switch to Unicode Mode */
535 *offsets
++=sourceIndex
;
543 if(source
<sourceLimit
&& target
>=targetLimit
) {
545 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
549 while(source
<sourceLimit
) {
550 if(target
<targetLimit
) {
552 if(c
<=127 && encodeDirectly
[c
]) {
553 /* encode directly */
556 /* trick: back out this character to make this easier */
559 /* terminate the base64 sequence */
560 if(base64Counter
!=0) {
561 /* write remaining bits for the previous character */
562 *target
++=toBase64
[bits
];
564 *offsets
++=sourceIndex
-1;
567 if(fromBase64
[c
]!=-1) {
568 /* need to terminate with a minus */
569 if(target
<targetLimit
) {
572 *offsets
++=sourceIndex
-1;
575 cnv
->charErrorBuffer
[0]=MINUS
;
576 cnv
->charErrorBufferLength
=1;
577 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
584 * base64 this character:
585 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
586 * and the bits of this character, each implicitly in UTF-16BE.
588 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
589 * character to the next. The actual 2 or 4 bits are shifted to the left edge
590 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
592 switch(base64Counter
) {
594 *target
++=toBase64
[c
>>10];
595 if(target
<targetLimit
) {
596 *target
++=toBase64
[(c
>>4)&0x3f];
598 *offsets
++=sourceIndex
;
599 *offsets
++=sourceIndex
++;
603 *offsets
++=sourceIndex
++;
605 cnv
->charErrorBuffer
[0]=toBase64
[(c
>>4)&0x3f];
606 cnv
->charErrorBufferLength
=1;
607 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
609 bits
=(uint8_t)((c
&15)<<2);
613 *target
++=toBase64
[bits
|(c
>>14)];
614 if(target
<targetLimit
) {
615 *target
++=toBase64
[(c
>>8)&0x3f];
616 if(target
<targetLimit
) {
617 *target
++=toBase64
[(c
>>2)&0x3f];
619 *offsets
++=sourceIndex
;
620 *offsets
++=sourceIndex
;
621 *offsets
++=sourceIndex
++;
625 *offsets
++=sourceIndex
;
626 *offsets
++=sourceIndex
++;
628 cnv
->charErrorBuffer
[0]=toBase64
[(c
>>2)&0x3f];
629 cnv
->charErrorBufferLength
=1;
630 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
634 *offsets
++=sourceIndex
++;
636 cnv
->charErrorBuffer
[0]=toBase64
[(c
>>8)&0x3f];
637 cnv
->charErrorBuffer
[1]=toBase64
[(c
>>2)&0x3f];
638 cnv
->charErrorBufferLength
=2;
639 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
641 bits
=(uint8_t)((c
&3)<<4);
645 *target
++=toBase64
[bits
|(c
>>12)];
646 if(target
<targetLimit
) {
647 *target
++=toBase64
[(c
>>6)&0x3f];
648 if(target
<targetLimit
) {
649 *target
++=toBase64
[c
&0x3f];
651 *offsets
++=sourceIndex
;
652 *offsets
++=sourceIndex
;
653 *offsets
++=sourceIndex
++;
657 *offsets
++=sourceIndex
;
658 *offsets
++=sourceIndex
++;
660 cnv
->charErrorBuffer
[0]=toBase64
[c
&0x3f];
661 cnv
->charErrorBufferLength
=1;
662 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
666 *offsets
++=sourceIndex
++;
668 cnv
->charErrorBuffer
[0]=toBase64
[(c
>>6)&0x3f];
669 cnv
->charErrorBuffer
[1]=toBase64
[c
&0x3f];
670 cnv
->charErrorBufferLength
=2;
671 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
677 /* will never occur */
683 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
689 if(pArgs
->flush
&& source
>=sourceLimit
) {
690 /* flush remaining bits to the target */
691 if(!inDirectMode
&& base64Counter
!=0) {
692 if(target
<targetLimit
) {
693 *target
++=toBase64
[bits
];
695 *offsets
++=sourceIndex
-1;
698 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++]=toBase64
[bits
];
699 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
702 /* reset the state for the next conversion */
703 cnv
->fromUnicodeStatus
=(cnv
->fromUnicodeStatus
&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
705 /* set the converter state back into UConverter */
706 cnv
->fromUnicodeStatus
=
707 (cnv
->fromUnicodeStatus
&0xf0000000)| /* keep version*/
708 ((uint32_t)inDirectMode
<<24)|((uint32_t)base64Counter
<<16)|(uint32_t)bits
;
711 /* write back the updated pointers */
712 pArgs
->source
=source
;
713 pArgs
->target
=(char *)target
;
714 pArgs
->offsets
=offsets
;
719 _UTF7GetName(const UConverter
*cnv
) {
720 switch(cnv
->fromUnicodeStatus
>>28) {
722 return "UTF-7,version=1";
728 static const UConverterImpl _UTF7Impl
={
738 _UTF7ToUnicodeWithOffsets
,
739 _UTF7ToUnicodeWithOffsets
,
740 _UTF7FromUnicodeWithOffsets
,
741 _UTF7FromUnicodeWithOffsets
,
746 NULL
, /* we don't need writeSub() because we never call a callback at fromUnicode() */
748 ucnv_getCompleteUnicodeSet
751 static const UConverterStaticData _UTF7StaticData
={
752 sizeof(UConverterStaticData
),
754 0, /* TODO CCSID for UTF-7 */
757 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
761 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
764 const UConverterSharedData _UTF7Data
={
765 sizeof(UConverterSharedData
), ~((uint32_t)0),
766 NULL
, NULL
, &_UTF7StaticData
, FALSE
, &_UTF7Impl
,
770 /* IMAP mailbox name encoding ----------------------------------------------- */
773 * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
774 * http://www.ietf.org/rfc/rfc2060.txt
776 * 5.1.3. Mailbox International Naming Convention
778 * By convention, international mailbox names are specified using a
779 * modified version of the UTF-7 encoding described in [UTF-7]. The
780 * purpose of these modifications is to correct the following problems
783 * 1) UTF-7 uses the "+" character for shifting; this conflicts with
784 * the common use of "+" in mailbox names, in particular USENET
787 * 2) UTF-7's encoding is BASE64 which uses the "/" character; this
788 * conflicts with the use of "/" as a popular hierarchy delimiter.
790 * 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
791 * the use of "\" as a popular hierarchy delimiter.
793 * 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
794 * the use of "~" in some servers as a home directory indicator.
796 * 5) UTF-7 permits multiple alternate forms to represent the same
797 * string; in particular, printable US-ASCII characters can be
798 * represented in encoded form.
800 * In modified UTF-7, printable US-ASCII characters except for "&"
801 * represent themselves; that is, characters with octet values 0x20-0x25
802 * and 0x27-0x7e. The character "&" (0x26) is represented by the two-
803 * octet sequence "&-".
805 * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
806 * Unicode 16-bit octets) are represented in modified BASE64, with a
807 * further modification from [UTF-7] that "," is used instead of "/".
808 * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
809 * character which can represent itself.
811 * "&" is used to shift to modified BASE64 and "-" to shift back to US-
812 * ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that
813 * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
816 * For example, here is a mailbox name which mixes English, Japanese,
817 * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
821 * Tests for US-ASCII characters belonging to character classes
824 * Set D (directly encoded characters) consists of the following
825 * characters: the upper and lower case letters A through Z
826 * and a through z, the 10 digits 0-9, and the following nine special
827 * characters (note that "+" and "=" are omitted):
830 * Set O (optional direct characters) consists of the following
831 * characters (note that "\" and "~" are omitted):
832 * !"#$%&*;<=>@[]^_`{|}
834 * According to the rules in RFC 2152, the byte values for the following
835 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
836 * - all C0 control codes except for CR LF TAB
840 * - all codes beyond US-ASCII, i.e. all >127
/* uses '&' not '+' to start a base64 sequence */
#define AMPERSAND 0x26

/*
 * legal byte values: all US-ASCII graphic characters 0x20..0x7e
 * (the argument is evaluated twice; do not pass expressions with side effects)
 */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/*
 * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
 * The argument is parenthesized in the comparison so that an expression
 * argument (e.g. one using ?:) is compared as a whole against AMPERSAND.
 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)
/* modified base64 output: values below 63 go through the toBase64 table, anything else is written as COMMA */
854 #define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
/* modified base64 input: COMMA decodes to 63, SLASH yields -1, every other byte is looked up in fromBase64 */
855 #define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
858 * converter status values:
861 * 24 inDirectMode (boolean)
862 * 23..16 base64Counter (-1..7)
863 * 15..0 bits (up to 14 bits incoming base64)
866 * 24 inDirectMode (boolean)
867 * 23..16 base64Counter (0..2)
868 * 7..0 bits (6 bits outgoing base64)
874 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
875 UErrorCode
*pErrorCode
) {
877 const uint8_t *source
, *sourceLimit
;
879 const UChar
*targetLimit
;
885 int32_t length
, targetCapacity
;
889 int8_t base64Counter
;
894 int32_t sourceIndex
, nextSourceIndex
;
899 /* set up the local pointers */
900 cnv
=pArgs
->converter
;
902 source
=(const uint8_t *)pArgs
->source
;
903 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
904 target
=pArgs
->target
;
905 targetLimit
=pArgs
->targetLimit
;
906 offsets
=pArgs
->offsets
;
907 /* get the state machine state */
909 uint32_t status
=cnv
->toUnicodeStatus
;
910 inDirectMode
=(UBool
)((status
>>24)&1);
911 base64Counter
=(int8_t)(status
>>16);
912 bits
=(uint16_t)status
;
915 byteIndex
=cnv
->toULength
;
917 /* sourceIndex=-1 if the current character began in the previous buffer */
918 sourceIndex
=byteIndex
==0 ? 0 : -1;
924 * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
925 * with their US-ASCII byte values.
926 * An ampersand starts Unicode (or "escape") Mode.
928 * In Direct Mode, only the sourceIndex is used.
931 length
=(int32_t)(sourceLimit
-source
);
932 targetCapacity
=(int32_t)(targetLimit
-target
);
933 if(length
>targetCapacity
) {
934 length
=targetCapacity
;
938 if(!isLegalIMAP(b
)) {
942 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
944 } else if(b
!=AMPERSAND
) {
945 /* write directly encoded character */
948 *offsets
++=sourceIndex
++;
950 } else /* AMPERSAND */ {
951 /* switch to Unicode mode */
952 nextSourceIndex
=++sourceIndex
;
961 if(source
<sourceLimit
&& target
>=targetLimit
) {
963 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
968 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
969 * The base64 sequence ends with any character that is not in the base64 alphabet.
970 * A terminating minus sign is consumed.
971 * US-ASCII must not be base64-ed.
973 * In Unicode Mode, the sourceIndex has the index to the start of the current
974 * base64 bytes, while nextSourceIndex is precisely parallel to source,
975 * keeping the index to the following byte.
976 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
978 while(source
<sourceLimit
) {
979 if(target
<targetLimit
) {
980 bytes
[byteIndex
++]=b
=*source
++;
983 /* illegal - test other illegal US-ASCII values by base64Value==-3 */
985 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
987 } else if((base64Value
=FROM_BASE64_IMAP(b
))>=0) {
988 /* collect base64 bytes into UChars */
989 switch(base64Counter
) {
990 case -1: /* -1 is immediately after the & */
999 bits
=(uint16_t)((bits
<<6)|base64Value
);
1003 c
=(UChar
)((bits
<<4)|(base64Value
>>2));
1004 if(isLegalIMAP(c
)) {
1007 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1012 *offsets
++=sourceIndex
;
1013 sourceIndex
=nextSourceIndex
-1;
1015 bytes
[0]=b
; /* keep this byte in case an error occurs */
1017 bits
=(uint16_t)(base64Value
&3);
1021 c
=(UChar
)((bits
<<2)|(base64Value
>>4));
1022 if(isLegalIMAP(c
)) {
1025 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1030 *offsets
++=sourceIndex
;
1031 sourceIndex
=nextSourceIndex
-1;
1033 bytes
[0]=b
; /* keep this byte in case an error occurs */
1035 bits
=(uint16_t)(base64Value
&15);
1039 c
=(UChar
)((bits
<<6)|base64Value
);
1040 if(isLegalIMAP(c
)) {
1043 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1048 *offsets
++=sourceIndex
;
1049 sourceIndex
=nextSourceIndex
;
1056 /* will never occur */
1059 } else if(base64Value
==-2) {
1060 /* minus sign terminates the base64 sequence */
1062 if(base64Counter
==-1) {
1063 /* &- i.e. a minus immediately following an ampersand */
1064 *target
++=AMPERSAND
;
1066 *offsets
++=sourceIndex
-1;
1069 /* absorb the minus and leave the Unicode Mode */
1070 if(bits
!=0 || (base64Counter
!=0 && base64Counter
!=3 && base64Counter
!=6)) {
1071 /* bits are illegally left over, a UChar is incomplete */
1072 /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1073 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1077 sourceIndex
=nextSourceIndex
;
1080 if(base64Counter
==-1) {
1081 /* illegal: & immediately followed by something other than base64 or minus sign */
1082 /* include the ampersand in the reported sequence */
1088 /* base64Value==-1 for characters that are illegal only in Unicode mode */
1089 /* base64Value==-3 for illegal characters */
1092 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1096 /* target is full */
1097 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1105 * the end of the input stream and detection of truncated input
1106 * are handled by the framework, but here we must check if we are in Unicode
1107 * mode and byteIndex==0 because we must end in direct mode
1111 * in Unicode mode and byteIndex==0
1112 * end of input and no truncated input
1114 if( U_SUCCESS(*pErrorCode
) &&
1115 !inDirectMode
&& byteIndex
==0 &&
1116 pArgs
->flush
&& source
>=sourceLimit
1118 if(base64Counter
==-1) {
1119 /* & at the very end of the input */
1120 /* make the ampersand the reported sequence */
1124 /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1126 inDirectMode
=TRUE
; /* avoid looping */
1127 *pErrorCode
=U_TRUNCATED_CHAR_FOUND
;
1130 /* set the converter state back into UConverter */
1131 cnv
->toUnicodeStatus
=((uint32_t)inDirectMode
<<24)|((uint32_t)((uint8_t)base64Counter
)<<16)|(uint32_t)bits
;
1132 cnv
->toULength
=byteIndex
;
1134 /* write back the updated pointers */
1135 pArgs
->source
=(const char *)source
;
1136 pArgs
->target
=target
;
1137 pArgs
->offsets
=offsets
;
1142 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
1143 UErrorCode
*pErrorCode
) {
1145 const UChar
*source
, *sourceLimit
;
1146 uint8_t *target
, *targetLimit
;
1149 int32_t length
, targetCapacity
, sourceIndex
;
1155 int8_t base64Counter
;
1158 /* set up the local pointers */
1159 cnv
=pArgs
->converter
;
1161 /* set up the local pointers */
1162 source
=pArgs
->source
;
1163 sourceLimit
=pArgs
->sourceLimit
;
1164 target
=(uint8_t *)pArgs
->target
;
1165 targetLimit
=(uint8_t *)pArgs
->targetLimit
;
1166 offsets
=pArgs
->offsets
;
1168 /* get the state machine state */
1170 uint32_t status
=cnv
->fromUnicodeStatus
;
1171 inDirectMode
=(UBool
)((status
>>24)&1);
1172 base64Counter
=(int8_t)(status
>>16);
1173 bits
=(uint8_t)status
;
1176 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1181 length
=(int32_t)(sourceLimit
-source
);
1182 targetCapacity
=(int32_t)(targetLimit
-target
);
1183 if(length
>targetCapacity
) {
1184 length
=targetCapacity
;
1188 /* encode 0x20..0x7e except '&' directly */
1190 /* encode directly */
1191 *target
++=(uint8_t)c
;
1193 *offsets
++=sourceIndex
++;
1195 } else if(c
==AMPERSAND
) {
1196 /* output &- for & */
1197 *target
++=AMPERSAND
;
1198 if(target
<targetLimit
) {
1201 *offsets
++=sourceIndex
;
1202 *offsets
++=sourceIndex
++;
1204 /* realign length and targetCapacity */
1208 *offsets
++=sourceIndex
++;
1210 cnv
->charErrorBuffer
[0]=MINUS
;
1211 cnv
->charErrorBufferLength
=1;
1212 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1216 /* un-read this character and switch to Unicode Mode */
1218 *target
++=AMPERSAND
;
1220 *offsets
++=sourceIndex
;
1228 if(source
<sourceLimit
&& target
>=targetLimit
) {
1229 /* target is full */
1230 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1234 while(source
<sourceLimit
) {
1235 if(target
<targetLimit
) {
1237 if(isLegalIMAP(c
)) {
1238 /* encode directly */
1241 /* trick: back out this character to make this easier */
1244 /* terminate the base64 sequence */
1245 if(base64Counter
!=0) {
1246 /* write remaining bits for the previous character */
1247 *target
++=TO_BASE64_IMAP(bits
);
1249 *offsets
++=sourceIndex
-1;
1252 /* need to terminate with a minus */
1253 if(target
<targetLimit
) {
1256 *offsets
++=sourceIndex
-1;
1259 cnv
->charErrorBuffer
[0]=MINUS
;
1260 cnv
->charErrorBufferLength
=1;
1261 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1267 * base64 this character:
1268 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1269 * and the bits of this character, each implicitly in UTF-16BE.
1271 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1272 * character to the next. The actual 2 or 4 bits are shifted to the left edge
1273 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1275 switch(base64Counter
) {
1278 *target
++=TO_BASE64_IMAP(b
);
1279 if(target
<targetLimit
) {
1280 b
=(uint8_t)((c
>>4)&0x3f);
1281 *target
++=TO_BASE64_IMAP(b
);
1283 *offsets
++=sourceIndex
;
1284 *offsets
++=sourceIndex
++;
1288 *offsets
++=sourceIndex
++;
1290 b
=(uint8_t)((c
>>4)&0x3f);
1291 cnv
->charErrorBuffer
[0]=TO_BASE64_IMAP(b
);
1292 cnv
->charErrorBufferLength
=1;
1293 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1295 bits
=(uint8_t)((c
&15)<<2);
1299 b
=(uint8_t)(bits
|(c
>>14));
1300 *target
++=TO_BASE64_IMAP(b
);
1301 if(target
<targetLimit
) {
1302 b
=(uint8_t)((c
>>8)&0x3f);
1303 *target
++=TO_BASE64_IMAP(b
);
1304 if(target
<targetLimit
) {
1305 b
=(uint8_t)((c
>>2)&0x3f);
1306 *target
++=TO_BASE64_IMAP(b
);
1308 *offsets
++=sourceIndex
;
1309 *offsets
++=sourceIndex
;
1310 *offsets
++=sourceIndex
++;
1314 *offsets
++=sourceIndex
;
1315 *offsets
++=sourceIndex
++;
1317 b
=(uint8_t)((c
>>2)&0x3f);
1318 cnv
->charErrorBuffer
[0]=TO_BASE64_IMAP(b
);
1319 cnv
->charErrorBufferLength
=1;
1320 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1324 *offsets
++=sourceIndex
++;
1326 b
=(uint8_t)((c
>>8)&0x3f);
1327 cnv
->charErrorBuffer
[0]=TO_BASE64_IMAP(b
);
1328 b
=(uint8_t)((c
>>2)&0x3f);
1329 cnv
->charErrorBuffer
[1]=TO_BASE64_IMAP(b
);
1330 cnv
->charErrorBufferLength
=2;
1331 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1333 bits
=(uint8_t)((c
&3)<<4);
1337 b
=(uint8_t)(bits
|(c
>>12));
1338 *target
++=TO_BASE64_IMAP(b
);
1339 if(target
<targetLimit
) {
1340 b
=(uint8_t)((c
>>6)&0x3f);
1341 *target
++=TO_BASE64_IMAP(b
);
1342 if(target
<targetLimit
) {
1343 b
=(uint8_t)(c
&0x3f);
1344 *target
++=TO_BASE64_IMAP(b
);
1346 *offsets
++=sourceIndex
;
1347 *offsets
++=sourceIndex
;
1348 *offsets
++=sourceIndex
++;
1352 *offsets
++=sourceIndex
;
1353 *offsets
++=sourceIndex
++;
1355 b
=(uint8_t)(c
&0x3f);
1356 cnv
->charErrorBuffer
[0]=TO_BASE64_IMAP(b
);
1357 cnv
->charErrorBufferLength
=1;
1358 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1362 *offsets
++=sourceIndex
++;
1364 b
=(uint8_t)((c
>>6)&0x3f);
1365 cnv
->charErrorBuffer
[0]=TO_BASE64_IMAP(b
);
1366 b
=(uint8_t)(c
&0x3f);
1367 cnv
->charErrorBuffer
[1]=TO_BASE64_IMAP(b
);
1368 cnv
->charErrorBufferLength
=2;
1369 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1375 /* will never occur */
1380 /* target is full */
1381 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1387 if(pArgs
->flush
&& source
>=sourceLimit
) {
1388 /* flush remaining bits to the target */
1390 if(base64Counter
!=0) {
1391 if(target
<targetLimit
) {
1392 *target
++=TO_BASE64_IMAP(bits
);
1394 *offsets
++=sourceIndex
-1;
1397 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++]=TO_BASE64_IMAP(bits
);
1398 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1401 /* need to terminate with a minus */
1402 if(target
<targetLimit
) {
1405 *offsets
++=sourceIndex
-1;
1408 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++]=MINUS
;
1409 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1412 /* reset the state for the next conversion */
1413 cnv
->fromUnicodeStatus
=(cnv
->fromUnicodeStatus
&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1415 /* set the converter state back into UConverter */
1416 cnv
->fromUnicodeStatus
=
1417 (cnv
->fromUnicodeStatus
&0xf0000000)| /* keep version*/
1418 ((uint32_t)inDirectMode
<<24)|((uint32_t)base64Counter
<<16)|(uint32_t)bits
;
1421 /* write back the updated pointers */
1422 pArgs
->source
=source
;
1423 pArgs
->target
=(char *)target
;
1424 pArgs
->offsets
=offsets
;
1428 static const UConverterImpl _IMAPImpl
={
1438 _IMAPToUnicodeWithOffsets
,
1439 _IMAPToUnicodeWithOffsets
,
1440 _IMAPFromUnicodeWithOffsets
,
1441 _IMAPFromUnicodeWithOffsets
,
1446 NULL
, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1448 ucnv_getCompleteUnicodeSet
1451 static const UConverterStaticData _IMAPStaticData
={
1452 sizeof(UConverterStaticData
),
1453 "IMAP-mailbox-name",
1454 0, /* TODO CCSID for IMAP-mailbox-name */
1455 UCNV_IBM
, UCNV_IMAP_MAILBOX
,
1457 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1461 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1464 const UConverterSharedData _IMAPData
={
1465 sizeof(UConverterSharedData
), ~((uint32_t)0),
1466 NULL
, NULL
, &_IMAPStaticData
, FALSE
, &_IMAPImpl
,