2 **********************************************************************
3 * Copyright (C) 2002-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 * tab size: 8 (not used)
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
14 * UTF-7 converter implementation. Used to be in ucnv_utf.c.
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
22 #include "unicode/ucnv.h"
27 /* UTF-7 -------------------------------------------------------------------- */
30 * UTF-7 is a stateful encoding of Unicode.
31 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
32 * It was intended for use in Internet email systems, using in its bytewise
33 * encoding only a subset of 7-bit US-ASCII.
34 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
37 * For converting Unicode to UTF-7, the RFC allows some US-ASCII characters
38 * to be encoded either directly or in base64. In particular, the characters in set O
39 * as defined in the RFC (see below) may be encoded directly but are not
40 * allowed in, e.g., email headers.
41 * By default, the ICU UTF-7 converter encodes set O directly.
42 * By choosing the option "version=1", set O will be escaped instead.
44 * utf7Converter=ucnv_open("UTF-7,version=1");
46 * For details about email headers see RFC 2047.
50 * Tests for US-ASCII characters belonging to character classes
53 * Set D (directly encoded characters) consists of the following
54 * characters: the upper and lower case letters A through Z
55 * and a through z, the 10 digits 0-9, and the following nine special
56 * characters (note that "+" and "=" are omitted):
59 * Set O (optional direct characters) consists of the following
60 * characters (note that "\" and "~" are omitted):
61 * !"#$%&*;<=>@[]^_`{|}
63 * According to the rules in RFC 2152, the byte values for the following
64 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
65 * - all C0 control codes except for CR LF TAB
69 * - all codes beyond US-ASCII, i.e. all >127
72 ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \
73 (uint8_t)((c)-48)<10 || /* digits */ \
74 (uint8_t)((c)-39)<3 || /* '() */ \
75 (uint8_t)((c)-44)<4 || /* ,-./ */ \
76 (c)==58 || (c)==63 /* :? */ \
80 ((uint8_t)((c)-33)<6 || /* !"#$%& */ \
81 (uint8_t)((c)-59)<4 || /* ;<=> */ \
82 (uint8_t)((c)-93)<4 || /* ]^_` */ \
83 (uint8_t)((c)-123)<3 || /* {|} */ \
84 (c)==42 || (c)==64 || (c)==91 /* *@[ */ \
/* Non-zero when c is CR (13), LF (10) or TAB (9). The argument is evaluated more than once. */
87 #define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9)
/* Non-zero when c is SP (32), CR (13), LF (10) or TAB (9). The argument is evaluated more than once. */
88 #define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9)
95 /* legal byte values: all US-ASCII graphic characters from space to before tilde, and CR LF TAB */
/*
 * (uint8_t)((c)-32)<94 is a single-comparison range test for 32..125:
 * values below 32 wrap around to large uint8_t values and fail the test,
 * and 126 (tilde) and above are excluded by the upper bound.
 * BACKSLASH is defined elsewhere in the file (presumably 0x5c - confirm).
 * The argument is evaluated more than once.
 */
96 #define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
98 /* encode directly sets D and O and CR LF SP TAB */
/*
 * Lookup table indexed by a US-ASCII code (0..127); a 1 means the character
 * is written to UTF-7 output as itself, a 0 means it is not encoded directly.
 * Each row covers 16 codes, so the eight rows are 0x00, 0x10, ... 0x70.
 * Entries that are 0 in the rows below: every C0 control except
 * TAB (0x09), LF (0x0a) and CR (0x0d); '+' (0x2b); '\' (0x5c);
 * '~' (0x7e) and DEL (0x7f).
 */
99 static const UBool encodeDirectlyMaximum
[128]={
100 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
101 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
102 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
104 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
105 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
107 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
108 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
110 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
111 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
114 /* encode directly set D and CR LF SP TAB but not set O */
/*
 * Lookup table indexed by a US-ASCII code (0..127), with the same layout as
 * encodeDirectlyMaximum (16 codes per row), but with the set O characters
 * cleared as well. The 1 entries below are: TAB, LF, CR, SP, the
 * letters A-Z and a-z, the digits 0-9, and ' ( ) , - . / : ?
 * Everything else (including ! " # $ % & * + ; < = > @ [ \ ] ^ _ ` { | } ~)
 * is 0, i.e. not encoded directly.
 */
115 static const UBool encodeDirectlyRestricted
[128]={
116 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
117 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
120 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
121 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
123 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
124 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
126 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
127 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
133 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
134 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
136 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
137 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
139 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
146 /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
147 -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
148 -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
150 /* general punctuation with + and / and a special value (-2) for - */
151 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
153 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
156 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
157 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,
160 -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
161 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
165 * converter status values:
168 * 24 inDirectMode (boolean)
169 * 23..16 base64Counter (-1..7)
170 * 15..0 bits (up to 14 bits incoming base64)
173 * 31..28 version (0: set O direct 1: set O escaped)
174 * 24 inDirectMode (boolean)
175 * 23..16 base64Counter (0..2)
176 * 7..0 bits (6 bits outgoing base64)
/*
 * Resets the converter state for one or both conversion directions.
 *
 * cnv    - converter whose toUnicodeStatus / fromUnicodeStatus are rewritten
 * choice - selects the direction(s): values <=UCNV_RESET_TO_UNICODE reset the
 *          toUnicode state, values !=UCNV_RESET_TO_UNICODE reset the
 *          fromUnicode state, so a value below UCNV_RESET_TO_UNICODE
 *          resets both.
 */
181 _UTF7Reset(UConverter
*cnv
, UConverterResetChoice choice
) {
182 if(choice
<=UCNV_RESET_TO_UNICODE
) {
183 /* reset toUnicode */
/* 0x1000000 sets only bit 24 (inDirectMode); counter and bits become 0 */
184 cnv
->toUnicodeStatus
=0x1000000; /* inDirectMode=TRUE */
187 if(choice
!=UCNV_RESET_TO_UNICODE
) {
188 /* reset fromUnicode */
/* the mask 0xf0000000 preserves the version nibble in bits 31..28 */
189 cnv
->fromUnicodeStatus
=(cnv
->fromUnicodeStatus
&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
/*
 * Open hook for the UTF-7 converter.
 *
 * cnv        - converter being opened; its option version is read with
 *              UCNV_GET_VERSION
 * pArgs      - load arguments (not referenced in the visible statements)
 * pErrorCode - set to U_ILLEGAL_ARGUMENT_ERROR on the path that does not
 *              accept the version (only versions 0 and 1 pass the <=1 test)
 */
194 _UTF7Open(UConverter
*cnv
,
195 UConverterLoadArgs
*pArgs
,
196 UErrorCode
*pErrorCode
) {
197 if(UCNV_GET_VERSION(cnv
)<=1) {
198 /* TODO(markus): Should just use cnv->options rather than copying the version number. */
/* store the version in bits 31..28, where _UTF7Reset and the fromUnicode code expect it */
199 cnv
->fromUnicodeStatus
=UCNV_GET_VERSION(cnv
)<<28;
200 _UTF7Reset(cnv
, UCNV_RESET_BOTH
);
202 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
/*
 * Converts UTF-7 bytes to UChars and records a source offset per UChar.
 *
 * pArgs      - supplies converter, source/sourceLimit, target/targetLimit,
 *              offsets and flush; source, target and offsets are written
 *              back on exit
 * pErrorCode - receives U_ILLEGAL_CHAR_FOUND for malformed input and
 *              U_BUFFER_OVERFLOW_ERROR when the target is full while input
 *              remains
 *
 * The decoder state (inDirectMode, base64Counter, bits) is unpacked from
 * cnv->toUnicodeStatus on entry and packed back on exit; the number of
 * pending input bytes is carried in cnv->toULength.
 */
207 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
208 UErrorCode
*pErrorCode
) {
210 const uint8_t *source
, *sourceLimit
;
212 const UChar
*targetLimit
;
218 int32_t length
, targetCapacity
;
222 int8_t base64Counter
;
227 int32_t sourceIndex
, nextSourceIndex
;
230 /* set up the local pointers */
231 cnv
=pArgs
->converter
;
233 source
=(const uint8_t *)pArgs
->source
;
234 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
235 target
=pArgs
->target
;
236 targetLimit
=pArgs
->targetLimit
;
237 offsets
=pArgs
->offsets
;
238 /* get the state machine state */
/* layout: bit 24 = inDirectMode, bits 23..16 = base64Counter (read as signed 8-bit), bits 15..0 = bits */
240 uint32_t status
=cnv
->toUnicodeStatus
;
241 inDirectMode
=(UBool
)((status
>>24)&1);
242 base64Counter
=(int8_t)(status
>>16);
243 bits
=(uint16_t)status
;
/* number of bytes already collected for the current sequence */
246 byteIndex
=cnv
->toULength
;
248 /* sourceIndex=-1 if the current character began in the previous buffer */
249 sourceIndex
=byteIndex
==0 ? 0 : -1;
255 * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
256 * with their US-ASCII byte values.
257 * Backslash and Tilde and most control characters are not allowed in UTF-7.
258 * A plus sign starts Unicode (or "escape") Mode.
260 * In Direct Mode, only the sourceIndex is used.
/* direct mode produces one UChar per byte, so the run length is capped by the target capacity */
263 length
=(int32_t)(sourceLimit
-source
);
264 targetCapacity
=(int32_t)(targetLimit
-target
);
265 if(length
>targetCapacity
) {
266 length
=targetCapacity
;
270 if(!isLegalUTF7(b
)) {
274 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
277 /* write directly encoded character */
280 *offsets
++=sourceIndex
++;
283 /* switch to Unicode mode */
284 nextSourceIndex
=++sourceIndex
;
/* input remains but the target has no room left */
293 if(source
<sourceLimit
&& target
>=targetLimit
) {
295 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
300 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
301 * The base64 sequence ends with any character that is not in the base64 alphabet.
302 * A terminating minus sign is consumed.
304 * In Unicode Mode, the sourceIndex has the index to the start of the current
305 * base64 bytes, while nextSourceIndex is precisely parallel to source,
306 * keeping the index to the following byte.
307 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
309 while(source
<sourceLimit
) {
310 if(target
<targetLimit
) {
/* remember the byte in bytes[] so that an error can report the sequence */
311 bytes
[byteIndex
++]=b
=*source
++;
313 base64Value
= -3; /* initialize as illegal */
/*
 * b>=126 is tested first, so fromBase64[] is only indexed when b<126;
 * the lookup result is kept in base64Value for the branches below.
 */
314 if(b
>=126 || (base64Value
=fromBase64
[b
])==-3 || base64Value
==-1) {
316 * base64Value==-1 for any legal character except base64 and minus sign, or
317 * base64Value==-3 for illegal characters:
318 * 1. In either case, leave Unicode mode.
319 * 2.1. If we ended with an incomplete UChar or none after the +, then
320 * generate an error for the preceding erroneous sequence and deal with
321 * the current (possibly illegal) character next time through.
322 * 2.2. Else the current char comes after a complete UChar, which was already
323 * pushed to the output buf, so:
324 * 2.2.1. If the current char is legal, just save it for processing next time.
325 * It may be for example, a plus which we need to deal with in direct mode.
326 * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
329 if(base64Counter
==-1) {
330 /* illegal: + immediately followed by something other than base64 or minus sign */
331 /* include the plus sign in the reported sequence, but not the subsequent char */
335 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
338 /* bits are illegally left over, a UChar is incomplete */
339 /* don't include current char (legal or illegal) in error seq */
342 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
345 /* previous UChar was complete */
346 if(base64Value
==-3) {
347 /* current character is illegal, deal with it here */
348 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
351 /* un-read the current character in case it is a plus sign */
353 sourceIndex
=nextSourceIndex
-1;
357 } else if(base64Value
>=0) {
358 /* collect base64 bytes into UChars */
359 switch(base64Counter
) {
360 case -1: /* -1 is immediately after the + */
/* not enough bits for a UChar yet: append the 6 new bits */
369 bits
=(uint16_t)((bits
<<6)|base64Value
);
/* UChar completed with the top 4 bits of this base64 value; its low 2 bits are kept below */
373 *target
++=(UChar
)((bits
<<4)|(base64Value
>>2));
375 *offsets
++=sourceIndex
;
376 sourceIndex
=nextSourceIndex
-1;
378 bytes
[0]=b
; /* keep this byte in case an error occurs */
380 bits
=(uint16_t)(base64Value
&3);
/* UChar completed with the top 2 bits of this base64 value; its low 4 bits are kept below */
384 *target
++=(UChar
)((bits
<<2)|(base64Value
>>4));
386 *offsets
++=sourceIndex
;
387 sourceIndex
=nextSourceIndex
-1;
389 bytes
[0]=b
; /* keep this byte in case an error occurs */
391 bits
=(uint16_t)(base64Value
&15);
/* UChar completed with all 6 bits of this base64 value; the next UChar starts at the next byte */
395 *target
++=(UChar
)((bits
<<6)|base64Value
);
397 *offsets
++=sourceIndex
;
398 sourceIndex
=nextSourceIndex
;
405 /* will never occur */
408 } else /*base64Value==-2*/ {
409 /* minus sign terminates the base64 sequence */
411 if(base64Counter
==-1) {
412 /* +- i.e. a minus immediately following a plus */
415 *offsets
++=sourceIndex
-1;
418 /* absorb the minus and leave the Unicode Mode */
420 /* bits are illegally left over, a UChar is incomplete */
421 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
425 sourceIndex
=nextSourceIndex
;
430 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/* end-of-stream handling: only when flushing, all input is consumed, and no bits are left over */
436 if(U_SUCCESS(*pErrorCode
) && pArgs
->flush
&& source
==sourceLimit
&& bits
==0) {
438 * if we are in Unicode mode, then the byteIndex might not be 0,
439 * but that is ok if bits==0
440 * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
441 * (not true for IMAP-mailbox-name where we must end in direct mode)
446 /* set the converter state back into UConverter */
/* base64Counter goes through uint8_t so that -1 does not sign-extend into bits 31..24 */
447 cnv
->toUnicodeStatus
=((uint32_t)inDirectMode
<<24)|((uint32_t)((uint8_t)base64Counter
)<<16)|(uint32_t)bits
;
448 cnv
->toULength
=byteIndex
;
450 /* write back the updated pointers */
451 pArgs
->source
=(const char *)source
;
452 pArgs
->target
=target
;
453 pArgs
->offsets
=offsets
;
/*
 * Converts UChars to UTF-7 bytes and records a source offset per output byte.
 *
 * pArgs      - supplies converter, source/sourceLimit, target/targetLimit,
 *              offsets and flush; source, target and offsets are written
 *              back on exit
 * pErrorCode - receives U_BUFFER_OVERFLOW_ERROR when output does not fit
 *
 * Output bytes that do not fit into the target are stored in
 * cnv->charErrorBuffer (with cnv->charErrorBufferLength) before the overflow
 * error is set. The encoder state (version, inDirectMode, base64Counter,
 * bits) lives in cnv->fromUnicodeStatus.
 */
458 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
459 UErrorCode
*pErrorCode
) {
461 const UChar
*source
, *sourceLimit
;
462 uint8_t *target
, *targetLimit
;
465 int32_t length
, targetCapacity
, sourceIndex
;
469 const UBool
*encodeDirectly
;
471 int8_t base64Counter
;
474 /* set up the local pointers */
475 cnv
=pArgs
->converter
;
477 /* set up the local pointers */
478 source
=pArgs
->source
;
479 sourceLimit
=pArgs
->sourceLimit
;
480 target
=(uint8_t *)pArgs
->target
;
481 targetLimit
=(uint8_t *)pArgs
->targetLimit
;
482 offsets
=pArgs
->offsets
;
484 /* get the state machine state */
486 uint32_t status
=cnv
->fromUnicodeStatus
;
/* status<0x10000000 means the version nibble (bits 31..28) is 0: set O is encoded directly */
487 encodeDirectly
= status
<0x10000000 ? encodeDirectlyMaximum
: encodeDirectlyRestricted
;
488 inDirectMode
=(UBool
)((status
>>24)&1);
489 base64Counter
=(int8_t)(status
>>16);
490 bits
=(uint8_t)status
;
/*
 * NOTE(review): "<=" also accepts bits==UPRV_LENGTHOF(toBase64); bits is used
 * below as toBase64[bits], so "<" looks like the intended bound - confirm.
 */
491 U_ASSERT(bits
<=UPRV_LENGTHOF(toBase64
));
494 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
/* the direct-mode run is capped by the remaining target capacity */
499 length
=(int32_t)(sourceLimit
-source
);
500 targetCapacity
=(int32_t)(targetLimit
-target
);
501 if(length
>targetCapacity
) {
502 length
=targetCapacity
;
506 /* currently always encode CR LF SP TAB directly */
/* the c<=127 test keeps the index inside the 128-entry encodeDirectly table */
507 if(c
<=127 && encodeDirectly
[c
]) {
508 /* encode directly */
509 *target
++=(uint8_t)c
;
511 *offsets
++=sourceIndex
++;
514 /* output +- for + */
516 if(target
<targetLimit
) {
519 *offsets
++=sourceIndex
;
520 *offsets
++=sourceIndex
++;
522 /* realign length and targetCapacity */
526 *offsets
++=sourceIndex
++;
/* no room for the '-': hand it to the caller through the overflow buffer */
528 cnv
->charErrorBuffer
[0]=MINUS
;
529 cnv
->charErrorBufferLength
=1;
530 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
534 /* un-read this character and switch to Unicode Mode */
538 *offsets
++=sourceIndex
;
546 if(source
<sourceLimit
&& target
>=targetLimit
) {
548 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
552 while(source
<sourceLimit
) {
553 if(target
<targetLimit
) {
555 if(c
<=127 && encodeDirectly
[c
]) {
556 /* encode directly */
559 /* trick: back out this character to make this easier */
562 /* terminate the base64 sequence */
563 if(base64Counter
!=0) {
564 /* write remaining bits for the previous character */
565 *target
++=toBase64
[bits
];
567 *offsets
++=sourceIndex
-1;
/* a character whose fromBase64 entry is not -1 directly after base64 output needs a '-' separator */
570 if(fromBase64
[c
]!=-1) {
571 /* need to terminate with a minus */
572 if(target
<targetLimit
) {
575 *offsets
++=sourceIndex
-1;
578 cnv
->charErrorBuffer
[0]=MINUS
;
579 cnv
->charErrorBufferLength
=1;
580 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
587 * base64 this character:
588 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
589 * and the bits of this character, each implicitly in UTF-16BE.
591 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
592 * character to the next. The actual 2 or 4 bits are shifted to the left edge
593 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
595 switch(base64Counter
) {
/* no bits pending: emit c bits 15..10 and 9..4, keep bits 3..0 left-aligned in bits */
597 *target
++=toBase64
[c
>>10];
598 if(target
<targetLimit
) {
599 *target
++=toBase64
[(c
>>4)&0x3f];
601 *offsets
++=sourceIndex
;
602 *offsets
++=sourceIndex
++;
606 *offsets
++=sourceIndex
++;
608 cnv
->charErrorBuffer
[0]=toBase64
[(c
>>4)&0x3f];
609 cnv
->charErrorBufferLength
=1;
610 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
612 bits
=(uint8_t)((c
&15)<<2);
/* 4 bits pending: emit pending|c bits 15..14, then bits 13..8 and 7..2, keep bits 1..0 */
616 *target
++=toBase64
[bits
|(c
>>14)];
617 if(target
<targetLimit
) {
618 *target
++=toBase64
[(c
>>8)&0x3f];
619 if(target
<targetLimit
) {
620 *target
++=toBase64
[(c
>>2)&0x3f];
622 *offsets
++=sourceIndex
;
623 *offsets
++=sourceIndex
;
624 *offsets
++=sourceIndex
++;
628 *offsets
++=sourceIndex
;
629 *offsets
++=sourceIndex
++;
631 cnv
->charErrorBuffer
[0]=toBase64
[(c
>>2)&0x3f];
632 cnv
->charErrorBufferLength
=1;
633 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
637 *offsets
++=sourceIndex
++;
639 cnv
->charErrorBuffer
[0]=toBase64
[(c
>>8)&0x3f];
640 cnv
->charErrorBuffer
[1]=toBase64
[(c
>>2)&0x3f];
641 cnv
->charErrorBufferLength
=2;
642 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
644 bits
=(uint8_t)((c
&3)<<4);
/* 2 bits pending: emit pending|c bits 15..12, then bits 11..6 and 5..0; nothing is left over */
648 *target
++=toBase64
[bits
|(c
>>12)];
649 if(target
<targetLimit
) {
650 *target
++=toBase64
[(c
>>6)&0x3f];
651 if(target
<targetLimit
) {
652 *target
++=toBase64
[c
&0x3f];
654 *offsets
++=sourceIndex
;
655 *offsets
++=sourceIndex
;
656 *offsets
++=sourceIndex
++;
660 *offsets
++=sourceIndex
;
661 *offsets
++=sourceIndex
++;
663 cnv
->charErrorBuffer
[0]=toBase64
[c
&0x3f];
664 cnv
->charErrorBufferLength
=1;
665 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
669 *offsets
++=sourceIndex
++;
671 cnv
->charErrorBuffer
[0]=toBase64
[(c
>>6)&0x3f];
672 cnv
->charErrorBuffer
[1]=toBase64
[c
&0x3f];
673 cnv
->charErrorBufferLength
=2;
674 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
680 /* will never occur */
686 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/* end of input with flush requested: close any open base64 run */
692 if(pArgs
->flush
&& source
>=sourceLimit
) {
693 /* flush remaining bits to the target */
695 if (base64Counter
!=0) {
696 if(target
<targetLimit
) {
697 *target
++=toBase64
[bits
];
699 *offsets
++=sourceIndex
-1;
/* appended at the current charErrorBufferLength because an earlier step may already have stored bytes there */
702 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++]=toBase64
[bits
];
703 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
706 /* Add final MINUS to terminate unicodeMode */
707 if(target
<targetLimit
) {
710 *offsets
++=sourceIndex
-1;
713 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++]=MINUS
;
714 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
717 /* reset the state for the next conversion */
718 cnv
->fromUnicodeStatus
=(cnv
->fromUnicodeStatus
&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
720 /* set the converter state back into UConverter */
721 cnv
->fromUnicodeStatus
=
722 (cnv
->fromUnicodeStatus
&0xf0000000)| /* keep version*/
723 ((uint32_t)inDirectMode
<<24)|((uint32_t)base64Counter
<<16)|(uint32_t)bits
;
726 /* write back the updated pointers */
727 pArgs
->source
=source
;
728 pArgs
->target
=(char *)target
;
729 pArgs
->offsets
=offsets
;
/*
 * Returns the converter name selected by the version nibble stored in
 * bits 31..28 of cnv->fromUnicodeStatus; one of the cases yields the
 * literal "UTF-7,version=1".
 */
734 _UTF7GetName(const UConverter
*cnv
) {
735 switch(cnv
->fromUnicodeStatus
>>28) {
737 return "UTF-7,version=1";
/*
 * Implementation table for the UTF-7 converter. The WithOffsets functions
 * are listed twice per direction (presumably filling both the plain and the
 * with-offsets slots of UConverterImpl - confirm against its declaration).
 */
743 static const UConverterImpl _UTF7Impl
={
753 _UTF7ToUnicodeWithOffsets
,
754 _UTF7ToUnicodeWithOffsets
,
755 _UTF7FromUnicodeWithOffsets
,
756 _UTF7FromUnicodeWithOffsets
,
761 NULL
, /* we don't need writeSub() because we never call a callback at fromUnicode() */
763 ucnv_getCompleteUnicodeSet
/*
 * Static descriptor for the UTF-7 converter. The first field is the size of
 * the structure itself; the substitution character bytes are present but,
 * per the inline comment, not used by this converter.
 */
766 static const UConverterStaticData _UTF7StaticData
={
767 sizeof(UConverterStaticData
),
769 0, /* TODO CCSID for UTF-7 */
772 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
776 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared-data object for UTF-7, built from the static data and the implementation table above. */
779 const UConverterSharedData _UTF7Data
=
780 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData
, &_UTF7Impl
);
782 /* IMAP mailbox name encoding ----------------------------------------------- */
785 * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
786 * http://www.ietf.org/rfc/rfc2060.txt
788 * 5.1.3. Mailbox International Naming Convention
790 * By convention, international mailbox names are specified using a
791 * modified version of the UTF-7 encoding described in [UTF-7]. The
792 * purpose of these modifications is to correct the following problems
795 * 1) UTF-7 uses the "+" character for shifting; this conflicts with
796 * the common use of "+" in mailbox names, in particular USENET
799 * 2) UTF-7's encoding is BASE64 which uses the "/" character; this
800 * conflicts with the use of "/" as a popular hierarchy delimiter.
802 * 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
803 * the use of "\" as a popular hierarchy delimiter.
805 * 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
806 * the use of "~" in some servers as a home directory indicator.
808 * 5) UTF-7 permits multiple alternate forms to represent the same
809 * string; in particular, printable US-ASCII characters can be
810 * represented in encoded form.
812 * In modified UTF-7, printable US-ASCII characters except for "&"
813 * represent themselves; that is, characters with octet values 0x20-0x25
814 * and 0x27-0x7e. The character "&" (0x26) is represented by the two-
815 * octet sequence "&-".
817 * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
818 * Unicode 16-bit octets) are represented in modified BASE64, with a
819 * further modification from [UTF-7] that "," is used instead of "/".
820 * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
821 * character which can represent itself.
823 * "&" is used to shift to modified BASE64 and "-" to shift back to US-
824 * ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that
825 * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
828 * For example, here is a mailbox name which mixes English, Japanese,
829 * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
833 * Tests for US-ASCII characters belonging to character classes
836 * Set D (directly encoded characters) consists of the following
837 * characters: the upper and lower case letters A through Z
838 * and a through z, the 10 digits 0-9, and the following nine special
839 * characters (note that "+" and "=" are omitted):
842 * Set O (optional direct characters) consists of the following
843 * characters (note that "\" and "~" are omitted):
844 * !"#$%&*;<=>@[]^_`{|}
846 * According to the rules in RFC 2152, the byte values for the following
847 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
848 * - all C0 control codes except for CR LF TAB
852 * - all codes beyond US-ASCII, i.e. all >127
/* IMAP mailbox names use '&' (not '+') to start a base64 sequence. */
#define AMPERSAND 0x26

/*
 * Legal byte values: space and all US-ASCII graphic characters, 0x20..0x7e.
 * The argument is evaluated twice; do not pass expressions with side effects.
 */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/*
 * Directly encoded bytes: all of printable ASCII 0x20..0x7e except '&' (0x26).
 * Every use of the parameter is parenthesized so that an expression argument
 * such as (x|y) or (a ? b : c) is compared against AMPERSAND as a whole
 * instead of being re-associated by operator precedence.
 * The argument is evaluated more than once.
 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)
/*
 * Modified base64 for IMAP: value 63 maps to COMMA instead of the character
 * that toBase64[] holds for it; values below 63 use toBase64[] unchanged.
 */
866 #define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
/*
 * Reverse mapping: COMMA decodes to 63 and SLASH yields -1 (not a base64
 * digit here); every other byte is looked up in fromBase64[].
 * The argument is evaluated more than once.
 */
867 #define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
870 * converter status values:
873 * 24 inDirectMode (boolean)
874 * 23..16 base64Counter (-1..7)
875 * 15..0 bits (up to 14 bits incoming base64)
878 * 24 inDirectMode (boolean)
879 * 23..16 base64Counter (0..2)
880 * 7..0 bits (6 bits outgoing base64)
/*
 * Converts IMAP-mailbox-name (modified UTF-7) bytes to UChars and records a
 * source offset per UChar.
 *
 * pArgs      - supplies converter, source/sourceLimit, target/targetLimit,
 *              offsets and flush; source, target and offsets are written
 *              back on exit
 * pErrorCode - receives U_ILLEGAL_CHAR_FOUND for malformed input,
 *              U_BUFFER_OVERFLOW_ERROR when the target is full, and
 *              U_TRUNCATED_CHAR_FOUND when flushed input ends in Unicode mode
 *
 * The decoder state (inDirectMode, base64Counter, bits) is unpacked from
 * cnv->toUnicodeStatus on entry and packed back on exit; the number of
 * pending input bytes is carried in cnv->toULength.
 */
886 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
887 UErrorCode
*pErrorCode
) {
889 const uint8_t *source
, *sourceLimit
;
891 const UChar
*targetLimit
;
897 int32_t length
, targetCapacity
;
901 int8_t base64Counter
;
906 int32_t sourceIndex
, nextSourceIndex
;
911 /* set up the local pointers */
912 cnv
=pArgs
->converter
;
914 source
=(const uint8_t *)pArgs
->source
;
915 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
916 target
=pArgs
->target
;
917 targetLimit
=pArgs
->targetLimit
;
918 offsets
=pArgs
->offsets
;
919 /* get the state machine state */
/* layout: bit 24 = inDirectMode, bits 23..16 = base64Counter (read as signed 8-bit), bits 15..0 = bits */
921 uint32_t status
=cnv
->toUnicodeStatus
;
922 inDirectMode
=(UBool
)((status
>>24)&1);
923 base64Counter
=(int8_t)(status
>>16);
924 bits
=(uint16_t)status
;
927 byteIndex
=cnv
->toULength
;
929 /* sourceIndex=-1 if the current character began in the previous buffer */
930 sourceIndex
=byteIndex
==0 ? 0 : -1;
936 * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
937 * with their US-ASCII byte values.
938 * An ampersand starts Unicode (or "escape") Mode.
940 * In Direct Mode, only the sourceIndex is used.
/* direct mode produces one UChar per byte, so the run length is capped by the target capacity */
943 length
=(int32_t)(sourceLimit
-source
);
944 targetCapacity
=(int32_t)(targetLimit
-target
);
945 if(length
>targetCapacity
) {
946 length
=targetCapacity
;
950 if(!isLegalIMAP(b
)) {
954 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
956 } else if(b
!=AMPERSAND
) {
957 /* write directly encoded character */
960 *offsets
++=sourceIndex
++;
962 } else /* AMPERSAND */ {
963 /* switch to Unicode mode */
964 nextSourceIndex
=++sourceIndex
;
973 if(source
<sourceLimit
&& target
>=targetLimit
) {
975 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
980 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
981 * The base64 sequence ends with any character that is not in the base64 alphabet.
982 * A terminating minus sign is consumed.
983 * US-ASCII must not be base64-ed.
985 * In Unicode Mode, the sourceIndex has the index to the start of the current
986 * base64 bytes, while nextSourceIndex is precisely parallel to source,
987 * keeping the index to the following byte.
988 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
990 while(source
<sourceLimit
) {
991 if(target
<targetLimit
) {
/* remember the byte in bytes[] so that an error can report the sequence */
992 bytes
[byteIndex
++]=b
=*source
++;
995 /* illegal - test other illegal US-ASCII values by base64Value==-3 */
997 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
999 } else if((base64Value
=FROM_BASE64_IMAP(b
))>=0) {
1000 /* collect base64 bytes into UChars */
1001 switch(base64Counter
) {
1002 case -1: /* -1 is immediately after the & */
/* not enough bits for a UChar yet: append the 6 new bits */
1011 bits
=(uint16_t)((bits
<<6)|base64Value
);
/* UChar completed with the top 4 bits of this base64 value */
1015 c
=(UChar
)((bits
<<4)|(base64Value
>>2));
/* a decoded UChar in 0x20..0x7e is an error: such characters must appear directly, not in base64 */
1016 if(isLegalIMAP(c
)) {
1019 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1024 *offsets
++=sourceIndex
;
1025 sourceIndex
=nextSourceIndex
-1;
1027 bytes
[0]=b
; /* keep this byte in case an error occurs */
1029 bits
=(uint16_t)(base64Value
&3);
/* UChar completed with the top 2 bits of this base64 value */
1033 c
=(UChar
)((bits
<<2)|(base64Value
>>4));
1034 if(isLegalIMAP(c
)) {
1037 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1042 *offsets
++=sourceIndex
;
1043 sourceIndex
=nextSourceIndex
-1;
1045 bytes
[0]=b
; /* keep this byte in case an error occurs */
1047 bits
=(uint16_t)(base64Value
&15);
/* UChar completed with all 6 bits of this base64 value */
1051 c
=(UChar
)((bits
<<6)|base64Value
);
1052 if(isLegalIMAP(c
)) {
1055 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1060 *offsets
++=sourceIndex
;
1061 sourceIndex
=nextSourceIndex
;
1068 /* will never occur */
1071 } else if(base64Value
==-2) {
1072 /* minus sign terminates the base64 sequence */
1074 if(base64Counter
==-1) {
1075 /* &- i.e. a minus immediately following an ampersand */
1076 *target
++=AMPERSAND
;
1078 *offsets
++=sourceIndex
-1;
1081 /* absorb the minus and leave the Unicode Mode */
1082 if(bits
!=0 || (base64Counter
!=0 && base64Counter
!=3 && base64Counter
!=6)) {
1083 /* bits are illegally left over, a UChar is incomplete */
1084 /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1085 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1089 sourceIndex
=nextSourceIndex
;
1092 if(base64Counter
==-1) {
1093 /* illegal: & immediately followed by something other than base64 or minus sign */
1094 /* include the ampersand in the reported sequence */
1100 /* base64Value==-1 for characters that are illegal only in Unicode mode */
1101 /* base64Value==-3 for illegal characters */
1104 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1108 /* target is full */
1109 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1117 * the end of the input stream and detection of truncated input
1118 * are handled by the framework, but here we must check if we are in Unicode
1119 * mode and byteIndex==0 because we must end in direct mode
1123 * in Unicode mode and byteIndex==0
1124 * end of input and no truncated input
1126 if( U_SUCCESS(*pErrorCode
) &&
1127 !inDirectMode
&& byteIndex
==0 &&
1128 pArgs
->flush
&& source
>=sourceLimit
1130 if(base64Counter
==-1) {
1131 /* & at the very end of the input */
1132 /* make the ampersand the reported sequence */
1136 /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1138 inDirectMode
=TRUE
; /* avoid looping */
1139 *pErrorCode
=U_TRUNCATED_CHAR_FOUND
;
1142 /* set the converter state back into UConverter */
/* base64Counter goes through uint8_t so that -1 does not sign-extend into bits 31..24 */
1143 cnv
->toUnicodeStatus
=((uint32_t)inDirectMode
<<24)|((uint32_t)((uint8_t)base64Counter
)<<16)|(uint32_t)bits
;
1144 cnv
->toULength
=byteIndex
;
1146 /* write back the updated pointers */
1147 pArgs
->source
=(const char *)source
;
1148 pArgs
->target
=target
;
1149 pArgs
->offsets
=offsets
;
/*
 * Converts UChars to IMAP-mailbox-name (modified UTF-7) bytes and records a
 * source offset per output byte.
 *
 * pArgs      - supplies converter, source/sourceLimit, target/targetLimit,
 *              offsets and flush; source, target and offsets are written
 *              back on exit
 * pErrorCode - receives U_BUFFER_OVERFLOW_ERROR when output does not fit
 *
 * Output bytes that do not fit into the target are stored in
 * cnv->charErrorBuffer (with cnv->charErrorBufferLength) before the overflow
 * error is set. Base64 digits are produced with TO_BASE64_IMAP, i.e. with
 * COMMA for the value 63. The encoder state (inDirectMode, base64Counter,
 * bits) lives in cnv->fromUnicodeStatus.
 */
1154 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
1155 UErrorCode
*pErrorCode
) {
1157 const UChar
*source
, *sourceLimit
;
1158 uint8_t *target
, *targetLimit
;
1161 int32_t length
, targetCapacity
, sourceIndex
;
1167 int8_t base64Counter
;
1170 /* set up the local pointers */
1171 cnv
=pArgs
->converter
;
1173 /* set up the local pointers */
1174 source
=pArgs
->source
;
1175 sourceLimit
=pArgs
->sourceLimit
;
1176 target
=(uint8_t *)pArgs
->target
;
1177 targetLimit
=(uint8_t *)pArgs
->targetLimit
;
1178 offsets
=pArgs
->offsets
;
1180 /* get the state machine state */
/* layout: bit 24 = inDirectMode, bits 23..16 = base64Counter, bits 7..0 = bits */
1182 uint32_t status
=cnv
->fromUnicodeStatus
;
1183 inDirectMode
=(UBool
)((status
>>24)&1);
1184 base64Counter
=(int8_t)(status
>>16);
1185 bits
=(uint8_t)status
;
1188 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
/* the direct-mode run is capped by the remaining target capacity */
1193 length
=(int32_t)(sourceLimit
-source
);
1194 targetCapacity
=(int32_t)(targetLimit
-target
);
1195 if(length
>targetCapacity
) {
1196 length
=targetCapacity
;
1200 /* encode 0x20..0x7e except '&' directly */
1202 /* encode directly */
1203 *target
++=(uint8_t)c
;
1205 *offsets
++=sourceIndex
++;
1207 } else if(c
==AMPERSAND
) {
1208 /* output &- for & */
1209 *target
++=AMPERSAND
;
1210 if(target
<targetLimit
) {
1213 *offsets
++=sourceIndex
;
1214 *offsets
++=sourceIndex
++;
1216 /* realign length and targetCapacity */
1220 *offsets
++=sourceIndex
++;
/* no room for the '-': hand it to the caller through the overflow buffer */
1222 cnv
->charErrorBuffer
[0]=MINUS
;
1223 cnv
->charErrorBufferLength
=1;
1224 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1228 /* un-read this character and switch to Unicode Mode */
1230 *target
++=AMPERSAND
;
1232 *offsets
++=sourceIndex
;
1240 if(source
<sourceLimit
&& target
>=targetLimit
) {
1241 /* target is full */
1242 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1246 while(source
<sourceLimit
) {
1247 if(target
<targetLimit
) {
/* any character in 0x20..0x7e (including '&') ends the base64 run and is handled in direct mode */
1249 if(isLegalIMAP(c
)) {
1250 /* encode directly */
1253 /* trick: back out this character to make this easier */
1256 /* terminate the base64 sequence */
1257 if(base64Counter
!=0) {
1258 /* write remaining bits for the previous character */
1259 *target
++=TO_BASE64_IMAP(bits
);
1261 *offsets
++=sourceIndex
-1;
1264 /* need to terminate with a minus */
1265 if(target
<targetLimit
) {
1268 *offsets
++=sourceIndex
-1;
1271 cnv
->charErrorBuffer
[0]=MINUS
;
1272 cnv
->charErrorBufferLength
=1;
1273 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1279 * base64 this character:
1280 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1281 * and the bits of this character, each implicitly in UTF-16BE.
1283 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1284 * character to the next. The actual 2 or 4 bits are shifted to the left edge
1285 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1287 switch(base64Counter
) {
/* no bits pending: two digits are emitted (the second is c bits 9..4), c bits 3..0 are kept in bits */
1290 *target
++=TO_BASE64_IMAP(b
);
1291 if(target
<targetLimit
) {
1292 b
=(uint8_t)((c
>>4)&0x3f);
1293 *target
++=TO_BASE64_IMAP(b
);
1295 *offsets
++=sourceIndex
;
1296 *offsets
++=sourceIndex
++;
1300 *offsets
++=sourceIndex
++;
1302 b
=(uint8_t)((c
>>4)&0x3f);
1303 cnv
->charErrorBuffer
[0]=TO_BASE64_IMAP(b
);
1304 cnv
->charErrorBufferLength
=1;
1305 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1307 bits
=(uint8_t)((c
&15)<<2);
/* 4 bits pending: emit pending|c bits 15..14, then bits 13..8 and 7..2, keep bits 1..0 */
1311 b
=(uint8_t)(bits
|(c
>>14));
1312 *target
++=TO_BASE64_IMAP(b
);
1313 if(target
<targetLimit
) {
1314 b
=(uint8_t)((c
>>8)&0x3f);
1315 *target
++=TO_BASE64_IMAP(b
);
1316 if(target
<targetLimit
) {
1317 b
=(uint8_t)((c
>>2)&0x3f);
1318 *target
++=TO_BASE64_IMAP(b
);
1320 *offsets
++=sourceIndex
;
1321 *offsets
++=sourceIndex
;
1322 *offsets
++=sourceIndex
++;
1326 *offsets
++=sourceIndex
;
1327 *offsets
++=sourceIndex
++;
1329 b
=(uint8_t)((c
>>2)&0x3f);
1330 cnv
->charErrorBuffer
[0]=TO_BASE64_IMAP(b
);
1331 cnv
->charErrorBufferLength
=1;
1332 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1336 *offsets
++=sourceIndex
++;
1338 b
=(uint8_t)((c
>>8)&0x3f);
1339 cnv
->charErrorBuffer
[0]=TO_BASE64_IMAP(b
);
1340 b
=(uint8_t)((c
>>2)&0x3f);
1341 cnv
->charErrorBuffer
[1]=TO_BASE64_IMAP(b
);
1342 cnv
->charErrorBufferLength
=2;
1343 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1345 bits
=(uint8_t)((c
&3)<<4);
/* 2 bits pending: emit pending|c bits 15..12, then bits 11..6 and 5..0; nothing is left over */
1349 b
=(uint8_t)(bits
|(c
>>12));
1350 *target
++=TO_BASE64_IMAP(b
);
1351 if(target
<targetLimit
) {
1352 b
=(uint8_t)((c
>>6)&0x3f);
1353 *target
++=TO_BASE64_IMAP(b
);
1354 if(target
<targetLimit
) {
1355 b
=(uint8_t)(c
&0x3f);
1356 *target
++=TO_BASE64_IMAP(b
);
1358 *offsets
++=sourceIndex
;
1359 *offsets
++=sourceIndex
;
1360 *offsets
++=sourceIndex
++;
1364 *offsets
++=sourceIndex
;
1365 *offsets
++=sourceIndex
++;
1367 b
=(uint8_t)(c
&0x3f);
1368 cnv
->charErrorBuffer
[0]=TO_BASE64_IMAP(b
);
1369 cnv
->charErrorBufferLength
=1;
1370 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1374 *offsets
++=sourceIndex
++;
1376 b
=(uint8_t)((c
>>6)&0x3f);
1377 cnv
->charErrorBuffer
[0]=TO_BASE64_IMAP(b
);
1378 b
=(uint8_t)(c
&0x3f);
1379 cnv
->charErrorBuffer
[1]=TO_BASE64_IMAP(b
);
1380 cnv
->charErrorBufferLength
=2;
1381 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1387 /* will never occur */
1392 /* target is full */
1393 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/* end of input with flush requested: close any open base64 run so that the name ends in direct mode */
1399 if(pArgs
->flush
&& source
>=sourceLimit
) {
1400 /* flush remaining bits to the target */
1402 if(base64Counter
!=0) {
1403 if(target
<targetLimit
) {
1404 *target
++=TO_BASE64_IMAP(bits
);
1406 *offsets
++=sourceIndex
-1;
/* appended at the current charErrorBufferLength because an earlier step may already have stored bytes there */
1409 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++]=TO_BASE64_IMAP(bits
);
1410 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1413 /* need to terminate with a minus */
1414 if(target
<targetLimit
) {
1417 *offsets
++=sourceIndex
-1;
1420 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++]=MINUS
;
1421 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1424 /* reset the state for the next conversion */
1425 cnv
->fromUnicodeStatus
=(cnv
->fromUnicodeStatus
&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1427 /* set the converter state back into UConverter */
1428 cnv
->fromUnicodeStatus
=
1429 (cnv
->fromUnicodeStatus
&0xf0000000)| /* keep version*/
1430 ((uint32_t)inDirectMode
<<24)|((uint32_t)base64Counter
<<16)|(uint32_t)bits
;
1433 /* write back the updated pointers */
1434 pArgs
->source
=source
;
1435 pArgs
->target
=(char *)target
;
1436 pArgs
->offsets
=offsets
;
/*
 * Implementation table for the IMAP-mailbox-name converter. The WithOffsets
 * functions are listed twice per direction (presumably filling both the
 * plain and the with-offsets slots of UConverterImpl - confirm against its
 * declaration).
 */
1440 static const UConverterImpl _IMAPImpl
={
1450 _IMAPToUnicodeWithOffsets
,
1451 _IMAPToUnicodeWithOffsets
,
1452 _IMAPFromUnicodeWithOffsets
,
1453 _IMAPFromUnicodeWithOffsets
,
1458 NULL
, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1460 ucnv_getCompleteUnicodeSet
/*
 * Static descriptor for the converter named "IMAP-mailbox-name", with
 * platform UCNV_IBM and type UCNV_IMAP_MAILBOX. The substitution character
 * bytes are present but, per the inline comment, not used by this converter.
 */
1463 static const UConverterStaticData _IMAPStaticData
={
1464 sizeof(UConverterStaticData
),
1465 "IMAP-mailbox-name",
1466 0, /* TODO CCSID for IMAP-mailbox-name */
1467 UCNV_IBM
, UCNV_IMAP_MAILBOX
,
1469 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1473 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared-data object for IMAP-mailbox-name, built from the static data and the implementation table above. */
1476 const UConverterSharedData _IMAPData
=
1477 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData
, &_IMAPImpl
);