2 **********************************************************************
3 * Copyright (C) 2002-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 * tab size: 8 (not used)
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
14 * UTF-7 converter implementation. Used to be in ucnv_utf.c.
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_CONVERSION
21 #include "unicode/ucnv.h"
26 /* UTF-7 -------------------------------------------------------------------- */
29 * UTF-7 is a stateful encoding of Unicode.
30 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
31 * It was intended for use in Internet email systems, using in its bytewise
32 * encoding only a subset of 7-bit US-ASCII.
33 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
36 * For converting Unicode to UTF-7, the RFC allows some US-ASCII characters
37 * to be encoded directly or in base64. In particular, the characters in set O
38 * as defined in the RFC (see below) may be encoded directly but are not
39 * allowed in, e.g., email headers.
40 * By default, the ICU UTF-7 converter encodes set O directly.
41 * By choosing the option "version=1", set O will be escaped instead.
43 * utf7Converter=ucnv_open("UTF-7,version=1");
45 * For details about email headers see RFC 2047.
49 * Tests for US-ASCII characters belonging to character classes
52 * Set D (directly encoded characters) consists of the following
53 * characters: the upper and lower case letters A through Z
54 * and a through z, the 10 digits 0-9, and the following nine special
55 * characters (note that "+" and "=" are omitted):
58 * Set O (optional direct characters) consists of the following
59 * characters (note that "\" and "~" are omitted):
60 * !"#$%&*;<=>@[]^_`{|}
62 * According to the rules in RFC 2152, the byte values for the following
63 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
64 * - all C0 control codes except for CR LF TAB
68 * - all codes beyond US-ASCII, i.e. all >127
71 ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \
72 (uint8_t)((c)-48)<10 || /* digits */ \
73 (uint8_t)((c)-39)<3 || /* '() */ \
74 (uint8_t)((c)-44)<4 || /* ,-./ */ \
75 (c)==58 || (c)==63 /* :? */ \
79 ((uint8_t)((c)-33)<6 || /* !"#$%& */ \
80 (uint8_t)((c)-59)<4 || /* ;<=> */ \
81 (uint8_t)((c)-93)<4 || /* ]^_` */ \
82 (uint8_t)((c)-123)<3 || /* {|} */ \
83 (c)==42 || (c)==64 || (c)==91 /* *@[ */ \
/* Nonzero when c is CR (13), LF (10) or TAB (9). The argument may be evaluated up to three times. */
86 #define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9)
/* Nonzero when c is SPACE (32), CR (13), LF (10) or TAB (9). The argument may be evaluated up to four times. */
87 #define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9)
94 /*
 * legal byte values: US-ASCII 32..125 (space up to, but not including, tilde)
 * except BACKSLASH, plus CR LF TAB.
 * The (uint8_t) cast folds the "32<=c && c<=125" range test into one unsigned compare.
 */
95 #define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
97 /* encode directly sets D and O and CR LF SP TAB */
98 static const UBool encodeDirectlyMaximum
[128]={
99 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
104 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
106 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
107 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
109 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
110 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
113 /* encode directly set D and CR LF SP TAB but not set O */
114 static const UBool encodeDirectlyRestricted
[128]={
115 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
116 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
119 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
120 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
122 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
123 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
125 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
126 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
132 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
133 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
135 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
136 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
138 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
145 /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
146 -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
147 -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
149 /* general punctuation with + and / and a special value (-2) for - */
150 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
152 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
155 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
156 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,
159 -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
160 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
164 * converter status values:
167 * 24 inDirectMode (boolean)
168 * 23..16 base64Counter (-1..7)
169 * 15..0 bits (up to 14 bits incoming base64)
172 * 31..28 version (0: set O direct 1: set O escaped)
173 * 24 inDirectMode (boolean)
174 * 23..16 base64Counter (0..2)
175 * 7..0 bits (6 bits outgoing base64)
180 _UTF7Reset(UConverter
*cnv
, UConverterResetChoice choice
) {
181 if(choice
<=UCNV_RESET_TO_UNICODE
) {
182 /* reset toUnicode */
183 cnv
->toUnicodeStatus
=0x1000000; /* inDirectMode=TRUE */
186 if(choice
!=UCNV_RESET_TO_UNICODE
) {
187 /* reset fromUnicode */
188 cnv
->fromUnicodeStatus
=(cnv
->fromUnicodeStatus
&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
193 _UTF7Open(UConverter
*cnv
,
194 UConverterLoadArgs
*pArgs
,
195 UErrorCode
*pErrorCode
) {
196 if(UCNV_GET_VERSION(cnv
)<=1) {
197 /* TODO(markus): Should just use cnv->options rather than copying the version number. */
198 cnv
->fromUnicodeStatus
=UCNV_GET_VERSION(cnv
)<<28;
199 _UTF7Reset(cnv
, UCNV_RESET_BOTH
);
201 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
206 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
207 UErrorCode
*pErrorCode
) {
209 const uint8_t *source
, *sourceLimit
;
211 const UChar
*targetLimit
;
217 int32_t length
, targetCapacity
;
221 int8_t base64Counter
;
226 int32_t sourceIndex
, nextSourceIndex
;
229 /* set up the local pointers */
230 cnv
=pArgs
->converter
;
232 source
=(const uint8_t *)pArgs
->source
;
233 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
234 target
=pArgs
->target
;
235 targetLimit
=pArgs
->targetLimit
;
236 offsets
=pArgs
->offsets
;
237 /* get the state machine state */
239 uint32_t status
=cnv
->toUnicodeStatus
;
240 inDirectMode
=(UBool
)((status
>>24)&1);
241 base64Counter
=(int8_t)(status
>>16);
242 bits
=(uint16_t)status
;
245 byteIndex
=cnv
->toULength
;
247 /* sourceIndex=-1 if the current character began in the previous buffer */
248 sourceIndex
=byteIndex
==0 ? 0 : -1;
254 * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
255 * with their US-ASCII byte values.
256 * Backslash and Tilde and most control characters are not allowed in UTF-7.
257 * A plus sign starts Unicode (or "escape") Mode.
259 * In Direct Mode, only the sourceIndex is used.
262 length
=(int32_t)(sourceLimit
-source
);
263 targetCapacity
=(int32_t)(targetLimit
-target
);
264 if(length
>targetCapacity
) {
265 length
=targetCapacity
;
269 if(!isLegalUTF7(b
)) {
273 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
276 /* write directly encoded character */
279 *offsets
++=sourceIndex
++;
282 /* switch to Unicode mode */
283 nextSourceIndex
=++sourceIndex
;
292 if(source
<sourceLimit
&& target
>=targetLimit
) {
294 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
299 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
300 * The base64 sequence ends with any character that is not in the base64 alphabet.
301 * A terminating minus sign is consumed.
303 * In Unicode Mode, the sourceIndex has the index to the start of the current
304 * base64 bytes, while nextSourceIndex is precisely parallel to source,
305 * keeping the index to the following byte.
306 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
308 while(source
<sourceLimit
) {
309 if(target
<targetLimit
) {
310 bytes
[byteIndex
++]=b
=*source
++;
312 base64Value
= -3; /* initialize as illegal */
313 if(b
>=126 || (base64Value
=fromBase64
[b
])==-3 || base64Value
==-1) {
315 * base64Value==-1 for any legal character except base64 and minus sign, or
316 * base64Value==-3 for illegal characters:
317 * 1. In either case, leave Unicode mode.
318 * 2.1. If we ended with an incomplete UChar or none after the +, then
319 * generate an error for the preceding erroneous sequence and deal with
320 * the current (possibly illegal) character next time through.
321 * 2.2. Else the current char comes after a complete UChar, which was already
322 * pushed to the output buf, so:
323 * 2.2.1. If the current char is legal, just save it for processing next time.
324 * It may be for example, a plus which we need to deal with in direct mode.
325 * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
328 if(base64Counter
==-1) {
329 /* illegal: + immediately followed by something other than base64 or minus sign */
330 /* include the plus sign in the reported sequence, but not the subsequent char */
334 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
337 /* bits are illegally left over, a UChar is incomplete */
338 /* don't include current char (legal or illegal) in error seq */
341 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
344 /* previous UChar was complete */
345 if(base64Value
==-3) {
346 /* current character is illegal, deal with it here */
347 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
350 /* un-read the current character in case it is a plus sign */
352 sourceIndex
=nextSourceIndex
-1;
356 } else if(base64Value
>=0) {
357 /* collect base64 bytes into UChars */
358 switch(base64Counter
) {
359 case -1: /* -1 is immediately after the + */
368 bits
=(uint16_t)((bits
<<6)|base64Value
);
372 *target
++=(UChar
)((bits
<<4)|(base64Value
>>2));
374 *offsets
++=sourceIndex
;
375 sourceIndex
=nextSourceIndex
-1;
377 bytes
[0]=b
; /* keep this byte in case an error occurs */
379 bits
=(uint16_t)(base64Value
&3);
383 *target
++=(UChar
)((bits
<<2)|(base64Value
>>4));
385 *offsets
++=sourceIndex
;
386 sourceIndex
=nextSourceIndex
-1;
388 bytes
[0]=b
; /* keep this byte in case an error occurs */
390 bits
=(uint16_t)(base64Value
&15);
394 *target
++=(UChar
)((bits
<<6)|base64Value
);
396 *offsets
++=sourceIndex
;
397 sourceIndex
=nextSourceIndex
;
404 /* will never occur */
407 } else /*base64Value==-2*/ {
408 /* minus sign terminates the base64 sequence */
410 if(base64Counter
==-1) {
411 /* +- i.e. a minus immediately following a plus */
414 *offsets
++=sourceIndex
-1;
417 /* absorb the minus and leave the Unicode Mode */
419 /* bits are illegally left over, a UChar is incomplete */
420 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
424 sourceIndex
=nextSourceIndex
;
429 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
435 if(U_SUCCESS(*pErrorCode
) && pArgs
->flush
&& source
==sourceLimit
&& bits
==0) {
437 * if we are in Unicode mode, then the byteIndex might not be 0,
438 * but that is ok if bits==0
439 * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
440 * (not true for IMAP-mailbox-name where we must end in direct mode)
445 /* set the converter state back into UConverter */
446 cnv
->toUnicodeStatus
=((uint32_t)inDirectMode
<<24)|((uint32_t)((uint8_t)base64Counter
)<<16)|(uint32_t)bits
;
447 cnv
->toULength
=byteIndex
;
449 /* write back the updated pointers */
450 pArgs
->source
=(const char *)source
;
451 pArgs
->target
=target
;
452 pArgs
->offsets
=offsets
;
457 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
458 UErrorCode
*pErrorCode
) {
460 const UChar
*source
, *sourceLimit
;
461 uint8_t *target
, *targetLimit
;
464 int32_t length
, targetCapacity
, sourceIndex
;
468 const UBool
*encodeDirectly
;
470 int8_t base64Counter
;
473 /* set up the local pointers */
474 cnv
=pArgs
->converter
;
476 /* set up the local pointers */
477 source
=pArgs
->source
;
478 sourceLimit
=pArgs
->sourceLimit
;
479 target
=(uint8_t *)pArgs
->target
;
480 targetLimit
=(uint8_t *)pArgs
->targetLimit
;
481 offsets
=pArgs
->offsets
;
483 /* get the state machine state */
485 uint32_t status
=cnv
->fromUnicodeStatus
;
486 encodeDirectly
= status
<0x10000000 ? encodeDirectlyMaximum
: encodeDirectlyRestricted
;
487 inDirectMode
=(UBool
)((status
>>24)&1);
488 base64Counter
=(int8_t)(status
>>16);
489 bits
=(uint8_t)status
;
490 U_ASSERT(bits
<=sizeof(toBase64
)/sizeof(toBase64
[0]));
493 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
498 length
=(int32_t)(sourceLimit
-source
);
499 targetCapacity
=(int32_t)(targetLimit
-target
);
500 if(length
>targetCapacity
) {
501 length
=targetCapacity
;
505 /* currently always encode CR LF SP TAB directly */
506 if(c
<=127 && encodeDirectly
[c
]) {
507 /* encode directly */
508 *target
++=(uint8_t)c
;
510 *offsets
++=sourceIndex
++;
513 /* output +- for + */
515 if(target
<targetLimit
) {
518 *offsets
++=sourceIndex
;
519 *offsets
++=sourceIndex
++;
521 /* realign length and targetCapacity */
525 *offsets
++=sourceIndex
++;
527 cnv
->charErrorBuffer
[0]=MINUS
;
528 cnv
->charErrorBufferLength
=1;
529 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
533 /* un-read this character and switch to Unicode Mode */
537 *offsets
++=sourceIndex
;
545 if(source
<sourceLimit
&& target
>=targetLimit
) {
547 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
551 while(source
<sourceLimit
) {
552 if(target
<targetLimit
) {
554 if(c
<=127 && encodeDirectly
[c
]) {
555 /* encode directly */
558 /* trick: back out this character to make this easier */
561 /* terminate the base64 sequence */
562 if(base64Counter
!=0) {
563 /* write remaining bits for the previous character */
564 *target
++=toBase64
[bits
];
566 *offsets
++=sourceIndex
-1;
569 if(fromBase64
[c
]!=-1) {
570 /* need to terminate with a minus */
571 if(target
<targetLimit
) {
574 *offsets
++=sourceIndex
-1;
577 cnv
->charErrorBuffer
[0]=MINUS
;
578 cnv
->charErrorBufferLength
=1;
579 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
586 * base64 this character:
587 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
588 * and the bits of this character, each implicitly in UTF-16BE.
590 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
591 * character to the next. The actual 2 or 4 bits are shifted to the left edge
592 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
594 switch(base64Counter
) {
596 *target
++=toBase64
[c
>>10];
597 if(target
<targetLimit
) {
598 *target
++=toBase64
[(c
>>4)&0x3f];
600 *offsets
++=sourceIndex
;
601 *offsets
++=sourceIndex
++;
605 *offsets
++=sourceIndex
++;
607 cnv
->charErrorBuffer
[0]=toBase64
[(c
>>4)&0x3f];
608 cnv
->charErrorBufferLength
=1;
609 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
611 bits
=(uint8_t)((c
&15)<<2);
615 *target
++=toBase64
[bits
|(c
>>14)];
616 if(target
<targetLimit
) {
617 *target
++=toBase64
[(c
>>8)&0x3f];
618 if(target
<targetLimit
) {
619 *target
++=toBase64
[(c
>>2)&0x3f];
621 *offsets
++=sourceIndex
;
622 *offsets
++=sourceIndex
;
623 *offsets
++=sourceIndex
++;
627 *offsets
++=sourceIndex
;
628 *offsets
++=sourceIndex
++;
630 cnv
->charErrorBuffer
[0]=toBase64
[(c
>>2)&0x3f];
631 cnv
->charErrorBufferLength
=1;
632 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
636 *offsets
++=sourceIndex
++;
638 cnv
->charErrorBuffer
[0]=toBase64
[(c
>>8)&0x3f];
639 cnv
->charErrorBuffer
[1]=toBase64
[(c
>>2)&0x3f];
640 cnv
->charErrorBufferLength
=2;
641 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
643 bits
=(uint8_t)((c
&3)<<4);
647 *target
++=toBase64
[bits
|(c
>>12)];
648 if(target
<targetLimit
) {
649 *target
++=toBase64
[(c
>>6)&0x3f];
650 if(target
<targetLimit
) {
651 *target
++=toBase64
[c
&0x3f];
653 *offsets
++=sourceIndex
;
654 *offsets
++=sourceIndex
;
655 *offsets
++=sourceIndex
++;
659 *offsets
++=sourceIndex
;
660 *offsets
++=sourceIndex
++;
662 cnv
->charErrorBuffer
[0]=toBase64
[c
&0x3f];
663 cnv
->charErrorBufferLength
=1;
664 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
668 *offsets
++=sourceIndex
++;
670 cnv
->charErrorBuffer
[0]=toBase64
[(c
>>6)&0x3f];
671 cnv
->charErrorBuffer
[1]=toBase64
[c
&0x3f];
672 cnv
->charErrorBufferLength
=2;
673 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
679 /* will never occur */
685 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
691 if(pArgs
->flush
&& source
>=sourceLimit
) {
692 /* flush remaining bits to the target */
694 if (base64Counter
!=0) {
695 if(target
<targetLimit
) {
696 *target
++=toBase64
[bits
];
698 *offsets
++=sourceIndex
-1;
701 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++]=toBase64
[bits
];
702 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
705 /* Add final MINUS to terminate unicodeMode */
706 if(target
<targetLimit
) {
709 *offsets
++=sourceIndex
-1;
712 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++]=MINUS
;
713 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
716 /* reset the state for the next conversion */
717 cnv
->fromUnicodeStatus
=(cnv
->fromUnicodeStatus
&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
719 /* set the converter state back into UConverter */
720 cnv
->fromUnicodeStatus
=
721 (cnv
->fromUnicodeStatus
&0xf0000000)| /* keep version*/
722 ((uint32_t)inDirectMode
<<24)|((uint32_t)base64Counter
<<16)|(uint32_t)bits
;
725 /* write back the updated pointers */
726 pArgs
->source
=source
;
727 pArgs
->target
=(char *)target
;
728 pArgs
->offsets
=offsets
;
733 _UTF7GetName(const UConverter
*cnv
) {
734 switch(cnv
->fromUnicodeStatus
>>28) {
736 return "UTF-7,version=1";
742 static const UConverterImpl _UTF7Impl
={
752 _UTF7ToUnicodeWithOffsets
,
753 _UTF7ToUnicodeWithOffsets
,
754 _UTF7FromUnicodeWithOffsets
,
755 _UTF7FromUnicodeWithOffsets
,
760 NULL
, /* we don't need writeSub() because we never call a callback at fromUnicode() */
762 ucnv_getCompleteUnicodeSet
765 static const UConverterStaticData _UTF7StaticData
={
766 sizeof(UConverterStaticData
),
768 0, /* TODO CCSID for UTF-7 */
771 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
775 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
778 const UConverterSharedData _UTF7Data
={
779 sizeof(UConverterSharedData
), ~((uint32_t)0),
780 NULL
, NULL
, &_UTF7StaticData
, FALSE
, &_UTF7Impl
,
784 /* IMAP mailbox name encoding ----------------------------------------------- */
787 * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
788 * http://www.ietf.org/rfc/rfc2060.txt
790 * 5.1.3. Mailbox International Naming Convention
792 * By convention, international mailbox names are specified using a
793 * modified version of the UTF-7 encoding described in [UTF-7]. The
794 * purpose of these modifications is to correct the following problems
797 * 1) UTF-7 uses the "+" character for shifting; this conflicts with
798 * the common use of "+" in mailbox names, in particular USENET
801 * 2) UTF-7's encoding is BASE64 which uses the "/" character; this
802 * conflicts with the use of "/" as a popular hierarchy delimiter.
804 * 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
805 * the use of "\" as a popular hierarchy delimiter.
807 * 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
808 * the use of "~" in some servers as a home directory indicator.
810 * 5) UTF-7 permits multiple alternate forms to represent the same
811 * string; in particular, printable US-ASCII characters can be
812 * represented in encoded form.
814 * In modified UTF-7, printable US-ASCII characters except for "&"
815 * represent themselves; that is, characters with octet values 0x20-0x25
816 * and 0x27-0x7e. The character "&" (0x26) is represented by the two-
817 * octet sequence "&-".
819 * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
820 * Unicode 16-bit octets) are represented in modified BASE64, with a
821 * further modification from [UTF-7] that "," is used instead of "/".
822 * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
823 * character which can represent itself.
825 * "&" is used to shift to modified BASE64 and "-" to shift back to US-
826 * ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that
827 * is, a name that ends with a Unicode 16-bit octet MUST end with a "-").
830 * For example, here is a mailbox name which mixes English, Japanese,
831 * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
835 * Tests for US-ASCII characters belonging to character classes
838 * Set D (directly encoded characters) consists of the following
839 * characters: the upper and lower case letters A through Z
840 * and a through z, the 10 digits 0-9, and the following nine special
841 * characters (note that "+" and "=" are omitted):
844 * Set O (optional direct characters) consists of the following
845 * characters (note that "\" and "~" are omitted):
846 * !"#$%&*;<=>@[]^_`{|}
848 * According to the rules in RFC 2152, the byte values for the following
849 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
850 * - all C0 control codes except for CR LF TAB
854 * - all codes beyond US-ASCII, i.e. all >127
/* uses '&' not '+' to start a base64 sequence */
#define AMPERSAND 0x26

/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/*
 * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
 * The parameter is parenthesized everywhere so that an expression argument
 * (e.g. a|b) is compared as a whole rather than being split by operator precedence.
 * Note: the argument is evaluated more than once.
 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

/*
 * Map a 6-bit value to its output byte: values below 63 are looked up in toBase64[],
 * anything else yields COMMA (modified base64 uses "," where UTF-7 uses "/").
 */
#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
/*
 * Map an input byte to its base64 value: COMMA gives 63, SLASH gives -1,
 * every other byte is looked up in fromBase64[].
 * NOTE(review): c is used as an array index; callers presumably range-check it first -- confirm.
 */
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
872 * converter status values:
875 * 24 inDirectMode (boolean)
876 * 23..16 base64Counter (-1..7)
877 * 15..0 bits (up to 14 bits incoming base64)
880 * 24 inDirectMode (boolean)
881 * 23..16 base64Counter (0..2)
882 * 7..0 bits (6 bits outgoing base64)
888 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
889 UErrorCode
*pErrorCode
) {
891 const uint8_t *source
, *sourceLimit
;
893 const UChar
*targetLimit
;
899 int32_t length
, targetCapacity
;
903 int8_t base64Counter
;
908 int32_t sourceIndex
, nextSourceIndex
;
913 /* set up the local pointers */
914 cnv
=pArgs
->converter
;
916 source
=(const uint8_t *)pArgs
->source
;
917 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
918 target
=pArgs
->target
;
919 targetLimit
=pArgs
->targetLimit
;
920 offsets
=pArgs
->offsets
;
921 /* get the state machine state */
923 uint32_t status
=cnv
->toUnicodeStatus
;
924 inDirectMode
=(UBool
)((status
>>24)&1);
925 base64Counter
=(int8_t)(status
>>16);
926 bits
=(uint16_t)status
;
929 byteIndex
=cnv
->toULength
;
931 /* sourceIndex=-1 if the current character began in the previous buffer */
932 sourceIndex
=byteIndex
==0 ? 0 : -1;
938 * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
939 * with their US-ASCII byte values.
940 * An ampersand starts Unicode (or "escape") Mode.
942 * In Direct Mode, only the sourceIndex is used.
945 length
=(int32_t)(sourceLimit
-source
);
946 targetCapacity
=(int32_t)(targetLimit
-target
);
947 if(length
>targetCapacity
) {
948 length
=targetCapacity
;
952 if(!isLegalIMAP(b
)) {
956 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
958 } else if(b
!=AMPERSAND
) {
959 /* write directly encoded character */
962 *offsets
++=sourceIndex
++;
964 } else /* AMPERSAND */ {
965 /* switch to Unicode mode */
966 nextSourceIndex
=++sourceIndex
;
975 if(source
<sourceLimit
&& target
>=targetLimit
) {
977 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
982 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
983 * The base64 sequence ends with any character that is not in the base64 alphabet.
984 * A terminating minus sign is consumed.
985 * US-ASCII must not be base64-ed.
987 * In Unicode Mode, the sourceIndex has the index to the start of the current
988 * base64 bytes, while nextSourceIndex is precisely parallel to source,
989 * keeping the index to the following byte.
990 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
992 while(source
<sourceLimit
) {
993 if(target
<targetLimit
) {
994 bytes
[byteIndex
++]=b
=*source
++;
997 /* illegal - test other illegal US-ASCII values by base64Value==-3 */
999 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1001 } else if((base64Value
=FROM_BASE64_IMAP(b
))>=0) {
1002 /* collect base64 bytes into UChars */
1003 switch(base64Counter
) {
1004 case -1: /* -1 is immediately after the & */
1013 bits
=(uint16_t)((bits
<<6)|base64Value
);
1017 c
=(UChar
)((bits
<<4)|(base64Value
>>2));
1018 if(isLegalIMAP(c
)) {
1021 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1026 *offsets
++=sourceIndex
;
1027 sourceIndex
=nextSourceIndex
-1;
1029 bytes
[0]=b
; /* keep this byte in case an error occurs */
1031 bits
=(uint16_t)(base64Value
&3);
1035 c
=(UChar
)((bits
<<2)|(base64Value
>>4));
1036 if(isLegalIMAP(c
)) {
1039 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1044 *offsets
++=sourceIndex
;
1045 sourceIndex
=nextSourceIndex
-1;
1047 bytes
[0]=b
; /* keep this byte in case an error occurs */
1049 bits
=(uint16_t)(base64Value
&15);
1053 c
=(UChar
)((bits
<<6)|base64Value
);
1054 if(isLegalIMAP(c
)) {
1057 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1062 *offsets
++=sourceIndex
;
1063 sourceIndex
=nextSourceIndex
;
1070 /* will never occur */
1073 } else if(base64Value
==-2) {
1074 /* minus sign terminates the base64 sequence */
1076 if(base64Counter
==-1) {
1077 /* &- i.e. a minus immediately following an ampersand */
1078 *target
++=AMPERSAND
;
1080 *offsets
++=sourceIndex
-1;
1083 /* absorb the minus and leave the Unicode Mode */
1084 if(bits
!=0 || (base64Counter
!=0 && base64Counter
!=3 && base64Counter
!=6)) {
1085 /* bits are illegally left over, a UChar is incomplete */
1086 /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1087 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1091 sourceIndex
=nextSourceIndex
;
1094 if(base64Counter
==-1) {
1095 /* illegal: & immediately followed by something other than base64 or minus sign */
1096 /* include the ampersand in the reported sequence */
1102 /* base64Value==-1 for characters that are illegal only in Unicode mode */
1103 /* base64Value==-3 for illegal characters */
1106 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1110 /* target is full */
1111 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1119 * the end of the input stream and detection of truncated input
1120 * are handled by the framework, but here we must check if we are in Unicode
1121 * mode and byteIndex==0 because we must end in direct mode
1125 * in Unicode mode and byteIndex==0
1126 * end of input and no truncated input
1128 if( U_SUCCESS(*pErrorCode
) &&
1129 !inDirectMode
&& byteIndex
==0 &&
1130 pArgs
->flush
&& source
>=sourceLimit
1132 if(base64Counter
==-1) {
1133 /* & at the very end of the input */
1134 /* make the ampersand the reported sequence */
1138 /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1140 inDirectMode
=TRUE
; /* avoid looping */
1141 *pErrorCode
=U_TRUNCATED_CHAR_FOUND
;
1144 /* set the converter state back into UConverter */
1145 cnv
->toUnicodeStatus
=((uint32_t)inDirectMode
<<24)|((uint32_t)((uint8_t)base64Counter
)<<16)|(uint32_t)bits
;
1146 cnv
->toULength
=byteIndex
;
1148 /* write back the updated pointers */
1149 pArgs
->source
=(const char *)source
;
1150 pArgs
->target
=target
;
1151 pArgs
->offsets
=offsets
;
1156 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
1157 UErrorCode
*pErrorCode
) {
1159 const UChar
*source
, *sourceLimit
;
1160 uint8_t *target
, *targetLimit
;
1163 int32_t length
, targetCapacity
, sourceIndex
;
1169 int8_t base64Counter
;
1172 /* set up the local pointers */
1173 cnv
=pArgs
->converter
;
1175 /* set up the local pointers */
1176 source
=pArgs
->source
;
1177 sourceLimit
=pArgs
->sourceLimit
;
1178 target
=(uint8_t *)pArgs
->target
;
1179 targetLimit
=(uint8_t *)pArgs
->targetLimit
;
1180 offsets
=pArgs
->offsets
;
1182 /* get the state machine state */
1184 uint32_t status
=cnv
->fromUnicodeStatus
;
1185 inDirectMode
=(UBool
)((status
>>24)&1);
1186 base64Counter
=(int8_t)(status
>>16);
1187 bits
=(uint8_t)status
;
1190 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1195 length
=(int32_t)(sourceLimit
-source
);
1196 targetCapacity
=(int32_t)(targetLimit
-target
);
1197 if(length
>targetCapacity
) {
1198 length
=targetCapacity
;
1202 /* encode 0x20..0x7e except '&' directly */
1204 /* encode directly */
1205 *target
++=(uint8_t)c
;
1207 *offsets
++=sourceIndex
++;
1209 } else if(c
==AMPERSAND
) {
1210 /* output &- for & */
1211 *target
++=AMPERSAND
;
1212 if(target
<targetLimit
) {
1215 *offsets
++=sourceIndex
;
1216 *offsets
++=sourceIndex
++;
1218 /* realign length and targetCapacity */
1222 *offsets
++=sourceIndex
++;
1224 cnv
->charErrorBuffer
[0]=MINUS
;
1225 cnv
->charErrorBufferLength
=1;
1226 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1230 /* un-read this character and switch to Unicode Mode */
1232 *target
++=AMPERSAND
;
1234 *offsets
++=sourceIndex
;
1242 if(source
<sourceLimit
&& target
>=targetLimit
) {
1243 /* target is full */
1244 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1248 while(source
<sourceLimit
) {
1249 if(target
<targetLimit
) {
1251 if(isLegalIMAP(c
)) {
1252 /* encode directly */
1255 /* trick: back out this character to make this easier */
1258 /* terminate the base64 sequence */
1259 if(base64Counter
!=0) {
1260 /* write remaining bits for the previous character */
1261 *target
++=TO_BASE64_IMAP(bits
);
1263 *offsets
++=sourceIndex
-1;
1266 /* need to terminate with a minus */
1267 if(target
<targetLimit
) {
1270 *offsets
++=sourceIndex
-1;
1273 cnv
->charErrorBuffer
[0]=MINUS
;
1274 cnv
->charErrorBufferLength
=1;
1275 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1281 * base64 this character:
1282 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1283 * and the bits of this character, each implicitly in UTF-16BE.
1285 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1286 * character to the next. The actual 2 or 4 bits are shifted to the left edge
1287 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1289 switch(base64Counter
) {
1292 *target
++=TO_BASE64_IMAP(b
);
1293 if(target
<targetLimit
) {
1294 b
=(uint8_t)((c
>>4)&0x3f);
1295 *target
++=TO_BASE64_IMAP(b
);
1297 *offsets
++=sourceIndex
;
1298 *offsets
++=sourceIndex
++;
1302 *offsets
++=sourceIndex
++;
1304 b
=(uint8_t)((c
>>4)&0x3f);
1305 cnv
->charErrorBuffer
[0]=TO_BASE64_IMAP(b
);
1306 cnv
->charErrorBufferLength
=1;
1307 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1309 bits
=(uint8_t)((c
&15)<<2);
1313 b
=(uint8_t)(bits
|(c
>>14));
1314 *target
++=TO_BASE64_IMAP(b
);
1315 if(target
<targetLimit
) {
1316 b
=(uint8_t)((c
>>8)&0x3f);
1317 *target
++=TO_BASE64_IMAP(b
);
1318 if(target
<targetLimit
) {
1319 b
=(uint8_t)((c
>>2)&0x3f);
1320 *target
++=TO_BASE64_IMAP(b
);
1322 *offsets
++=sourceIndex
;
1323 *offsets
++=sourceIndex
;
1324 *offsets
++=sourceIndex
++;
1328 *offsets
++=sourceIndex
;
1329 *offsets
++=sourceIndex
++;
1331 b
=(uint8_t)((c
>>2)&0x3f);
1332 cnv
->charErrorBuffer
[0]=TO_BASE64_IMAP(b
);
1333 cnv
->charErrorBufferLength
=1;
1334 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1338 *offsets
++=sourceIndex
++;
1340 b
=(uint8_t)((c
>>8)&0x3f);
1341 cnv
->charErrorBuffer
[0]=TO_BASE64_IMAP(b
);
1342 b
=(uint8_t)((c
>>2)&0x3f);
1343 cnv
->charErrorBuffer
[1]=TO_BASE64_IMAP(b
);
1344 cnv
->charErrorBufferLength
=2;
1345 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1347 bits
=(uint8_t)((c
&3)<<4);
1351 b
=(uint8_t)(bits
|(c
>>12));
1352 *target
++=TO_BASE64_IMAP(b
);
1353 if(target
<targetLimit
) {
1354 b
=(uint8_t)((c
>>6)&0x3f);
1355 *target
++=TO_BASE64_IMAP(b
);
1356 if(target
<targetLimit
) {
1357 b
=(uint8_t)(c
&0x3f);
1358 *target
++=TO_BASE64_IMAP(b
);
1360 *offsets
++=sourceIndex
;
1361 *offsets
++=sourceIndex
;
1362 *offsets
++=sourceIndex
++;
1366 *offsets
++=sourceIndex
;
1367 *offsets
++=sourceIndex
++;
1369 b
=(uint8_t)(c
&0x3f);
1370 cnv
->charErrorBuffer
[0]=TO_BASE64_IMAP(b
);
1371 cnv
->charErrorBufferLength
=1;
1372 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1376 *offsets
++=sourceIndex
++;
1378 b
=(uint8_t)((c
>>6)&0x3f);
1379 cnv
->charErrorBuffer
[0]=TO_BASE64_IMAP(b
);
1380 b
=(uint8_t)(c
&0x3f);
1381 cnv
->charErrorBuffer
[1]=TO_BASE64_IMAP(b
);
1382 cnv
->charErrorBufferLength
=2;
1383 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1389 /* will never occur */
1394 /* target is full */
1395 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1401 if(pArgs
->flush
&& source
>=sourceLimit
) {
1402 /* flush remaining bits to the target */
1404 if(base64Counter
!=0) {
1405 if(target
<targetLimit
) {
1406 *target
++=TO_BASE64_IMAP(bits
);
1408 *offsets
++=sourceIndex
-1;
1411 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++]=TO_BASE64_IMAP(bits
);
1412 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1415 /* need to terminate with a minus */
1416 if(target
<targetLimit
) {
1419 *offsets
++=sourceIndex
-1;
1422 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++]=MINUS
;
1423 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1426 /* reset the state for the next conversion */
1427 cnv
->fromUnicodeStatus
=(cnv
->fromUnicodeStatus
&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1429 /* set the converter state back into UConverter */
1430 cnv
->fromUnicodeStatus
=
1431 (cnv
->fromUnicodeStatus
&0xf0000000)| /* keep version*/
1432 ((uint32_t)inDirectMode
<<24)|((uint32_t)base64Counter
<<16)|(uint32_t)bits
;
1435 /* write back the updated pointers */
1436 pArgs
->source
=source
;
1437 pArgs
->target
=(char *)target
;
1438 pArgs
->offsets
=offsets
;
1442 static const UConverterImpl _IMAPImpl
={
1452 _IMAPToUnicodeWithOffsets
,
1453 _IMAPToUnicodeWithOffsets
,
1454 _IMAPFromUnicodeWithOffsets
,
1455 _IMAPFromUnicodeWithOffsets
,
1460 NULL
, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1462 ucnv_getCompleteUnicodeSet
1465 static const UConverterStaticData _IMAPStaticData
={
1466 sizeof(UConverterStaticData
),
1467 "IMAP-mailbox-name",
1468 0, /* TODO CCSID for IMAP-mailbox-name */
1469 UCNV_IBM
, UCNV_IMAP_MAILBOX
,
1471 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1475 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1478 const UConverterSharedData _IMAPData
={
1479 sizeof(UConverterSharedData
), ~((uint32_t)0),
1480 NULL
, NULL
, &_IMAPStaticData
, FALSE
, &_IMAPImpl
,