icuSources/common/ucnv_u7.c

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2002-2016, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnv_u7.c
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2002jul01
  12 *   created by: Markus W. Scherer
  13 *
  14 *   UTF-7 converter implementation. Used to be in ucnv_utf.c.
  15 */
  16
  17 #include "unicode/utypes.h"
  18
  19 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
  20
  21 #include "cmemory.h"
  22 #include "unicode/ucnv.h"
  23 #include "ucnv_bld.h"
  24 #include "ucnv_cnv.h"
  25 #include "uassert.h"
  26
  27 /* UTF-7 -------------------------------------------------------------------- */
  28
/*
 * UTF-7 is a stateful encoding of Unicode.
 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
 * It was intended for use in Internet email systems, using in its bytewise
 * encoding only a subset of 7-bit US-ASCII.
 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but is still
 * occasionally used.
 *
 * For converting Unicode to UTF-7, the RFC allows some US-ASCII characters
 * to be encoded either directly or in base64. In particular, the characters
 * in set O as defined in the RFC (see below) may be encoded directly but are
 * not allowed in, e.g., email headers.
 * By default, the ICU UTF-7 converter encodes set O directly.
 * By choosing the option "version=1", set O will be escaped instead.
 * For example:
 *     utf7Converter=ucnv_open("UTF-7,version=1");
 *
 * For details about email headers see RFC 2047.
 */
  48
  49 /*
  50  * Tests for US-ASCII characters belonging to character classes
  51  * defined in UTF-7.
  52  *
  53  * Set D (directly encoded characters) consists of the following
  54  * characters: the upper and lower case letters A through Z
  55  * and a through z, the 10 digits 0-9, and the following nine special
  56  * characters (note that "+" and "=" are omitted):
  57  *     '(),-./:?
  58  *
  59  * Set O (optional direct characters) consists of the following
  60  * characters (note that "\" and "~" are omitted):
  61  *     !"#$%&*;<=>@[]^_`{|}
  62  *
  63  * According to the rules in RFC 2152, the byte values for the following
  64  * US-ASCII characters are not used in UTF-7 and are therefore illegal:
  65  * - all C0 control codes except for CR LF TAB
  66  * - BACKSLASH
  67  * - TILDE
  68  * - DEL
  69  * - all codes beyond US-ASCII, i.e. all >127
  70  */
  71 #define inSetD(c) \
  72     ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \
  73      (uint8_t)((c)-48)<10 ||    /* digits */ \
  74      (uint8_t)((c)-39)<3 ||     /* '() */ \
  75      (uint8_t)((c)-44)<4 ||     /* ,-./ */ \
  76      (c)==58 || (c)==63         /* :? */ \
  77     )
  78
  79 #define inSetO(c) \
  80     ((uint8_t)((c)-33)<6 ||         /* !"#$%& */ \
  81      (uint8_t)((c)-59)<4 ||         /* ;<=> */ \
  82      (uint8_t)((c)-93)<4 ||         /* ]^_` */ \
  83      (uint8_t)((c)-123)<3 ||        /* {|} */ \
  84      (c)==42 || (c)==64 || (c)==91  /* *@[ */ \
  85     )
  86
  87 #define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9)
  88 #define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9)
  89
  90 #define PLUS  43
  91 #define MINUS 45
  92 #define BACKSLASH 92
  93 #define TILDE 126
  94
  95 /* legal byte values: all US-ASCII graphic characters from space to before tilde, and CR LF TAB */
  96 #define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
  97
  98 /* encode directly sets D and O and CR LF SP TAB */
  99 static const UBool encodeDirectlyMaximum[128]={
 100  /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
 101     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
 102     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 103
 104     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
 105     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 106
 107     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 108     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
 109
 110     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 111     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
 112 };
 113
 114 /* encode directly set D and CR LF SP TAB but not set O */
 115 static const UBool encodeDirectlyRestricted[128]={
 116  /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
 117     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
 118     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 119
 120     1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
 121     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
 122
 123     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 124     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 125
 126     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 127     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
 128 };
 129
 130 static const uint8_t
 131 toBase64[64]={
 132     /* A-Z */
 133     65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
 134     78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
 135     /* a-z */
 136     97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
 137     110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
 138     /* 0-9 */
 139     48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
 140     /* +/ */
 141     43, 47
 142 };
 143
 144 static const int8_t
 145 fromBase64[128]={
 146     /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
 147     -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
 148     -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
 149
 150     /* general punctuation with + and / and a special value (-2) for - */
 151     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
 152     /* digits */
 153     52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
 154
 155     /* A-Z */
 156     -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
 157     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,
 158
 159     /* a-z */
 160     -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
 161     41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
 162 };
 163
 164 /*
 165  * converter status values:
 166  *
 167  * toUnicodeStatus:
 168  *     24 inDirectMode (boolean)
 169  * 23..16 base64Counter (-1..7)
 170  * 15..0  bits (up to 14 bits incoming base64)
 171  *
 172  * fromUnicodeStatus:
 173  * 31..28 version (0: set O direct  1: set O escaped)
 174  *     24 inDirectMode (boolean)
 175  * 23..16 base64Counter (0..2)
 176  *  7..0  bits (6 bits outgoing base64)
 177  *
 178  */
 179
 180 static void
 181 _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
 182     if(choice<=UCNV_RESET_TO_UNICODE) {
 183         /* reset toUnicode */
 184         cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
 185         cnv->toULength=0;
 186     }
 187     if(choice!=UCNV_RESET_TO_UNICODE) {
 188         /* reset fromUnicode */
 189         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
 190     }
 191 }
 192
 193 static void
 194 _UTF7Open(UConverter *cnv,
 195           UConverterLoadArgs *pArgs,
 196           UErrorCode *pErrorCode) {
 197     if(UCNV_GET_VERSION(cnv)<=1) {
 198         /* TODO(markus): Should just use cnv->options rather than copying the version number. */
 199         cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
 200         _UTF7Reset(cnv, UCNV_RESET_BOTH);
 201     } else {
 202         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 203     }
 204 }
 205
/*
 * Converts UTF-7 bytes from pArgs->source into UTF-16 code units at
 * pArgs->target, optionally recording one source offset per output UChar.
 *
 * The decoder is a two-state machine:
 * - Direct Mode: legal US-ASCII bytes are copied through unchanged;
 *   a '+' switches to Unicode Mode.
 * - Unicode Mode: base64 digits are accumulated into 16-bit units; any byte
 *   that is not a base64 digit ends the mode. A terminating '-' is consumed,
 *   and "+-" produces a literal '+'.
 *
 * State that must survive between calls is unpacked from cnv->toUnicodeStatus
 * on entry and packed back on exit. The bytes of an unfinished or erroneous
 * sequence are kept in cnv->toUBytes with their count in cnv->toULength.
 *
 * @param pArgs      conversion arguments; source, target and offsets are
 *                   advanced and written back before returning
 * @param pErrorCode set to U_ILLEGAL_CHAR_FOUND for bytes or sequences that
 *                   are not valid UTF-7, or to U_BUFFER_OVERFLOW_ERROR when
 *                   input remains but the target is full
 */
static void
_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                          UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit;
    UChar *target;
    const UChar *targetLimit;
    int32_t *offsets;

    /* bytes of the current (possibly erroneous) sequence, and how many are stored */
    uint8_t *bytes;
    uint8_t byteIndex;

    int32_t length, targetCapacity;

    /* UTF-7 state */
    uint16_t bits;          /* base64 bits collected so far for the next UChar */
    int8_t base64Counter;   /* position in the 8-digit base64 cycle; -1 right after '+' */
    UBool inDirectMode;

    /* classification of the current byte, looked up in fromBase64[] */
    int8_t base64Value;

    int32_t sourceIndex, nextSourceIndex;

    uint8_t b;
    /* set up the local pointers */
    cnv=pArgs->converter;

    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    target=pArgs->target;
    targetLimit=pArgs->targetLimit;
    offsets=pArgs->offsets;
    /* get the state machine state */
    {
        /* layout: bit 24 = inDirectMode, bits 23..16 = base64Counter, bits 15..0 = bits */
        uint32_t status=cnv->toUnicodeStatus;
        inDirectMode=(UBool)((status>>24)&1);
        base64Counter=(int8_t)(status>>16);
        bits=(uint16_t)status;
    }
    bytes=cnv->toUBytes;
    byteIndex=cnv->toULength;

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex=byteIndex==0 ? 0 : -1;
    nextSourceIndex=0;

    if(inDirectMode) {
directMode:
        /*
         * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
         * with their US-ASCII byte values.
         * Backslash and Tilde and most control characters are not allowed in UTF-7.
         * A plus sign starts Unicode (or "escape") Mode.
         *
         * In Direct Mode, only the sourceIndex is used.
         */
        byteIndex=0;
        /*
         * Each directly encoded byte yields exactly one UChar, so the loop can
         * run for min(remaining source, remaining target) iterations without
         * checking the target limit on every byte.
         */
        length=(int32_t)(sourceLimit-source);
        targetCapacity=(int32_t)(targetLimit-target);
        if(length>targetCapacity) {
            length=targetCapacity;
        }
        while(length>0) {
            b=*source++;
            if(!isLegalUTF7(b)) {
                /* illegal: report just this one byte */
                bytes[0]=b;
                byteIndex=1;
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                break;
            } else if(b!=PLUS) {
                /* write directly encoded character */
                *target++=b;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex++;
                }
            } else /* PLUS */ {
                /* switch to Unicode mode; the indexes now point just past the '+' */
                nextSourceIndex=++sourceIndex;
                inDirectMode=FALSE;
                byteIndex=0;
                bits=0;
                base64Counter=-1;
                goto unicodeMode;
            }
            --length;
        }
        if(source<sourceLimit && target>=targetLimit) {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        }
    } else {
unicodeMode:
        /*
         * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
         * The base64 sequence ends with any character that is not in the base64 alphabet.
         * A terminating minus sign is consumed.
         *
         * In Unicode Mode, the sourceIndex has the index to the start of the current
         * base64 bytes, while nextSourceIndex is precisely parallel to source,
         * keeping the index to the following byte.
         * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
         */
        while(source<sourceLimit) {
            if(target<targetLimit) {
                bytes[byteIndex++]=b=*source++;
                ++nextSourceIndex;
                base64Value = -3; /* initialize as illegal */
                /*
                 * The b>=126 test short-circuits, so fromBase64[] (128 entries)
                 * is only ever indexed with b<126.
                 */
                if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
                    /* either
                     * base64Value==-1 for any legal character except base64 and minus sign, or
                     * base64Value==-3 for illegal characters:
                     * 1. In either case, leave Unicode mode.
                     * 2.1. If we ended with an incomplete UChar or none after the +, then
                     *      generate an error for the preceding erroneous sequence and deal with
                     *      the current (possibly illegal) character next time through.
                     * 2.2. Else the current char comes after a complete UChar, which was already
                     *      pushed to the output buf, so:
                     * 2.2.1. If the current char is legal, just save it for processing next time.
                     *        It may be for example, a plus which we need to deal with in direct mode.
                     * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
                     */
                    inDirectMode=TRUE;
                    if(base64Counter==-1) {
                        /* illegal: + immediately followed by something other than base64 or minus sign */
                        /* include the plus sign in the reported sequence, but not the subsequent char */
                        --source;
                        bytes[0]=PLUS;
                        byteIndex=1;
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        break;
                    } else if(bits!=0) {
                        /* bits are illegally left over, a UChar is incomplete */
                        /* don't include current char (legal or illegal) in error seq */
                        --source;
                        --byteIndex;
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        break;
                    } else {
                        /* previous UChar was complete */
                        if(base64Value==-3) {
                            /* current character is illegal, deal with it here */
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                            break;
                        } else {
                            /* un-read the current character in case it is a plus sign */
                            --source;
                            sourceIndex=nextSourceIndex-1;
                            goto directMode;
                        }
                    }
                } else if(base64Value>=0) {
                    /*
                     * collect base64 bytes into UChars:
                     * 8 base64 digits (48 bits) make 3 UChars, which are completed
                     * on the 3rd, 6th and 8th digit (counter values 2, 5 and 7).
                     */
                    switch(base64Counter) {
                    case -1: /* -1 is immediately after the + */
                    case 0:
                        /* first digit of a cycle: start a fresh accumulator */
                        bits=base64Value;
                        base64Counter=1;
                        break;
                    case 1:
                    case 3:
                    case 4:
                    case 6:
                        /* middle digit: append its 6 bits, no UChar is complete yet */
                        bits=(uint16_t)((bits<<6)|base64Value);
                        ++base64Counter;
                        break;
                    case 2:
                        /* 12 collected bits + top 4 of this digit = 1st UChar; keep the low 2 bits */
                        *target++=(UChar)((bits<<4)|(base64Value>>2));
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex;
                            /* the next UChar starts inside the current byte */
                            sourceIndex=nextSourceIndex-1;
                        }
                        bytes[0]=b; /* keep this byte in case an error occurs */
                        byteIndex=1;
                        bits=(uint16_t)(base64Value&3);
                        base64Counter=3;
                        break;
                    case 5:
                        /* 14 collected bits + top 2 of this digit = 2nd UChar; keep the low 4 bits */
                        *target++=(UChar)((bits<<2)|(base64Value>>4));
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex;
                            /* the next UChar starts inside the current byte */
                            sourceIndex=nextSourceIndex-1;
                        }
                        bytes[0]=b; /* keep this byte in case an error occurs */
                        byteIndex=1;
                        bits=(uint16_t)(base64Value&15);
                        base64Counter=6;
                        break;
                    case 7:
                        /* 10 collected bits + all 6 of this digit = 3rd UChar; nothing is left over */
                        *target++=(UChar)((bits<<6)|base64Value);
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex;
                            /* the next UChar starts cleanly at the following byte */
                            sourceIndex=nextSourceIndex;
                        }
                        byteIndex=0;
                        bits=0;
                        base64Counter=0;
                        break;
                    default:
                        /* will never occur */
                        break;
                    }
                } else /*base64Value==-2*/ {
                    /* minus sign terminates the base64 sequence */
                    inDirectMode=TRUE;
                    if(base64Counter==-1) {
                        /* +- i.e. a minus immediately following a plus */
                        *target++=PLUS;
                        if(offsets!=NULL) {
                            /* sourceIndex was advanced past the '+', so step back to it */
                            *offsets++=sourceIndex-1;
                        }
                    } else {
                        /* absorb the minus and leave the Unicode Mode */
                        if(bits!=0) {
                            /* bits are illegally left over, a UChar is incomplete */
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                            break;
                        }
                    }
                    sourceIndex=nextSourceIndex;
                    goto directMode;
                }
            } else {
                /* target is full */
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }
        }
    }

    if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
        /*
         * if we are in Unicode mode, then the byteIndex might not be 0,
         * but that is ok if bits==0
         * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
         * (not true for IMAP-mailbox-name where we must end in direct mode)
         */
        byteIndex=0;
    }

    /* set the converter state back into UConverter */
    /* base64Counter goes through uint8_t so that -1 does not sign-extend into the higher bits */
    cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
    cnv->toULength=byteIndex;

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    pArgs->offsets=offsets;
    return;
}
 456
 457 static void
 458 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
 459                             UErrorCode *pErrorCode) {
 460     UConverter *cnv;
 461     const UChar *source, *sourceLimit;
 462     uint8_t *target, *targetLimit;
 463     int32_t *offsets;
 464
 465     int32_t length, targetCapacity, sourceIndex;
 466     UChar c;
 467
 468     /* UTF-7 state */
 469     const UBool *encodeDirectly;
 470     uint8_t bits;
 471     int8_t base64Counter;
 472     UBool inDirectMode;
 473
 474     /* set up the local pointers */
 475     cnv=pArgs->converter;
 476
 477     /* set up the local pointers */
 478     source=pArgs->source;
 479     sourceLimit=pArgs->sourceLimit;
 480     target=(uint8_t *)pArgs->target;
 481     targetLimit=(uint8_t *)pArgs->targetLimit;
 482     offsets=pArgs->offsets;
 483
 484     /* get the state machine state */
 485     {
 486         uint32_t status=cnv->fromUnicodeStatus;
 487         encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
 488         inDirectMode=(UBool)((status>>24)&1);
 489         base64Counter=(int8_t)(status>>16);
 490         bits=(uint8_t)status;
 491         U_ASSERT(bits<=UPRV_LENGTHOF(toBase64));
 492     }
 493
 494     /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
 495     sourceIndex=0;
 496
 497     if(inDirectMode) {
 498 directMode:
 499         length=(int32_t)(sourceLimit-source);
 500         targetCapacity=(int32_t)(targetLimit-target);
 501         if(length>targetCapacity) {
 502             length=targetCapacity;
 503         }
 504         while(length>0) {
 505             c=*source++;
 506             /* currently always encode CR LF SP TAB directly */
 507             if(c<=127 && encodeDirectly[c]) {
 508                 /* encode directly */
 509                 *target++=(uint8_t)c;
 510                 if(offsets!=NULL) {
 511                     *offsets++=sourceIndex++;
 512                 }
 513             } else if(c==PLUS) {
 514                 /* output +- for + */
 515                 *target++=PLUS;
 516                 if(target<targetLimit) {
 517                     *target++=MINUS;
 518                     if(offsets!=NULL) {
 519                         *offsets++=sourceIndex;
 520                         *offsets++=sourceIndex++;
 521                     }
 522                     /* realign length and targetCapacity */
 523                     goto directMode;
 524                 } else {
 525                     if(offsets!=NULL) {
 526                         *offsets++=sourceIndex++;
 527                     }
 528                     cnv->charErrorBuffer[0]=MINUS;
 529                     cnv->charErrorBufferLength=1;
 530                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 531                     break;
 532                 }
 533             } else {
 534                 /* un-read this character and switch to Unicode Mode */
 535                 --source;
 536                 *target++=PLUS;
 537                 if(offsets!=NULL) {
 538                     *offsets++=sourceIndex;
 539                 }
 540                 inDirectMode=FALSE;
 541                 base64Counter=0;
 542                 goto unicodeMode;
 543             }
 544             --length;
 545         }
 546         if(source<sourceLimit && target>=targetLimit) {
 547             /* target is full */
 548             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 549         }
 550     } else {
 551 unicodeMode:
 552         while(source<sourceLimit) {
 553             if(target<targetLimit) {
 554                 c=*source++;
 555                 if(c<=127 && encodeDirectly[c]) {
 556                     /* encode directly */
 557                     inDirectMode=TRUE;
 558
 559                     /* trick: back out this character to make this easier */
 560                     --source;
 561
 562                     /* terminate the base64 sequence */
 563                     if(base64Counter!=0) {
 564                         /* write remaining bits for the previous character */
 565                         *target++=toBase64[bits];
 566                         if(offsets!=NULL) {
 567                             *offsets++=sourceIndex-1;
 568                         }
 569                     }
 570                     if(fromBase64[c]!=-1) {
 571                         /* need to terminate with a minus */
 572                         if(target<targetLimit) {
 573                             *target++=MINUS;
 574                             if(offsets!=NULL) {
 575                                 *offsets++=sourceIndex-1;
 576                             }
 577                         } else {
 578                             cnv->charErrorBuffer[0]=MINUS;
 579                             cnv->charErrorBufferLength=1;
 580                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 581                             break;
 582                         }
 583                     }
 584                     goto directMode;
 585                 } else {
 586                     /*
 587                      * base64 this character:
 588                      * Output 2 or 3 base64 bytes for the remaining bits of the previous character
 589                      * and the bits of this character, each implicitly in UTF-16BE.
 590                      *
 591                      * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
 592                      * character to the next. The actual 2 or 4 bits are shifted to the left edge
 593                      * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
 594                      */
 595                     switch(base64Counter) {
 596                     case 0:
 597                         *target++=toBase64[c>>10];
 598                         if(target<targetLimit) {
 599                             *target++=toBase64[(c>>4)&0x3f];
 600                             if(offsets!=NULL) {
 601                                 *offsets++=sourceIndex;
 602                                 *offsets++=sourceIndex++;
 603                             }
 604                         } else {
 605                             if(offsets!=NULL) {
 606                                 *offsets++=sourceIndex++;
 607                             }
 608                             cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
 609                             cnv->charErrorBufferLength=1;
 610                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 611                         }
 612                         bits=(uint8_t)((c&15)<<2);
 613                         base64Counter=1;
 614                         break;
 615                     case 1:
 616                         *target++=toBase64[bits|(c>>14)];
 617                         if(target<targetLimit) {
 618                             *target++=toBase64[(c>>8)&0x3f];
 619                             if(target<targetLimit) {
 620                                 *target++=toBase64[(c>>2)&0x3f];
 621                                 if(offsets!=NULL) {
 622                                     *offsets++=sourceIndex;
 623                                     *offsets++=sourceIndex;
 624                                     *offsets++=sourceIndex++;
 625                                 }
 626                             } else {
 627                                 if(offsets!=NULL) {
 628                                     *offsets++=sourceIndex;
 629                                     *offsets++=sourceIndex++;
 630                                 }
 631                                 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
 632                                 cnv->charErrorBufferLength=1;
 633                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 634                             }
 635                         } else {
 636                             if(offsets!=NULL) {
 637                                 *offsets++=sourceIndex++;
 638                             }
 639                             cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
 640                             cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
 641                             cnv->charErrorBufferLength=2;
 642                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 643                         }
 644                         bits=(uint8_t)((c&3)<<4);
 645                         base64Counter=2;
 646                         break;
 647                     case 2:
 648                         *target++=toBase64[bits|(c>>12)];
 649                         if(target<targetLimit) {
 650                             *target++=toBase64[(c>>6)&0x3f];
 651                             if(target<targetLimit) {
 652                                 *target++=toBase64[c&0x3f];
 653                                 if(offsets!=NULL) {
 654                                     *offsets++=sourceIndex;
 655                                     *offsets++=sourceIndex;
 656                                     *offsets++=sourceIndex++;
 657                                 }
 658                             } else {
 659                                 if(offsets!=NULL) {
 660                                     *offsets++=sourceIndex;
 661                                     *offsets++=sourceIndex++;
 662                                 }
 663                                 cnv->charErrorBuffer[0]=toBase64[c&0x3f];
 664                                 cnv->charErrorBufferLength=1;
 665                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 666                             }
 667                         } else {
 668                             if(offsets!=NULL) {
 669                                 *offsets++=sourceIndex++;
 670                             }
 671                             cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
 672                             cnv->charErrorBuffer[1]=toBase64[c&0x3f];
 673                             cnv->charErrorBufferLength=2;
 674                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 675                         }
 676                         bits=0;
 677                         base64Counter=0;
 678                         break;
 679                     default:
 680                         /* will never occur */
 681                         break;
 682                     }
 683                 }
 684             } else {
 685                 /* target is full */
 686                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 687                 break;
 688             }
 689         }
 690     }
 691
 692     if(pArgs->flush && source>=sourceLimit) {
 693         /* flush remaining bits to the target */
 694         if(!inDirectMode) {
 695             if (base64Counter!=0) {
 696                 if(target<targetLimit) {
 697                     *target++=toBase64[bits];
 698                     if(offsets!=NULL) {
 699                         *offsets++=sourceIndex-1;
 700                     }
 701                 } else {
 702                     cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
 703                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 704                 }
 705             }
 706             /* Add final MINUS to terminate unicodeMode */
 707             if(target<targetLimit) {
 708                 *target++=MINUS;
 709                 if(offsets!=NULL) {
 710                     *offsets++=sourceIndex-1;
 711                 }
 712             } else {
 713                 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
 714                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 715             }
 716         }
 717         /* reset the state for the next conversion */
 718         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
 719     } else {
 720         /* set the converter state back into UConverter */
 721         cnv->fromUnicodeStatus=
 722             (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
 723             ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
 724     }
 725
 726     /* write back the updated pointers */
 727     pArgs->source=source;
 728     pArgs->target=(char *)target;
 729     pArgs->offsets=offsets;
 730     return;
 731 }
 732
 733 static const char *
 734 _UTF7GetName(const UConverter *cnv) {
 735     switch(cnv->fromUnicodeStatus>>28) {
 736     case 1:
 737         return "UTF-7,version=1";
 738     default:
 739         return "UTF-7";
 740     }
 741 }
 742
 /*
  * Function table for the UTF-7 converter.
  *
  * The initializers are positional; which slot each entry fills is determined
  * by the UConverterImpl declaration, which is not part of this file.
  * NOTE(review): the same ...WithOffsets function is listed twice in a row for
  * each direction - presumably once for the plain slot and once for the
  * with-offsets slot; confirm against UConverterImpl.
  */
 743 static const UConverterImpl _UTF7Impl={
 744     UCNV_UTF7,
 745
 746     NULL,
 747     NULL,
 748
 749     _UTF7Open,
 750     NULL,
 751     _UTF7Reset,
 752
 753     _UTF7ToUnicodeWithOffsets,
 754     _UTF7ToUnicodeWithOffsets,
 755     _UTF7FromUnicodeWithOffsets,
 756     _UTF7FromUnicodeWithOffsets,
 757     NULL,
 758
 759     NULL,
 760     _UTF7GetName,
 761     NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
 762     NULL,
 763     ucnv_getCompleteUnicodeSet
 764 };
 765
 /*
  * Static description of the UTF-7 converter: its name "UTF-7", a zero
  * placeholder for the CCSID, and the UCNV_IBM / UCNV_UTF7 identifiers.
  *
  * NOTE(review): "1, 4" looks like the minimum and maximum number of bytes per
  * character, and "{ 0x3f, 0, 0, 0 }, 1" like a one-byte '?' substitution
  * character with its length - confirm against the UConverterStaticData
  * declaration, which is not part of this file.
  */
 766 static const UConverterStaticData _UTF7StaticData={
 767     sizeof(UConverterStaticData),
 768     "UTF-7",
 769     0, /* TODO CCSID for UTF-7 */
 770     UCNV_IBM, UCNV_UTF7,
 771     1, 4,
 772     { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
 773     FALSE, FALSE,
 774     0,
 775     0,
 776     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 777 };
 778
 /*
  * Shared data object for the UTF-7 converter, built from _UTF7StaticData and
  * _UTF7Impl. It is not declared static, so it is visible to other
  * translation units.
  */
 779 const UConverterSharedData _UTF7Data=
 780         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
 781
 782 /* IMAP mailbox name encoding ----------------------------------------------- */
 783
 784 /*
 785  * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
 786  * http://www.ietf.org/rfc/rfc2060.txt
 787  *
 788  * 5.1.3.  Mailbox International Naming Convention
 789  *
 790  * By convention, international mailbox names are specified using a
 791  * modified version of the UTF-7 encoding described in [UTF-7].  The
 792  * purpose of these modifications is to correct the following problems
 793  * with UTF-7:
 794  *
 795  *    1) UTF-7 uses the "+" character for shifting; this conflicts with
 796  *       the common use of "+" in mailbox names, in particular USENET
 797  *       newsgroup names.
 798  *
 799  *    2) UTF-7's encoding is BASE64 which uses the "/" character; this
 800  *       conflicts with the use of "/" as a popular hierarchy delimiter.
 801  *
 802  *    3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
 803  *       the use of "\" as a popular hierarchy delimiter.
 804  *
 805  *    4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
 806  *       the use of "~" in some servers as a home directory indicator.
 807  *
 808  *    5) UTF-7 permits multiple alternate forms to represent the same
 809  *       string; in particular, printable US-ASCII characters can be
 810  *       represented in encoded form.
 811  *
 812  * In modified UTF-7, printable US-ASCII characters except for "&"
 813  * represent themselves; that is, characters with octet values 0x20-0x25
 814  * and 0x27-0x7e.  The character "&" (0x26) is represented by the two-
 815  * octet sequence "&-".
 816  *
 817  * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
 818  * Unicode 16-bit octets) are represented in modified BASE64, with a
 819  * further modification from [UTF-7] that "," is used instead of "/".
 820  * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
 821  * character which can represent itself.
 822  *
 823  * "&" is used to shift to modified BASE64 and "-" to shift back to US-
 824  * ASCII.  All names start in US-ASCII, and MUST end in US-ASCII (that
 825  * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
 826  * ").
 827  *
 828  * For example, here is a mailbox name which mixes English, Japanese,
 829  * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
 830  */
 831
 832 /*
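 833  * Tests for US-ASCII characters belonging to character classes
 834  * defined in UTF-7.
      *
      * Note: the sets listed here are the original UTF-7 (RFC 2152) classes,
      * kept for reference. The IMAP variant does not use them; it uses the
      * simpler isLegalIMAP() and inSetDIMAP() tests defined after this comment,
      * which accept all of 0x20..0x7e (including "\" and "~").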
 835  *
 836  * Set D (directly encoded characters) consists of the following
 837  * characters: the upper and lower case letters A through Z
 838  * and a through z, the 10 digits 0-9, and the following nine special
 839  * characters (note that "+" and "=" are omitted):
 840  *     '(),-./:?
 841  *
 842  * Set O (optional direct characters) consists of the following
 843  * characters (note that "\" and "~" are omitted):
 844  *     !"#$%&*;<=>@[]^_`{|}
 845  *
 846  * According to the rules in RFC 2152, the byte values for the following
 847  * US-ASCII characters are not used in UTF-7 and are therefore illegal:
 848  * - all C0 control codes except for CR LF TAB
 849  * - BACKSLASH
 850  * - TILDE
 851  * - DEL
 852  * - all codes beyond US-ASCII, i.e. all >127
 853  */
 854
 /* uses '&' not '+' to start a base64 sequence */
 #define AMPERSAND 0x26
 #define COMMA 0x2c
 #define SLASH 0x2f

 /*
  * Note: the function-like macros below evaluate their argument more than
  * once, so do not pass expressions with side effects.
  */

 /* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
 #define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

 /*
  * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
  * (the argument is parenthesized so that expression arguments, e.g. a
  * conditional expression, are compared as a whole)
  */
 #define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

 /*
  * Modified base64 digit for a 6-bit value: values below 63 come from the
  * toBase64 table, value 63 is ',' instead of '/'.
  */
 #define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)

 /*
  * Value of a modified base64 digit: ',' is 63, '/' is rejected with -1,
  * everything else is looked up in the fromBase64 table.
  * The to-Unicode code in this file only evaluates this for bytes <=0x7e.
  */
 #define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
 868
 869 /*
 870  * converter status values:
 871  *
 872  * toUnicodeStatus:
 873  *     24 inDirectMode (boolean)
 874  * 23..16 base64Counter (-1..7)
 875  * 15..0  bits (up to 14 bits incoming base64)
 876  *
 877  * fromUnicodeStatus:
 878  *     24 inDirectMode (boolean)
 879  * 23..16 base64Counter (0..2)
 880  *  7..0  bits (6 bits outgoing base64)
 881  *
 882  * ignore bits 31..25
 883  */
 884
 /*
  * Converts IMAP mailbox-name bytes (modified UTF-7) from pArgs->source into
  * UChars at pArgs->target. When pArgs->offsets is not NULL, one source index
  * is written per output UChar.
  *
  * The state machine alternates between direct mode (bytes 0x20..0x7e stand
  * for themselves, '&' switches to Unicode mode) and Unicode mode (modified
  * base64 digits are collected into 16-bit units until a '-').
  * Its state is loaded from cnv->toUnicodeStatus, cnv->toUBytes and
  * cnv->toULength on entry and stored back before returning, so that a
  * conversion can be continued by a later call.
  *
  * Error codes set in *pErrorCode by this function:
  *   U_ILLEGAL_CHAR_FOUND    - direct mode: a byte outside 0x20..0x7e;
  *                             Unicode mode: a byte above 0x7e, a byte that is
  *                             neither an accepted base64 digit nor '-',
  *                             a decoded UChar in 0x20..0x7e, or a '-' that
  *                             leaves bits over / follows non-minimal padding
  *   U_BUFFER_OVERFLOW_ERROR - the target is full while input remains
  *   U_TRUNCATED_CHAR_FOUND  - pArgs->flush is set and the input ends in
  *                             Unicode mode with no pending bytes
  *
  * pArgs->source, pArgs->target and pArgs->offsets are updated on return.
  */
 885 static void
 886 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 887                           UErrorCode *pErrorCode) {
 888     UConverter *cnv;
 889     const uint8_t *source, *sourceLimit;
 890     UChar *target;
 891     const UChar *targetLimit;
 892     int32_t *offsets;
 893
 894     uint8_t *bytes;
 895     uint8_t byteIndex;
 896
 897     int32_t length, targetCapacity;
 898
 899     /* UTF-7 state */
 900     uint16_t bits;
 901     int8_t base64Counter;
 902     UBool inDirectMode;
 903
 904     int8_t base64Value;
 905
 906     int32_t sourceIndex, nextSourceIndex;
 907
 908     UChar c;
 909     uint8_t b;
 910
 911     /* set up the local pointers */
 912     cnv=pArgs->converter;
 913
 914     source=(const uint8_t *)pArgs->source;
 915     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
 916     target=pArgs->target;
 917     targetLimit=pArgs->targetLimit;
 918     offsets=pArgs->offsets;
 919     /* get the state machine state */
 920     {
 921         uint32_t status=cnv->toUnicodeStatus;
 922         inDirectMode=(UBool)((status>>24)&1);
              /* the counter is stored as one byte; the int8_t cast is expected to restore a stored 0xff as -1 */
 923         base64Counter=(int8_t)(status>>16);
 924         bits=(uint16_t)status;
 925     }
          /* bytes[] holds the bytes of the sequence being decoded; byteIndex counts how many are stored */
 926     bytes=cnv->toUBytes;
 927     byteIndex=cnv->toULength;
 928
 929     /* sourceIndex=-1 if the current character began in the previous buffer */
 930     sourceIndex=byteIndex==0 ? 0 : -1;
 931     nextSourceIndex=0;
 932
 933     if(inDirectMode) {
 934 directMode:
 935         /*
 936          * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
 937          * with their US-ASCII byte values.
 938          * An ampersand starts Unicode (or "escape") Mode.
 939          *
 940          * In Direct Mode, only the sourceIndex is used.
 941          */
 942         byteIndex=0;
              /*
               * Each directly encoded byte produces exactly one UChar, so the loop
               * runs for at most min(remaining source, remaining target) iterations
               * and needs no per-iteration bounds checks.
               */
 943         length=(int32_t)(sourceLimit-source);
 944         targetCapacity=(int32_t)(targetLimit-target);
 945         if(length>targetCapacity) {
 946             length=targetCapacity;
 947         }
 948         while(length>0) {
 949             b=*source++;
 950             if(!isLegalIMAP(b)) {
 951                 /* illegal */
 952                 bytes[0]=b;
 953                 byteIndex=1;
 954                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 955                 break;
 956             } else if(b!=AMPERSAND) {
 957                 /* write directly encoded character */
 958                 *target++=b;
 959                 if(offsets!=NULL) {
 960                     *offsets++=sourceIndex++;
 961                 }
 962             } else /* AMPERSAND */ {
 963                 /* switch to Unicode mode */
                      /* both indexes now point at the byte after the '&' */
 964                 nextSourceIndex=++sourceIndex;
 965                 inDirectMode=FALSE;
 966                 byteIndex=0;
 967                 bits=0;
 968                 base64Counter=-1;
 969                 goto unicodeMode;
 970             }
 971             --length;
 972         }
 973         if(source<sourceLimit && target>=targetLimit) {
 974             /* target is full */
 975             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 976         }
 977     } else {
 978 unicodeMode:
 979         /*
 980          * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
 981          * The base64 sequence ends with any character that is not in the base64 alphabet.
 982          * A terminating minus sign is consumed.
 983          * US-ASCII must not be base64-ed.
 984          *
 985          * In Unicode Mode, the sourceIndex has the index to the start of the current
 986          * base64 bytes, while nextSourceIndex is precisely parallel to source,
 987          * keeping the index to the following byte.
 988          * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
 989          */
 990         while(source<sourceLimit) {
 991             if(target<targetLimit) {
 992                 bytes[byteIndex++]=b=*source++;
 993                 ++nextSourceIndex;
 994                 if(b>0x7e) {
 995                     /* illegal - test other illegal US-ASCII values by base64Value==-3 */
 996                     inDirectMode=TRUE;
 997                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 998                     break;
 999                 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
1000                     /* collect base64 bytes into UChars */
                          /*
                           * base64Counter is the position within a group of 8 base64
                           * digits (48 bits), which holds exactly 3 UChars; a UChar is
                           * completed at positions 2, 5 and 7.
                           */
1001                     switch(base64Counter) {
1002                     case -1: /* -1 is immediately after the & */
1003                     case 0:
1004                         bits=base64Value;
1005                         base64Counter=1;
1006                         break;
1007                     case 1:
1008                     case 3:
1009                     case 4:
1010                     case 6:
1011                         bits=(uint16_t)((bits<<6)|base64Value);
1012                         ++base64Counter;
1013                         break;
1014                     case 2:
                              /* 12 collected bits plus the top 4 bits of this digit */
1015                         c=(UChar)((bits<<4)|(base64Value>>2));
1016                         if(isLegalIMAP(c)) {
1017                             /* illegal */
1018                             inDirectMode=TRUE;
1019                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                                  /* a break here would only leave the switch, not the loop */
1020                             goto endloop;
1021                         }
1022                         *target++=c;
1023                         if(offsets!=NULL) {
1024                             *offsets++=sourceIndex;
1025                             sourceIndex=nextSourceIndex-1;
1026                         }
1027                         bytes[0]=b; /* keep this byte in case an error occurs */
1028                         byteIndex=1;
                              /* the low 2 bits of this digit start the next UChar */
1029                         bits=(uint16_t)(base64Value&3);
1030                         base64Counter=3;
1031                         break;
1032                     case 5:
                              /* 14 collected bits plus the top 2 bits of this digit */
1033                         c=(UChar)((bits<<2)|(base64Value>>4));
1034                         if(isLegalIMAP(c)) {
1035                             /* illegal */
1036                             inDirectMode=TRUE;
1037                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1038                             goto endloop;
1039                         }
1040                         *target++=c;
1041                         if(offsets!=NULL) {
1042                             *offsets++=sourceIndex;
1043                             sourceIndex=nextSourceIndex-1;
1044                         }
1045                         bytes[0]=b; /* keep this byte in case an error occurs */
1046                         byteIndex=1;
                              /* the low 4 bits of this digit start the next UChar */
1047                         bits=(uint16_t)(base64Value&15);
1048                         base64Counter=6;
1049                         break;
1050                     case 7:
                              /* 10 collected bits plus all 6 bits of this digit; nothing is left over */
1051                         c=(UChar)((bits<<6)|base64Value);
1052                         if(isLegalIMAP(c)) {
1053                             /* illegal */
1054                             inDirectMode=TRUE;
1055                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1056                             goto endloop;
1057                         }
1058                         *target++=c;
1059                         if(offsets!=NULL) {
1060                             *offsets++=sourceIndex;
1061                             sourceIndex=nextSourceIndex;
1062                         }
1063                         byteIndex=0;
1064                         bits=0;
1065                         base64Counter=0;
1066                         break;
1067                     default:
1068                         /* will never occur */
1069                         break;
1070                     }
1071                 } else if(base64Value==-2) {
1072                     /* minus sign terminates the base64 sequence */
1073                     inDirectMode=TRUE;
1074                     if(base64Counter==-1) {
1075                         /* &- i.e. a minus immediately following an ampersand */
                              /* target has room: this branch is only reached after the target<targetLimit test above */
1076                         *target++=AMPERSAND;
1077                         if(offsets!=NULL) {
1078                             *offsets++=sourceIndex-1;
1079                         }
1080                     } else {
1081                         /* absorb the minus and leave the Unicode Mode */
1082                         if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1083                             /* bits are illegally left over, a UChar is incomplete */
1084                             /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1085                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1086                             break;
1087                         }
1088                     }
1089                     sourceIndex=nextSourceIndex;
1090                     goto directMode;
1091                 } else {
1092                     if(base64Counter==-1) {
1093                         /* illegal: & immediately followed by something other than base64 or minus sign */
1094                         /* include the ampersand in the reported sequence */
1095                         --sourceIndex;
1096                         bytes[0]=AMPERSAND;
1097                         bytes[1]=b;
1098                         byteIndex=2;
1099                     }
1100                     /* base64Value==-1 for characters that are illegal only in Unicode mode */
1101                     /* base64Value==-3 for illegal characters */
1102                     /* illegal */
1103                     inDirectMode=TRUE;
1104                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1105                     break;
1106                 }
1107             } else {
1108                 /* target is full */
1109                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1110                 break;
1111             }
1112         }
1113     }
1114 endloop:
1115
1116     /*
1117      * the end of the input stream and detection of truncated input
1118      * are handled by the framework, but here we must check if we are in Unicode
1119      * mode and byteIndex==0 because we must end in direct mode
1120      *
1121      * conditions:
1122      *   successful
1123      *   in Unicode mode and byteIndex==0
1124      *   end of input and no truncated input
1125      */
1126     if( U_SUCCESS(*pErrorCode) &&
1127         !inDirectMode && byteIndex==0 &&
1128         pArgs->flush && source>=sourceLimit
1129     ) {
1130         if(base64Counter==-1) {
1131             /* & at the very end of the input */
1132             /* make the ampersand the reported sequence */
1133             bytes[0]=AMPERSAND;
1134             byteIndex=1;
1135         }
1136         /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1137
1138         inDirectMode=TRUE; /* avoid looping */
1139         *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1140     }
1141
1142     /* set the converter state back into UConverter */
          /* the (uint8_t) cast keeps a negative base64Counter from spilling into bits 31..24 */
1143     cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1144     cnv->toULength=byteIndex;
1145
1146     /* write back the updated pointers */
1147     pArgs->source=(const char *)source;
1148     pArgs->target=target;
1149     pArgs->offsets=offsets;
1150     return;
1151 }
1152
/*
 * Converts UChars from pArgs->source into IMAP mailbox-name bytes (modified
 * UTF-7) at pArgs->target. When pArgs->offsets is not NULL, one source index
 * is written per output byte that goes to the target.
 *
 * In direct mode, code units 0x20..0x7e are written as single bytes, with '&'
 * written as "&-". Any other code unit writes '&' and switches to Unicode
 * mode, where each code unit is emitted as modified base64 digits; the first
 * code unit in 0x20..0x7e ends the run with the pending digit and a '-'.
 *
 * The state (inDirectMode, base64Counter, bits) is loaded from
 * cnv->fromUnicodeStatus and stored back on exit; bits 31..28 of the status
 * are preserved. When pArgs->flush is set and all input has been consumed,
 * an open base64 run is terminated and the state is reset to direct mode.
 *
 * When the target fills up in the middle of a multi-byte output, the bytes
 * that did not fit are stored in cnv->charErrorBuffer and *pErrorCode is set
 * to U_BUFFER_OVERFLOW_ERROR. The same error code is set when the target is
 * full while input remains.
 *
 * pArgs->source, pArgs->target and pArgs->offsets are updated on return.
 */
1153 static void
1154 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1155                             UErrorCode *pErrorCode) {
1156     UConverter *cnv;
1157     const UChar *source, *sourceLimit;
1158     uint8_t *target, *targetLimit;
1159     int32_t *offsets;
1160
1161     int32_t length, targetCapacity, sourceIndex;
1162     UChar c;
1163     uint8_t b;
1164
1165     /* UTF-7 state */
1166     uint8_t bits;
1167     int8_t base64Counter;
1168     UBool inDirectMode;
1169
1170     /* set up the local pointers */
1171     cnv=pArgs->converter;
1172
1173     /* set up the local pointers */
1174     source=pArgs->source;
1175     sourceLimit=pArgs->sourceLimit;
1176     target=(uint8_t *)pArgs->target;
1177     targetLimit=(uint8_t *)pArgs->targetLimit;
1178     offsets=pArgs->offsets;
1179
1180     /* get the state machine state */
1181     {
1182         uint32_t status=cnv->fromUnicodeStatus;
1183         inDirectMode=(UBool)((status>>24)&1);
1184         base64Counter=(int8_t)(status>>16);
1185         bits=(uint8_t)status;
1186     }
1187
1188     /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1189     sourceIndex=0;
1190
1191     if(inDirectMode) {
1192 directMode:
              /*
               * A directly encoded code unit produces one byte, so the loop runs for
               * at most min(remaining source, remaining target) iterations.
               * '&' produces two bytes and therefore jumps back here to recompute.
               */
1193         length=(int32_t)(sourceLimit-source);
1194         targetCapacity=(int32_t)(targetLimit-target);
1195         if(length>targetCapacity) {
1196             length=targetCapacity;
1197         }
1198         while(length>0) {
1199             c=*source++;
1200             /* encode 0x20..0x7e except '&' directly */
1201             if(inSetDIMAP(c)) {
1202                 /* encode directly */
1203                 *target++=(uint8_t)c;
1204                 if(offsets!=NULL) {
1205                     *offsets++=sourceIndex++;
1206                 }
1207             } else if(c==AMPERSAND) {
1208                 /* output &- for & */
                      /* length>0 guarantees room for the '&'; the '-' needs its own check */
1209                 *target++=AMPERSAND;
1210                 if(target<targetLimit) {
1211                     *target++=MINUS;
1212                     if(offsets!=NULL) {
1213                         *offsets++=sourceIndex;
1214                         *offsets++=sourceIndex++;
1215                     }
1216                     /* realign length and targetCapacity */
1217                     goto directMode;
1218                 } else {
1219                     if(offsets!=NULL) {
1220                         *offsets++=sourceIndex++;
1221                     }
1222                     cnv->charErrorBuffer[0]=MINUS;
1223                     cnv->charErrorBufferLength=1;
1224                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1225                     break;
1226                 }
1227             } else {
1228                 /* un-read this character and switch to Unicode Mode */
1229                 --source;
1230                 *target++=AMPERSAND;
1231                 if(offsets!=NULL) {
1232                     *offsets++=sourceIndex;
1233                 }
1234                 inDirectMode=FALSE;
1235                 base64Counter=0;
1236                 goto unicodeMode;
1237             }
1238             --length;
1239         }
1240         if(source<sourceLimit && target>=targetLimit) {
1241             /* target is full */
1242             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1243         }
1244     } else {
1245 unicodeMode:
1246         while(source<sourceLimit) {
1247             if(target<targetLimit) {
1248                 c=*source++;
                      /* any code unit in 0x20..0x7e, including '&', ends the base64 run */
1249                 if(isLegalIMAP(c)) {
1250                     /* encode directly */
1251                     inDirectMode=TRUE;
1252
1253                     /* trick: back out this character to make this easier */
1254                     --source;
1255
1256                     /* terminate the base64 sequence */
1257                     if(base64Counter!=0) {
1258                         /* write remaining bits for the previous character */
1259                         *target++=TO_BASE64_IMAP(bits);
1260                         if(offsets!=NULL) {
1261                             *offsets++=sourceIndex-1;
1262                         }
1263                     }
1264                     /* need to terminate with a minus */
1265                     if(target<targetLimit) {
1266                         *target++=MINUS;
1267                         if(offsets!=NULL) {
1268                             *offsets++=sourceIndex-1;
1269                         }
1270                     } else {
1271                         cnv->charErrorBuffer[0]=MINUS;
1272                         cnv->charErrorBufferLength=1;
1273                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1274                         break;
1275                     }
1276                     goto directMode;
1277                 } else {
1278                     /*
1279                      * base64 this character:
1280                      * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1281                      * and the bits of this character, each implicitly in UTF-16BE.
1282                      *
1283                      * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1284                      * character to the next. The actual 2 or 4 bits are shifted to the left edge
1285                      * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1286                      */
                          /*
                           * In each case the first digit is written unconditionally (the
                           * loop has checked target<targetLimit); digits that no longer
                           * fit go to cnv->charErrorBuffer. The overflow branches do not
                           * leave the loop themselves; a following iteration stops at the
                           * target<targetLimit test.
                           */
1287                     switch(base64Counter) {
1288                     case 0:
                              /* no pending bits: emit bits 15..10 and 9..4, keep bits 3..0 */
1289                         b=(uint8_t)(c>>10);
1290                         *target++=TO_BASE64_IMAP(b);
1291                         if(target<targetLimit) {
1292                             b=(uint8_t)((c>>4)&0x3f);
1293                             *target++=TO_BASE64_IMAP(b);
1294                             if(offsets!=NULL) {
1295                                 *offsets++=sourceIndex;
1296                                 *offsets++=sourceIndex++;
1297                             }
1298                         } else {
1299                             if(offsets!=NULL) {
1300                                 *offsets++=sourceIndex++;
1301                             }
1302                             b=(uint8_t)((c>>4)&0x3f);
1303                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1304                             cnv->charErrorBufferLength=1;
1305                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1306                         }
1307                         bits=(uint8_t)((c&15)<<2);
1308                         base64Counter=1;
1309                         break;
1310                     case 1:
                              /* 4 pending bits: emit pending+bits 15..14, bits 13..8 and 7..2, keep bits 1..0 */
1311                         b=(uint8_t)(bits|(c>>14));
1312                         *target++=TO_BASE64_IMAP(b);
1313                         if(target<targetLimit) {
1314                             b=(uint8_t)((c>>8)&0x3f);
1315                             *target++=TO_BASE64_IMAP(b);
1316                             if(target<targetLimit) {
1317                                 b=(uint8_t)((c>>2)&0x3f);
1318                                 *target++=TO_BASE64_IMAP(b);
1319                                 if(offsets!=NULL) {
1320                                     *offsets++=sourceIndex;
1321                                     *offsets++=sourceIndex;
1322                                     *offsets++=sourceIndex++;
1323                                 }
1324                             } else {
1325                                 if(offsets!=NULL) {
1326                                     *offsets++=sourceIndex;
1327                                     *offsets++=sourceIndex++;
1328                                 }
1329                                 b=(uint8_t)((c>>2)&0x3f);
1330                                 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1331                                 cnv->charErrorBufferLength=1;
1332                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1333                             }
1334                         } else {
1335                             if(offsets!=NULL) {
1336                                 *offsets++=sourceIndex++;
1337                             }
1338                             b=(uint8_t)((c>>8)&0x3f);
1339                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1340                             b=(uint8_t)((c>>2)&0x3f);
1341                             cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1342                             cnv->charErrorBufferLength=2;
1343                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1344                         }
1345                         bits=(uint8_t)((c&3)<<4);
1346                         base64Counter=2;
1347                         break;
1348                     case 2:
                              /* 2 pending bits: emit pending+bits 15..12, bits 11..6 and 5..0, nothing is kept */
1349                         b=(uint8_t)(bits|(c>>12));
1350                         *target++=TO_BASE64_IMAP(b);
1351                         if(target<targetLimit) {
1352                             b=(uint8_t)((c>>6)&0x3f);
1353                             *target++=TO_BASE64_IMAP(b);
1354                             if(target<targetLimit) {
1355                                 b=(uint8_t)(c&0x3f);
1356                                 *target++=TO_BASE64_IMAP(b);
1357                                 if(offsets!=NULL) {
1358                                     *offsets++=sourceIndex;
1359                                     *offsets++=sourceIndex;
1360                                     *offsets++=sourceIndex++;
1361                                 }
1362                             } else {
1363                                 if(offsets!=NULL) {
1364                                     *offsets++=sourceIndex;
1365                                     *offsets++=sourceIndex++;
1366                                 }
1367                                 b=(uint8_t)(c&0x3f);
1368                                 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1369                                 cnv->charErrorBufferLength=1;
1370                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1371                             }
1372                         } else {
1373                             if(offsets!=NULL) {
1374                                 *offsets++=sourceIndex++;
1375                             }
1376                             b=(uint8_t)((c>>6)&0x3f);
1377                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1378                             b=(uint8_t)(c&0x3f);
1379                             cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1380                             cnv->charErrorBufferLength=2;
1381                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1382                         }
1383                         bits=0;
1384                         base64Counter=0;
1385                         break;
1386                     default:
1387                         /* will never occur */
1388                         break;
1389                     }
1390                 }
1391             } else {
1392                 /* target is full */
1393                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1394                 break;
1395             }
1396         }
1397     }
1398
1399     if(pArgs->flush && source>=sourceLimit) {
1400         /* flush remaining bits to the target */
1401         if(!inDirectMode) {
1402             if(base64Counter!=0) {
1403                 if(target<targetLimit) {
1404                     *target++=TO_BASE64_IMAP(bits);
1405                     if(offsets!=NULL) {
1406                         *offsets++=sourceIndex-1;
1407                     }
1408                 } else {
                          /* append after any bytes that an overflow above already stored */
1409                     cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1410                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1411                 }
1412             }
1413             /* need to terminate with a minus */
1414             if(target<targetLimit) {
1415                 *target++=MINUS;
1416                 if(offsets!=NULL) {
1417                     *offsets++=sourceIndex-1;
1418                 }
1419             } else {
1420                 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1421                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1422             }
1423         }
1424         /* reset the state for the next conversion */
              /* 0x1000000 sets bit 24 (inDirectMode) and leaves base64Counter and bits at zero */
1425         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1426     } else {
1427         /* set the converter state back into UConverter */
1428         cnv->fromUnicodeStatus=
1429             (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
1430             ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1431     }
1432
1433     /* write back the updated pointers */
1434     pArgs->source=source;
1435     pArgs->target=(char *)target;
1436     pArgs->offsets=offsets;
1437     return;
1438 }
1439
/*
 * Function table for the IMAP mailbox-name converter.
 *
 * It reuses _UTF7Open and _UTF7Reset from the UTF-7 converter and installs
 * the IMAP-specific conversion functions. Unlike _UTF7Impl, the entry at the
 * position where _UTF7Impl lists _UTF7GetName is NULL here.
 * The initializers are positional; which slot each entry fills is determined
 * by the UConverterImpl declaration, which is not part of this file.
 * NOTE(review): the same ...WithOffsets function is listed twice in a row for
 * each direction - presumably once for the plain slot and once for the
 * with-offsets slot; confirm against UConverterImpl.
 */
1440 static const UConverterImpl _IMAPImpl={
1441     UCNV_IMAP_MAILBOX,
1442
1443     NULL,
1444     NULL,
1445
1446     _UTF7Open,
1447     NULL,
1448     _UTF7Reset,
1449
1450     _IMAPToUnicodeWithOffsets,
1451     _IMAPToUnicodeWithOffsets,
1452     _IMAPFromUnicodeWithOffsets,
1453     _IMAPFromUnicodeWithOffsets,
1454     NULL,
1455
1456     NULL,
1457     NULL,
1458     NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1459     NULL,
1460     ucnv_getCompleteUnicodeSet
1461 };
1462
/*
 * Static description of the IMAP mailbox-name converter: its name
 * "IMAP-mailbox-name", a zero placeholder for the CCSID, and the
 * UCNV_IBM / UCNV_IMAP_MAILBOX identifiers.
 *
 * NOTE(review): "1, 4" looks like the minimum and maximum number of bytes per
 * character, and "{ 0x3f, 0, 0, 0 }, 1" like a one-byte '?' substitution
 * character with its length - confirm against the UConverterStaticData
 * declaration, which is not part of this file.
 */
1463 static const UConverterStaticData _IMAPStaticData={
1464     sizeof(UConverterStaticData),
1465     "IMAP-mailbox-name",
1466     0, /* TODO CCSID for IMAP-mailbox-name */
1467     UCNV_IBM, UCNV_IMAP_MAILBOX,
1468     1, 4,
1469     { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1470     FALSE, FALSE,
1471     0,
1472     0,
1473     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1474 };
1475
/*
 * Shared data object for the IMAP mailbox-name converter, built from
 * _IMAPStaticData and _IMAPImpl. It is not declared static, so it is visible
 * to other translation units.
 */
1476 const UConverterSharedData _IMAPData=
1477         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl);
1478
1479 #endif