icuSources/common/ucnv_u7.c

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2002-2010, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnv_u7.c
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2002jul01
  12 *   created by: Markus W. Scherer
  13 *
  14 *   UTF-7 converter implementation. Used to be in ucnv_utf.c.
  15 */
  16
  17 #include "unicode/utypes.h"
  18
  19 #if !UCONFIG_NO_CONVERSION
  20
  21 #include "unicode/ucnv.h"
  22 #include "ucnv_bld.h"
  23 #include "ucnv_cnv.h"
  24
  25 /* UTF-7 -------------------------------------------------------------------- */
  26
  27 /*
  28  * UTF-7 is a stateful encoding of Unicode.
  29  * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
  30  * It was intended for use in Internet email systems, using in its bytewise
  31  * encoding only a subset of 7-bit US-ASCII.
  32  * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
  33  * occasionally used.
  34  *
  35  * When converting Unicode to UTF-7, the RFC allows some US-ASCII characters
  36  * to be encoded either directly or in base64. In particular, the characters
  37  * in set O as defined in the RFC (see below) may be encoded directly but are
  38  * not allowed in, e.g., email headers.
  39  * By default, the ICU UTF-7 converter encodes set O directly.
  40  * By choosing the option "version=1", set O will be escaped instead.
  41  * For example:
  42  *     utf7Converter=ucnv_open("UTF-7,version=1");
  43  *
  44  * For details about email headers see RFC 2047.
  45  */
  46
  47 /*
  48  * Tests for US-ASCII characters belonging to character classes
  49  * defined in UTF-7.
  50  *
  51  * Set D (directly encoded characters) consists of the following
  52  * characters: the upper and lower case letters A through Z
  53  * and a through z, the 10 digits 0-9, and the following nine special
  54  * characters (note that "+" and "=" are omitted):
  55  *     '(),-./:?
  56  *
  57  * Set O (optional direct characters) consists of the following
  58  * characters (note that "\" and "~" are omitted):
  59  *     !"#$%&*;<=>@[]^_`{|}
  60  *
  61  * According to the rules in RFC 2152, the byte values for the following
  62  * US-ASCII characters are not used in UTF-7 and are therefore illegal:
  63  * - all C0 control codes except for CR LF TAB
  64  * - BACKSLASH
  65  * - TILDE
  66  * - DEL
  67  * - all codes beyond US-ASCII, i.e. all >127
  68  */
  69 #define inSetD(c) \
  70     ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \
  71      (uint8_t)((c)-48)<10 ||    /* digits */ \
  72      (uint8_t)((c)-39)<3 ||     /* '() */ \
  73      (uint8_t)((c)-44)<4 ||     /* ,-./ */ \
  74      (c)==58 || (c)==63         /* :? */ \
  75     )
  76
  77 #define inSetO(c) \
  78     ((uint8_t)((c)-33)<6 ||         /* !"#$%& */ \
  79      (uint8_t)((c)-59)<4 ||         /* ;<=> */ \
  80      (uint8_t)((c)-93)<4 ||         /* ]^_` */ \
  81      (uint8_t)((c)-123)<3 ||        /* {|} */ \
  82      (c)==42 || (c)==64 || (c)==91  /* *@[ */ \
  83     )
  84
  85 #define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9)
  86 #define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9)
  87
  88 #define PLUS  43
  89 #define MINUS 45
  90 #define BACKSLASH 92
  91 #define TILDE 126
  92
  93 /* legal byte values: all US-ASCII graphic characters from space to before tilde, and CR LF TAB */
  94 #define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
  95
  96 /*
      * encode directly sets D and O and CR LF SP TAB
      *
      * Indexed by a US-ASCII code 0..127; a nonzero entry means that
      * _UTF7FromUnicodeWithOffsets() writes the character as a single byte
      * instead of base64-encoding it. This table is selected when the version
      * bits of fromUnicodeStatus are 0.
      * The entry for '+' (0x2b) is 0; in direct mode that character is handled
      * by a separate branch that outputs "+-".
      */
  97 static const UBool encodeDirectlyMaximum[128]={
  98  /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
  99     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* 0x00: only TAB LF CR */
 100     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10: none */
 101
 102     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, /* 0x20: all except '+' */
 103     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30: digits and :;<=>? */
 104
 105     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40: @ and A-O */
 106     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, /* 0x50: P-Z and []^_ but not backslash */
 107
 108     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60: ` and a-o */
 109     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0  /* 0x70: p-z and {|} but not tilde, DEL */
 110 };
 111
 112 /*
      * encode directly set D and CR LF SP TAB but not set O
      *
      * Same indexing and meaning as encodeDirectlyMaximum; this table is
      * selected by _UTF7FromUnicodeWithOffsets() when the version bits of
      * fromUnicodeStatus are nonzero ("version=1"), so that set O characters
      * get base64-encoded.
      */
 113 static const UBool encodeDirectlyRestricted[128]={
 114  /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
 115     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* 0x00: only TAB LF CR */
 116     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10: none */
 117
 118     1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, /* 0x20: SP '() ,-./ */
 119     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 0x30: digits : ? */
 120
 121     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40: A-O */
 122     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x50: P-Z */
 123
 124     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60: a-o */
 125     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0  /* 0x70: p-z */
 126 };
 127
 128 static const uint8_t
 129 toBase64[64]={
         /*
          * Maps a 6-bit value (0..63) to the US-ASCII code of its base64 digit.
          * The digits are written as numeric codes, not as character literals.
          */
 130     /* A-Z */
 131     65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
 132     78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
 133     /* a-z */
 134     97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
 135     110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
 136     /* 0-9 */
 137     48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
 138     /* +/ */
 139     43, 47
 140 };
 141
 142 static const int8_t
 143 fromBase64[128]={
         /*
          * Classifies a US-ASCII byte (0..127) for the decoder:
          *   0..63  the 6-bit value of a base64 digit
          *   -1     a legal UTF-7 byte that is not a base64 digit
          *   -2     the minus sign, which terminates a base64 sequence
          *   -3     a byte that is illegal in UTF-7
          * The encoder also consults this table: a value other than -1 for the
          * character following a base64 run makes it emit a terminating '-'.
          */
 144     /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
 145     -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
 146     -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
 147
 148     /* general punctuation with + and / and a special value (-2) for - */
 149     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
 150     /* digits */
 151     52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
 152
 153     /* A-Z */
 154     -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
 155     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,
 156
 157     /* a-z */
 158     -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
 159     41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
 160 };
 161
 162 /*
 163  * converter status values:
 164  *
 165  * toUnicodeStatus:
 166  *     24 inDirectMode (boolean)
 167  * 23..16 base64Counter (-1..7)
 168  * 15..0  bits (up to 14 bits incoming base64)
 169  *
 170  * fromUnicodeStatus:
 171  * 31..28 version (0: set O direct  1: set O escaped)
 172  *     24 inDirectMode (boolean)
 173  * 23..16 base64Counter (0..2)
 174  *  7..0  bits (6 bits outgoing base64)
 175  *
 176  */
 177
 178 static void
 179 _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
 180     if(choice<=UCNV_RESET_TO_UNICODE) {
 181         /* reset toUnicode */
 182         cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
 183         cnv->toULength=0;
 184     }
 185     if(choice!=UCNV_RESET_TO_UNICODE) {
 186         /* reset fromUnicode */
 187         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
 188     }
 189 }
 190
 191 static void
 192 _UTF7Open(UConverter *cnv,
 193           UConverterLoadArgs *pArgs,
 194           UErrorCode *pErrorCode) {
 195     if(UCNV_GET_VERSION(cnv)<=1) {
 196         /* TODO(markus): Should just use cnv->options rather than copying the version number. */
 197         cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
 198         _UTF7Reset(cnv, UCNV_RESET_BOTH);
 199     } else {
 200         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 201     }
 202 }
 203
 204 static void
 205 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 206                           UErrorCode *pErrorCode) {
         /*
          * Decodes UTF-7 bytes from pArgs->source into UTF-16 code units at
          * pArgs->target, alternating between Direct Mode and Unicode (base64)
          * Mode via the directMode/unicodeMode labels.
          *
          * If pArgs->offsets is not NULL, one source index is written per output
          * UChar. The decoder state (mode flag, base64Counter, bits) is unpacked
          * from cnv->toUnicodeStatus on entry and packed back before returning;
          * bytes of a not yet completed sequence are kept in cnv->toUBytes with
          * their count in cnv->toULength.
          *
          * *pErrorCode is set to U_ILLEGAL_CHAR_FOUND for an illegal byte or an
          * incomplete base64 sequence, and to U_BUFFER_OVERFLOW_ERROR when the
          * target is full while source bytes remain.
          * pArgs->source, pArgs->target and pArgs->offsets are updated on return.
          */
 207     UConverter *cnv;
 208     const uint8_t *source, *sourceLimit;
 209     UChar *target;
 210     const UChar *targetLimit;
 211     int32_t *offsets;
 212
 213     uint8_t *bytes;
 214     uint8_t byteIndex;
 215
 216     int32_t length, targetCapacity;
 217
 218     /* UTF-7 state */
 219     uint16_t bits;
 220     int8_t base64Counter;
 221     UBool inDirectMode;
 222
 223     int8_t base64Value;
 224
 225     int32_t sourceIndex, nextSourceIndex;
 226
 227     uint8_t b;
 228     /* set up the local pointers */
 229     cnv=pArgs->converter;
 230
 231     source=(const uint8_t *)pArgs->source;
 232     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
 233     target=pArgs->target;
 234     targetLimit=pArgs->targetLimit;
 235     offsets=pArgs->offsets;
 236     /* get the state machine state */
 237     {
             /* bit 24: inDirectMode, bits 23..16: base64Counter, bits 15..0: bits */
 238         uint32_t status=cnv->toUnicodeStatus;
 239         inDirectMode=(UBool)((status>>24)&1);
 240         base64Counter=(int8_t)(status>>16);
 241         bits=(uint16_t)status;
 242     }
 243     bytes=cnv->toUBytes;
 244     byteIndex=cnv->toULength;
 245
 246     /* sourceIndex=-1 if the current character began in the previous buffer */
 247     sourceIndex=byteIndex==0 ? 0 : -1;
 248     nextSourceIndex=0;
 249
 250     if(inDirectMode) {
 251 directMode:
 252         /*
 253          * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
 254          * with their US-ASCII byte values.
 255          * Backslash and Tilde and most control characters are not allowed in UTF-7.
 256          * A plus sign starts Unicode (or "escape") Mode.
 257          *
 258          * In Direct Mode, only the sourceIndex is used.
 259          */
 260         byteIndex=0;
             /*
              * Each directly encoded byte yields exactly one UChar, so the loop
              * runs for at most min(remaining source, remaining target) bytes
              * and needs no per-byte limit checks.
              */
 261         length=(int32_t)(sourceLimit-source);
 262         targetCapacity=(int32_t)(targetLimit-target);
 263         if(length>targetCapacity) {
 264             length=targetCapacity;
 265         }
 266         while(length>0) {
 267             b=*source++;
 268             if(!isLegalUTF7(b)) {
 269                 /* illegal */
 270                 bytes[0]=b;
 271                 byteIndex=1;
 272                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 273                 break;
 274             } else if(b!=PLUS) {
 275                 /* write directly encoded character */
 276                 *target++=b;
 277                 if(offsets!=NULL) {
 278                     *offsets++=sourceIndex++;
 279                 }
 280             } else /* PLUS */ {
 281                 /* switch to Unicode mode */
                     /* base64Counter==-1 marks "immediately after the +" */
 282                 nextSourceIndex=++sourceIndex;
 283                 inDirectMode=FALSE;
 284                 byteIndex=0;
 285                 bits=0;
 286                 base64Counter=-1;
 287                 goto unicodeMode;
 288             }
 289             --length;
 290         }
 291         if(source<sourceLimit && target>=targetLimit) {
 292             /* target is full */
 293             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 294         }
 295     } else {
 296 unicodeMode:
 297         /*
 298          * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
 299          * The base64 sequence ends with any character that is not in the base64 alphabet.
 300          * A terminating minus sign is consumed.
 301          *
 302          * In Unicode Mode, the sourceIndex has the index to the start of the current
 303          * base64 bytes, while nextSourceIndex is precisely parallel to source,
 304          * keeping the index to the following byte.
 305          * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
 306          */
 307         while(source<sourceLimit) {
 308             if(target<targetLimit) {
 309                 bytes[byteIndex++]=b=*source++;
 310                 ++nextSourceIndex;
 311                 base64Value = -3; /* initialize as illegal */
                     /*
                      * b>=126 is tested first, so fromBase64[] (128 entries) is
                      * only indexed with b<=125; for b>=126 base64Value stays -3.
                      */
 312                 if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
 313                     /* either
 314                      * base64Value==-1 for any legal character except base64 and minus sign, or
 315                      * base64Value==-3 for illegal characters:
 316                      * 1. In either case, leave Unicode mode.
 317                      * 2.1. If we ended with an incomplete UChar or none after the +, then
 318                      *      generate an error for the preceding erroneous sequence and deal with
 319                      *      the current (possibly illegal) character next time through.
 320                      * 2.2. Else the current char comes after a complete UChar, which was already
 321                      *      pushed to the output buf, so:
 322                      * 2.2.1. If the current char is legal, just save it for processing next time.
 323                      *        It may be for example, a plus which we need to deal with in direct mode.
 324                      * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
 325                      */
 326                     inDirectMode=TRUE;
 327                     if(base64Counter==-1) {
 328                         /* illegal: + immediately followed by something other than base64 or minus sign */
 329                         /* include the plus sign in the reported sequence, but not the subsequent char */
 330                         --source;
 331                         bytes[0]=PLUS;
 332                         byteIndex=1;
 333                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 334                         break;
 335                     } else if(bits!=0) {
 336                         /* bits are illegally left over, a UChar is incomplete */
 337                         /* don't include current char (legal or illegal) in error seq */
 338                         --source;
 339                         --byteIndex;
 340                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 341                         break;
 342                     } else {
 343                         /* previous UChar was complete */
 344                         if (base64Value==-3) {
 345                             /* current character is illegal, deal with it here */
 346                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 347                             break;
 348                         } else {
 349                             /* un-read the current character in case it is a plus sign */
 350                             --source;
 351                             sourceIndex=nextSourceIndex-1;
 352                             goto directMode;
 353                         }
 354                     }
 355                 } else if(base64Value>=0) {
 356                     /* collect base64 bytes into UChars */
                         /*
                          * base64Counter is the position within a group of 8 base64
                          * digits (48 bits = 3 UChars). A UChar is completed at
                          * positions 2, 5 and 7; the other positions only
                          * accumulate 6 more bits.
                          */
 357                     switch(base64Counter) {
 358                     case -1: /* -1 is immediately after the + */
 359                     case 0:
 360                         bits=base64Value;
 361                         base64Counter=1;
 362                         break;
 363                     case 1:
 364                     case 3:
 365                     case 4:
 366                     case 6:
 367                         bits=(uint16_t)((bits<<6)|base64Value);
 368                         ++base64Counter;
 369                         break;
 370                     case 2:
                             /* 12 collected bits + top 4 of this digit; keep its low 2 bits */
 371                         *target++=(UChar)((bits<<4)|(base64Value>>2));
 372                         if(offsets!=NULL) {
 373                             *offsets++=sourceIndex;
 374                             sourceIndex=nextSourceIndex-1;
 375                         }
 376                         bytes[0]=b; /* keep this byte in case an error occurs */
 377                         byteIndex=1;
 378                         bits=(uint16_t)(base64Value&3);
 379                         base64Counter=3;
 380                         break;
 381                     case 5:
                             /* 14 collected bits + top 2 of this digit; keep its low 4 bits */
 382                         *target++=(UChar)((bits<<2)|(base64Value>>4));
 383                         if(offsets!=NULL) {
 384                             *offsets++=sourceIndex;
 385                             sourceIndex=nextSourceIndex-1;
 386                         }
 387                         bytes[0]=b; /* keep this byte in case an error occurs */
 388                         byteIndex=1;
 389                         bits=(uint16_t)(base64Value&15);
 390                         base64Counter=6;
 391                         break;
 392                     case 7:
                             /* 10 collected bits + all 6 of this digit; nothing is left over */
 393                         *target++=(UChar)((bits<<6)|base64Value);
 394                         if(offsets!=NULL) {
 395                             *offsets++=sourceIndex;
 396                             sourceIndex=nextSourceIndex;
 397                         }
 398                         byteIndex=0;
 399                         bits=0;
 400                         base64Counter=0;
 401                         break;
 402                     default:
 403                         /* will never occur */
 404                         break;
 405                     }
 406                 } else /*base64Value==-2*/ {
 407                     /* minus sign terminates the base64 sequence */
 408                     inDirectMode=TRUE;
 409                     if(base64Counter==-1) {
 410                         /* +- i.e. a minus immediately following a plus */
 411                         *target++=PLUS;
 412                         if(offsets!=NULL) {
 413                             *offsets++=sourceIndex-1;
 414                         }
 415                     } else {
 416                         /* absorb the minus and leave the Unicode Mode */
 417                         if(bits!=0) {
 418                             /* bits are illegally left over, a UChar is incomplete */
 419                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 420                             break;
 421                         }
 422                     }
 423                     sourceIndex=nextSourceIndex;
 424                     goto directMode;
 425                 }
 426             } else {
 427                 /* target is full */
 428                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 429                 break;
 430             }
 431         }
 432     }
 433
 434     if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
 435         /*
 436          * if we are in Unicode mode, then the byteIndex might not be 0,
 437          * but that is ok if bits==0
 438          * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
 439          * (not true for IMAP-mailbox-name where we must end in direct mode)
 440          */
 441         byteIndex=0;
 442     }
 443
 444     /* set the converter state back into UConverter */
         /* base64Counter goes through uint8_t so that -1 does not spill into the upper status bits */
 445     cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
 446     cnv->toULength=byteIndex;
 447
 448     /* write back the updated pointers */
 449     pArgs->source=(const char *)source;
 450     pArgs->target=target;
 451     pArgs->offsets=offsets;
 452     return;
 453 }
 454
 455 static void
 456 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
 457                             UErrorCode *pErrorCode) {
         /*
          * Encodes UTF-16 code units from pArgs->source as UTF-7 bytes at
          * pArgs->target, alternating between Direct Mode and Unicode (base64)
          * Mode via the directMode/unicodeMode labels.
          *
          * If pArgs->offsets is not NULL, one source index is written per output
          * byte. The encoder state (mode flag, base64Counter, bits) is unpacked
          * from cnv->fromUnicodeStatus on entry; the version stored in its top
          * bits selects which characters are written directly.
          *
          * Output bytes of a unit that no longer fit into the target are stored
          * in cnv->charErrorBuffer (count in cnv->charErrorBufferLength) and
          * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR.
          * With pArgs->flush set and all source consumed, leftover base64 bits
          * are written out and the state is reset to direct mode.
          * pArgs->source, pArgs->target and pArgs->offsets are updated on return.
          */
 458     UConverter *cnv;
 459     const UChar *source, *sourceLimit;
 460     uint8_t *target, *targetLimit;
 461     int32_t *offsets;
 462
 463     int32_t length, targetCapacity, sourceIndex;
 464     UChar c;
 465
 466     /* UTF-7 state */
 467     const UBool *encodeDirectly;
 468     uint8_t bits;
 469     int8_t base64Counter;
 470     UBool inDirectMode;
 471
 472     /* set up the local pointers */
 473     cnv=pArgs->converter;
 474
 475     /* set up the local pointers */
 476     source=pArgs->source;
 477     sourceLimit=pArgs->sourceLimit;
 478     target=(uint8_t *)pArgs->target;
 479     targetLimit=(uint8_t *)pArgs->targetLimit;
 480     offsets=pArgs->offsets;
 481
 482     /* get the state machine state */
 483     {
 484         uint32_t status=cnv->fromUnicodeStatus;
             /* status<0x10000000 means that the version bits 31..28 are all 0 */
 485         encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
 486         inDirectMode=(UBool)((status>>24)&1);
 487         base64Counter=(int8_t)(status>>16);
 488         bits=(uint8_t)status;
 489     }
 490
 491     /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
 492     sourceIndex=0;
 493
 494     if(inDirectMode) {
 495 directMode:
             /*
              * A directly encoded unit produces exactly one byte, so the loop
              * runs for at most min(remaining source, remaining target) units.
              * Branches that write more than one byte per unit leave the loop
              * (goto or break) instead of continuing with the stale count.
              */
 496         length=(int32_t)(sourceLimit-source);
 497         targetCapacity=(int32_t)(targetLimit-target);
 498         if(length>targetCapacity) {
 499             length=targetCapacity;
 500         }
 501         while(length>0) {
 502             c=*source++;
 503             /* currently always encode CR LF SP TAB directly */
 504             if(c<=127 && encodeDirectly[c]) {
 505                 /* encode directly */
 506                 *target++=(uint8_t)c;
 507                 if(offsets!=NULL) {
 508                     *offsets++=sourceIndex++;
 509                 }
 510             } else if(c==PLUS) {
 511                 /* output +- for + */
 512                 *target++=PLUS;
 513                 if(target<targetLimit) {
 514                     *target++=MINUS;
 515                     if(offsets!=NULL) {
 516                         *offsets++=sourceIndex;
 517                         *offsets++=sourceIndex++;
 518                     }
 519                     /* realign length and targetCapacity */
 520                     goto directMode;
 521                 } else {
 522                     if(offsets!=NULL) {
 523                         *offsets++=sourceIndex++;
 524                     }
 525                     cnv->charErrorBuffer[0]=MINUS;
 526                     cnv->charErrorBufferLength=1;
 527                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 528                     break;
 529                 }
 530             } else {
 531                 /* un-read this character and switch to Unicode Mode */
 532                 --source;
 533                 *target++=PLUS;
 534                 if(offsets!=NULL) {
 535                     *offsets++=sourceIndex;
 536                 }
 537                 inDirectMode=FALSE;
 538                 base64Counter=0;
 539                 goto unicodeMode;
 540             }
 541             --length;
 542         }
 543         if(source<sourceLimit && target>=targetLimit) {
 544             /* target is full */
 545             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 546         }
 547     } else {
 548 unicodeMode:
 549         while(source<sourceLimit) {
 550             if(target<targetLimit) {
 551                 c=*source++;
 552                 if(c<=127 && encodeDirectly[c]) {
 553                     /* encode directly */
 554                     inDirectMode=TRUE;
 555
 556                     /* trick: back out this character to make this easier */
 557                     --source;
 558
 559                     /* terminate the base64 sequence */
 560                     if(base64Counter!=0) {
 561                         /* write remaining bits for the previous character */
 562                         *target++=toBase64[bits];
 563                         if(offsets!=NULL) {
 564                             *offsets++=sourceIndex-1;
 565                         }
 566                     }
                         /*
                          * fromBase64[c]!=-1 here means that c is a base64 digit or
                          * the minus sign; the base64 run is then closed with an
                          * explicit minus before c is written in direct mode.
                          */
 567                     if(fromBase64[c]!=-1) {
 568                         /* need to terminate with a minus */
 569                         if(target<targetLimit) {
 570                             *target++=MINUS;
 571                             if(offsets!=NULL) {
 572                                 *offsets++=sourceIndex-1;
 573                             }
 574                         } else {
 575                             cnv->charErrorBuffer[0]=MINUS;
 576                             cnv->charErrorBufferLength=1;
 577                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 578                             break;
 579                         }
 580                     }
 581                     goto directMode;
 582                 } else {
 583                     /*
 584                      * base64 this character:
 585                      * Output 2 or 3 base64 bytes for the remaining bits of the previous character
 586                      * and the bits of this character, each implicitly in UTF-16BE.
 587                      *
 588                      * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
 589                      * character to the next. The actual 2 or 4 bits are shifted to the left edge
 590                      * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
 591                      */
                         /*
                          * base64Counter cycles 0 -> 1 -> 2 -> 0 over three UChars
                          * (48 bits = 8 base64 digits). On overflow inside a case the
                          * unwritten digits go to cnv->charErrorBuffer, and the loop
                          * ends at the next source/target check.
                          */
 592                     switch(base64Counter) {
 593                     case 0:
                             /* no leftover bits: write the top 12 bits, keep the low 4 */
 594                         *target++=toBase64[c>>10];
 595                         if(target<targetLimit) {
 596                             *target++=toBase64[(c>>4)&0x3f];
 597                             if(offsets!=NULL) {
 598                                 *offsets++=sourceIndex;
 599                                 *offsets++=sourceIndex++;
 600                             }
 601                         } else {
 602                             if(offsets!=NULL) {
 603                                 *offsets++=sourceIndex++;
 604                             }
 605                             cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
 606                             cnv->charErrorBufferLength=1;
 607                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 608                         }
 609                         bits=(uint8_t)((c&15)<<2);
 610                         base64Counter=1;
 611                         break;
 612                     case 1:
                             /* 4 leftover bits + 16 new: write 18 bits as 3 digits, keep the low 2 */
 613                         *target++=toBase64[bits|(c>>14)];
 614                         if(target<targetLimit) {
 615                             *target++=toBase64[(c>>8)&0x3f];
 616                             if(target<targetLimit) {
 617                                 *target++=toBase64[(c>>2)&0x3f];
 618                                 if(offsets!=NULL) {
 619                                     *offsets++=sourceIndex;
 620                                     *offsets++=sourceIndex;
 621                                     *offsets++=sourceIndex++;
 622                                 }
 623                             } else {
 624                                 if(offsets!=NULL) {
 625                                     *offsets++=sourceIndex;
 626                                     *offsets++=sourceIndex++;
 627                                 }
 628                                 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
 629                                 cnv->charErrorBufferLength=1;
 630                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 631                             }
 632                         } else {
 633                             if(offsets!=NULL) {
 634                                 *offsets++=sourceIndex++;
 635                             }
 636                             cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
 637                             cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
 638                             cnv->charErrorBufferLength=2;
 639                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 640                         }
 641                         bits=(uint8_t)((c&3)<<4);
 642                         base64Counter=2;
 643                         break;
 644                     case 2:
                             /* 2 leftover bits + 16 new: write all 18 bits as 3 digits, nothing is kept */
 645                         *target++=toBase64[bits|(c>>12)];
 646                         if(target<targetLimit) {
 647                             *target++=toBase64[(c>>6)&0x3f];
 648                             if(target<targetLimit) {
 649                                 *target++=toBase64[c&0x3f];
 650                                 if(offsets!=NULL) {
 651                                     *offsets++=sourceIndex;
 652                                     *offsets++=sourceIndex;
 653                                     *offsets++=sourceIndex++;
 654                                 }
 655                             } else {
 656                                 if(offsets!=NULL) {
 657                                     *offsets++=sourceIndex;
 658                                     *offsets++=sourceIndex++;
 659                                 }
 660                                 cnv->charErrorBuffer[0]=toBase64[c&0x3f];
 661                                 cnv->charErrorBufferLength=1;
 662                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 663                             }
 664                         } else {
 665                             if(offsets!=NULL) {
 666                                 *offsets++=sourceIndex++;
 667                             }
 668                             cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
 669                             cnv->charErrorBuffer[1]=toBase64[c&0x3f];
 670                             cnv->charErrorBufferLength=2;
 671                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 672                         }
 673                         bits=0;
 674                         base64Counter=0;
 675                         break;
 676                     default:
 677                         /* will never occur */
 678                         break;
 679                     }
 680                 }
 681             } else {
 682                 /* target is full */
 683                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 684                 break;
 685             }
 686         }
 687     }
 688
 689     if(pArgs->flush && source>=sourceLimit) {
 690         /* flush remaining bits to the target */
 691         if(!inDirectMode && base64Counter!=0) {
 692             if(target<targetLimit) {
 693                 *target++=toBase64[bits];
 694                 if(offsets!=NULL) {
 695                     *offsets++=sourceIndex-1;
 696                 }
 697             } else {
                     /* append after any digits that an overflow above already stored */
 698                 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
 699                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 700             }
 701         }
 702         /* reset the state for the next conversion */
 703         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
 704     } else {
 705         /* set the converter state back into UConverter */
 706         cnv->fromUnicodeStatus=
 707             (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
 708             ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
 709     }
 710
 711     /* write back the updated pointers */
 712     pArgs->source=source;
 713     pArgs->target=(char *)target;
 714     pArgs->offsets=offsets;
 715     return;
 716 }
 717
 718 static const char *
 719 _UTF7GetName(const UConverter *cnv) {
 720     switch(cnv->fromUnicodeStatus>>28) {
 721     case 1:
 722         return "UTF-7,version=1";
 723     default:
 724         return "UTF-7";
 725     }
 726 }
 727
 728 static const UConverterImpl _UTF7Impl={
         /*
          * Function table for the UTF-7 converter, filled in positionally.
          * NOTE(review): the role of each slot is defined by the UConverterImpl
          * declaration, which is not visible in this file (presumably in
          * ucnv_cnv.h) -- confirm slot meanings there before reordering.
          * Visible here: the ...WithOffsets functions are used for both of the
          * consecutive slots of each conversion direction, and NULL marks slots
          * for which this converter supplies no function.
          */
 729     UCNV_UTF7,
 730
 731     NULL,
 732     NULL,
 733
 734     _UTF7Open,
 735     NULL,
 736     _UTF7Reset,
 737
 738     _UTF7ToUnicodeWithOffsets,
 739     _UTF7ToUnicodeWithOffsets,
 740     _UTF7FromUnicodeWithOffsets,
 741     _UTF7FromUnicodeWithOffsets,
 742     NULL,
 743
 744     NULL,
 745     _UTF7GetName,
 746     NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
 747     NULL,
 748     ucnv_getCompleteUnicodeSet
 749 };
 750
 751 static const UConverterStaticData _UTF7StaticData={
         /*
          * Constant description of the UTF-7 converter, filled in positionally.
          * NOTE(review): field meanings come from the UConverterStaticData
          * declaration, which is not visible here; the unlabeled values below
          * ("1, 4", the two FALSE flags, the two zeros) should be checked
          * against that declaration -- "1, 4" is presumably the minimum and
          * maximum number of bytes per character, TODO confirm.
          */
 752     sizeof(UConverterStaticData),
 753     "UTF-7",
 754     0, /* TODO CCSID for UTF-7 */
 755     UCNV_IBM, UCNV_UTF7,
 756     1, 4,
 757     { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
 758     FALSE, FALSE,
 759     0,
 760     0,
 761     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 762 };
 763
/*
 * Shared data object for the UTF-7 converter; it ties the static description
 * (_UTF7StaticData) to the implementation table (_UTF7Impl).
 * It has external linkage, so it is presumably referenced from the converter
 * registry elsewhere in the library - confirm against the callers.
 * NOTE(review): the remaining positional fields are defined by
 * UConverterSharedData, which is declared outside this file.
 */
const UConverterSharedData _UTF7Data={
    sizeof(UConverterSharedData), ~((uint32_t)0),
    NULL, NULL, &_UTF7StaticData, FALSE, &_UTF7Impl,
    0
};
 769
 770 /* IMAP mailbox name encoding ----------------------------------------------- */
 771
 772 /*
 773  * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
 774  * http://www.ietf.org/rfc/rfc2060.txt
 775  *
 776  * 5.1.3.  Mailbox International Naming Convention
 777  *
 778  * By convention, international mailbox names are specified using a
 779  * modified version of the UTF-7 encoding described in [UTF-7].  The
 780  * purpose of these modifications is to correct the following problems
 781  * with UTF-7:
 782  *
 783  *    1) UTF-7 uses the "+" character for shifting; this conflicts with
 784  *       the common use of "+" in mailbox names, in particular USENET
 785  *       newsgroup names.
 786  *
 787  *    2) UTF-7's encoding is BASE64 which uses the "/" character; this
 788  *       conflicts with the use of "/" as a popular hierarchy delimiter.
 789  *
 790  *    3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
 791  *       the use of "\" as a popular hierarchy delimiter.
 792  *
 793  *    4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
 794  *       the use of "~" in some servers as a home directory indicator.
 795  *
 796  *    5) UTF-7 permits multiple alternate forms to represent the same
 797  *       string; in particular, printable US-ASCII chararacters can be
 798  *       represented in encoded form.
 799  *
 800  * In modified UTF-7, printable US-ASCII characters except for "&"
 801  * represent themselves; that is, characters with octet values 0x20-0x25
 802  * and 0x27-0x7e.  The character "&" (0x26) is represented by the two-
 803  * octet sequence "&-".
 804  *
 805  * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
 806  * Unicode 16-bit octets) are represented in modified BASE64, with a
 807  * further modification from [UTF-7] that "," is used instead of "/".
 808  * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
 809  * character which can represent itself.
 810  *
 811  * "&" is used to shift to modified BASE64 and "-" to shift back to US-
 812  * ASCII.  All names start in US-ASCII, and MUST end in US-ASCII (that
 813  * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
 814  * ").
 815  *
 816  * For example, here is a mailbox name which mixes English, Japanese,
 817  * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
 818  */
 819
 820 /*
 821  * Tests for US-ASCII characters belonging to character classes
 822  * defined in UTF-7.
 823  *
 824  * Set D (directly encoded characters) consists of the following
 825  * characters: the upper and lower case letters A through Z
 826  * and a through z, the 10 digits 0-9, and the following nine special
 827  * characters (note that "+" and "=" are omitted):
 828  *     '(),-./:?
 829  *
 830  * Set O (optional direct characters) consists of the following
 831  * characters (note that "\" and "~" are omitted):
 832  *     !"#$%&*;<=>@[]^_`{|}
 833  *
 834  * According to the rules in RFC 2152, the byte values for the following
 835  * US-ASCII characters are not used in UTF-7 and are therefore illegal:
 836  * - all C0 control codes except for CR LF TAB
 837  * - BACKSLASH
 838  * - TILDE
 839  * - DEL
 840  * - all codes beyond US-ASCII, i.e. all >127
 841  */
 842
 843 /* uses '&' not '+' to start a base64 sequence */
 844 #define AMPERSAND 0x26
 845 #define COMMA 0x2c
 846 #define SLASH 0x2f
 847
 848 /* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
 849 #define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)
 850
 851 /* direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26 */
 852 #define inSetDIMAP(c) (isLegalIMAP(c) && c!=AMPERSAND)
 853
 854 #define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
 855 #define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
 856
 857 /*
 858  * converter status values:
 859  *
 860  * toUnicodeStatus:
 861  *     24 inDirectMode (boolean)
 862  * 23..16 base64Counter (-1..7)
 863  * 15..0  bits (up to 14 bits incoming base64)
 864  *
 865  * fromUnicodeStatus:
 866  *     24 inDirectMode (boolean)
 867  * 23..16 base64Counter (0..2)
 868  *  7..0  bits (6 bits outgoing base64)
 869  *
 870  * ignore bits 31..25
 871  */
 872
/*
 * Convert IMAP mailbox name bytes (modified UTF-7) to UTF-16 code units.
 *
 * Reads bytes from pArgs->source up to pArgs->sourceLimit and writes UChars
 * to pArgs->target up to pArgs->targetLimit. If pArgs->offsets is not NULL,
 * one source index is written for each output UChar.
 *
 * The decoder state (direct/Unicode mode, base64 counter, pending bits) is
 * loaded from cnv->toUnicodeStatus and the bytes of a pending sequence from
 * cnv->toUBytes/cnv->toULength. Both are stored back before returning,
 * together with the updated source, target and offsets pointers.
 *
 * *pErrorCode is set to
 * - U_ILLEGAL_CHAR_FOUND for a byte outside 0x20..0x7e in direct mode,
 *   for a byte inside a base64 run that is neither a modified-base64
 *   character nor the terminating '-', for a base64-encoded UChar that lies
 *   in 0x20..0x7e (it should have been written directly), and for leftover
 *   bits or an incomplete UChar at the terminating '-';
 * - U_BUFFER_OVERFLOW_ERROR when the target is full while input remains;
 * - U_TRUNCATED_CHAR_FOUND when pArgs->flush is set, the input is used up
 *   and the decoder is still in Unicode mode with no pending bytes.
 */
static void
_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                          UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit;
    UChar *target;
    const UChar *targetLimit;
    int32_t *offsets;

    /* bytes of the sequence currently being decoded, kept for error reporting */
    uint8_t *bytes;
    uint8_t byteIndex;

    int32_t length, targetCapacity;

    /* UTF-7 state */
    uint16_t bits;
    int8_t base64Counter;
    UBool inDirectMode;

    int8_t base64Value;

    int32_t sourceIndex, nextSourceIndex;

    UChar c;
    uint8_t b;

    /* set up the local pointers */
    cnv=pArgs->converter;

    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    target=pArgs->target;
    targetLimit=pArgs->targetLimit;
    offsets=pArgs->offsets;
    /* get the state machine state */
    {
        uint32_t status=cnv->toUnicodeStatus;
        inDirectMode=(UBool)((status>>24)&1);
        base64Counter=(int8_t)(status>>16);
        bits=(uint16_t)status;
    }
    bytes=cnv->toUBytes;
    byteIndex=cnv->toULength;

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex=byteIndex==0 ? 0 : -1;
    nextSourceIndex=0;

    if(inDirectMode) {
directMode:
        /*
         * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
         * with their US-ASCII byte values.
         * An ampersand starts Unicode (or "escape") Mode.
         *
         * In Direct Mode, only the sourceIndex is used.
         */
        byteIndex=0;
        /*
         * Each direct byte produces exactly one UChar, so the loop is bounded
         * by the smaller of the remaining input and the remaining output.
         */
        length=(int32_t)(sourceLimit-source);
        targetCapacity=(int32_t)(targetLimit-target);
        if(length>targetCapacity) {
            length=targetCapacity;
        }
        while(length>0) {
            b=*source++;
            if(!isLegalIMAP(b)) {
                /* illegal */
                bytes[0]=b;
                byteIndex=1;
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                break;
            } else if(b!=AMPERSAND) {
                /* write directly encoded character */
                *target++=b;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex++;
                }
            } else /* AMPERSAND */ {
                /* switch to Unicode mode */
                nextSourceIndex=++sourceIndex;
                inDirectMode=FALSE;
                byteIndex=0;
                bits=0;
                base64Counter=-1;
                goto unicodeMode;
            }
            --length;
        }
        if(source<sourceLimit && target>=targetLimit) {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        }
    } else {
unicodeMode:
        /*
         * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
         * The base64 sequence ends with any character that is not in the base64 alphabet.
         * A terminating minus sign is consumed.
         * US-ASCII must not be base64-ed.
         *
         * In Unicode Mode, the sourceIndex has the index to the start of the current
         * base64 bytes, while nextSourceIndex is precisely parallel to source,
         * keeping the index to the following byte.
         * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
         */
        while(source<sourceLimit) {
            if(target<targetLimit) {
                bytes[byteIndex++]=b=*source++;
                ++nextSourceIndex;
                if(b>0x7e) {
                    /* illegal - test other illegal US-ASCII values by base64Value==-3 */
                    inDirectMode=TRUE;
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                    break;
                } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
                    /* collect base64 bytes into UChars */
                    switch(base64Counter) {
                    case -1: /* -1 is immediately after the & */
                    case 0:
                        bits=base64Value;
                        base64Counter=1;
                        break;
                    case 1:
                    case 3:
                    case 4:
                    case 6:
                        /* not enough bits for a UChar yet: append these 6 bits */
                        bits=(uint16_t)((bits<<6)|base64Value);
                        ++base64Counter;
                        break;
                    case 2:
                        /* 12 collected bits + the upper 4 bits of this value; the lower 2 bits start the next UChar */
                        c=(UChar)((bits<<4)|(base64Value>>2));
                        if(isLegalIMAP(c)) {
                            /* illegal */
                            inDirectMode=TRUE;
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                            goto endloop;
                        }
                        *target++=c;
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex;
                            sourceIndex=nextSourceIndex-1;
                        }
                        bytes[0]=b; /* keep this byte in case an error occurs */
                        byteIndex=1;
                        bits=(uint16_t)(base64Value&3);
                        base64Counter=3;
                        break;
                    case 5:
                        /* 14 collected bits + the upper 2 bits of this value; the lower 4 bits start the next UChar */
                        c=(UChar)((bits<<2)|(base64Value>>4));
                        if(isLegalIMAP(c)) {
                            /* illegal */
                            inDirectMode=TRUE;
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                            goto endloop;
                        }
                        *target++=c;
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex;
                            sourceIndex=nextSourceIndex-1;
                        }
                        bytes[0]=b; /* keep this byte in case an error occurs */
                        byteIndex=1;
                        bits=(uint16_t)(base64Value&15);
                        base64Counter=6;
                        break;
                    case 7:
                        /* 10 collected bits + all 6 bits of this value; nothing carries over */
                        c=(UChar)((bits<<6)|base64Value);
                        if(isLegalIMAP(c)) {
                            /* illegal */
                            inDirectMode=TRUE;
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                            goto endloop;
                        }
                        *target++=c;
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex;
                            sourceIndex=nextSourceIndex;
                        }
                        byteIndex=0;
                        bits=0;
                        base64Counter=0;
                        break;
                    default:
                        /* will never occur */
                        break;
                    }
                } else if(base64Value==-2) {
                    /* minus sign terminates the base64 sequence */
                    inDirectMode=TRUE;
                    if(base64Counter==-1) {
                        /* &- i.e. a minus immediately following an ampersand */
                        *target++=AMPERSAND;
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex-1;
                        }
                    } else {
                        /* absorb the minus and leave the Unicode Mode */
                        if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
                            /* bits are illegally left over, a UChar is incomplete */
                            /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                            break;
                        }
                    }
                    sourceIndex=nextSourceIndex;
                    goto directMode;
                } else {
                    if(base64Counter==-1) {
                        /* illegal: & immediately followed by something other than base64 or minus sign */
                        /* include the ampersand in the reported sequence */
                        --sourceIndex;
                        bytes[0]=AMPERSAND;
                        bytes[1]=b;
                        byteIndex=2;
                    }
                    /* base64Value==-1 for characters that are illegal only in Unicode mode */
                    /* base64Value==-3 for illegal characters */
                    /* illegal */
                    inDirectMode=TRUE;
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                    break;
                }
            } else {
                /* target is full */
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }
        }
    }
endloop:

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but here we must check if we are in Unicode
     * mode and byteIndex==0 because we must end in direct mode
     *
     * conditions:
     *   successful
     *   in Unicode mode and byteIndex==0
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*pErrorCode) &&
        !inDirectMode && byteIndex==0 &&
        pArgs->flush && source>=sourceLimit
    ) {
        if(base64Counter==-1) {
            /* & at the very end of the input */
            /* make the ampersand the reported sequence */
            bytes[0]=AMPERSAND;
            byteIndex=1;
        }
        /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */

        inDirectMode=TRUE; /* avoid looping */
        *pErrorCode=U_TRUNCATED_CHAR_FOUND;
    }

    /* set the converter state back into UConverter */
    /* base64Counter may be -1: go through uint8_t so that it occupies only bits 23..16 */
    cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
    cnv->toULength=byteIndex;

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    pArgs->offsets=offsets;
    return;
}
1140
/*
 * Convert UTF-16 code units to IMAP mailbox name bytes (modified UTF-7).
 *
 * Reads UChars from pArgs->source up to pArgs->sourceLimit and writes bytes
 * to pArgs->target up to pArgs->targetLimit. If pArgs->offsets is not NULL,
 * one source index is written for each output byte that goes to the target.
 *
 * Code units in 0x20..0x7e other than '&' are written as single bytes,
 * '&' is written as "&-", and every other code unit is written as modified
 * base64 inside a run that starts with '&' and ends with '-'.
 *
 * The encoder state (direct/Unicode mode, base64 counter, pending bits) is
 * loaded from cnv->fromUnicodeStatus. When pArgs->flush is set and all input
 * has been consumed, pending bits and the terminating '-' are written and the
 * state is reset to direct mode; otherwise the state is stored back.
 * The top four bits of cnv->fromUnicodeStatus are preserved in both cases.
 *
 * When the target is full, *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR;
 * bytes that belong to an already consumed code unit but did not fit are
 * stored in cnv->charErrorBuffer.
 */
static void
_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                            UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source, *sourceLimit;
    uint8_t *target, *targetLimit;
    int32_t *offsets;

    int32_t length, targetCapacity, sourceIndex;
    UChar c;
    uint8_t b;

    /* UTF-7 state */
    uint8_t bits;
    int8_t base64Counter;
    UBool inDirectMode;

    /* get the converter object */
    cnv=pArgs->converter;

    /* set up the local pointers */
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target=(uint8_t *)pArgs->target;
    targetLimit=(uint8_t *)pArgs->targetLimit;
    offsets=pArgs->offsets;

    /* get the state machine state */
    {
        uint32_t status=cnv->fromUnicodeStatus;
        inDirectMode=(UBool)((status>>24)&1);
        base64Counter=(int8_t)(status>>16);
        bits=(uint8_t)status;
    }

    /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
    sourceIndex=0;

    if(inDirectMode) {
directMode:
        /*
         * Every code unit handled in this loop writes at least one byte,
         * so the loop is bounded by the smaller of input and output space.
         */
        length=(int32_t)(sourceLimit-source);
        targetCapacity=(int32_t)(targetLimit-target);
        if(length>targetCapacity) {
            length=targetCapacity;
        }
        while(length>0) {
            c=*source++;
            /* encode 0x20..0x7e except '&' directly */
            if(inSetDIMAP(c)) {
                /* encode directly */
                *target++=(uint8_t)c;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex++;
                }
            } else if(c==AMPERSAND) {
                /* output &- for & */
                *target++=AMPERSAND;
                if(target<targetLimit) {
                    *target++=MINUS;
                    if(offsets!=NULL) {
                        *offsets++=sourceIndex;
                        *offsets++=sourceIndex++;
                    }
                    /* realign length and targetCapacity */
                    goto directMode;
                } else {
                    if(offsets!=NULL) {
                        *offsets++=sourceIndex++;
                    }
                    /* no room for the minus: hand it to the overflow buffer */
                    cnv->charErrorBuffer[0]=MINUS;
                    cnv->charErrorBufferLength=1;
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
            } else {
                /* un-read this character and switch to Unicode Mode */
                --source;
                *target++=AMPERSAND;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }
                inDirectMode=FALSE;
                base64Counter=0;
                goto unicodeMode;
            }
            --length;
        }
        if(source<sourceLimit && target>=targetLimit) {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        }
    } else {
unicodeMode:
        while(source<sourceLimit) {
            if(target<targetLimit) {
                c=*source++;
                if(isLegalIMAP(c)) {
                    /* encode directly */
                    inDirectMode=TRUE;

                    /* trick: back out this character to make this easier */
                    --source;

                    /* terminate the base64 sequence */
                    if(base64Counter!=0) {
                        /* write remaining bits for the previous character */
                        *target++=TO_BASE64_IMAP(bits);
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex-1;
                        }
                    }
                    /* need to terminate with a minus */
                    if(target<targetLimit) {
                        *target++=MINUS;
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex-1;
                        }
                    } else {
                        cnv->charErrorBuffer[0]=MINUS;
                        cnv->charErrorBufferLength=1;
                        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                        break;
                    }
                    goto directMode;
                } else {
                    /*
                     * base64 this character:
                     * Output 2 or 3 base64 bytes for the remaining bits of the previous character
                     * and the bits of this character, each implicitly in UTF-16BE.
                     *
                     * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
                     * character to the next. The actual 2 or 4 bits are shifted to the left edge
                     * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
                     */
                    switch(base64Counter) {
                    case 0:
                        /* no pending bits: write bits 15..10 and 9..4, keep bits 3..0 */
                        b=(uint8_t)(c>>10);
                        *target++=TO_BASE64_IMAP(b);
                        if(target<targetLimit) {
                            b=(uint8_t)((c>>4)&0x3f);
                            *target++=TO_BASE64_IMAP(b);
                            if(offsets!=NULL) {
                                *offsets++=sourceIndex;
                                *offsets++=sourceIndex++;
                            }
                        } else {
                            if(offsets!=NULL) {
                                *offsets++=sourceIndex++;
                            }
                            b=(uint8_t)((c>>4)&0x3f);
                            cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
                            cnv->charErrorBufferLength=1;
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                        }
                        bits=(uint8_t)((c&15)<<2);
                        base64Counter=1;
                        break;
                    case 1:
                        /* 4 pending bits: write them with bits 15..14, then bits 13..8 and 7..2, keep bits 1..0 */
                        b=(uint8_t)(bits|(c>>14));
                        *target++=TO_BASE64_IMAP(b);
                        if(target<targetLimit) {
                            b=(uint8_t)((c>>8)&0x3f);
                            *target++=TO_BASE64_IMAP(b);
                            if(target<targetLimit) {
                                b=(uint8_t)((c>>2)&0x3f);
                                *target++=TO_BASE64_IMAP(b);
                                if(offsets!=NULL) {
                                    *offsets++=sourceIndex;
                                    *offsets++=sourceIndex;
                                    *offsets++=sourceIndex++;
                                }
                            } else {
                                if(offsets!=NULL) {
                                    *offsets++=sourceIndex;
                                    *offsets++=sourceIndex++;
                                }
                                b=(uint8_t)((c>>2)&0x3f);
                                cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
                                cnv->charErrorBufferLength=1;
                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                            }
                        } else {
                            if(offsets!=NULL) {
                                *offsets++=sourceIndex++;
                            }
                            b=(uint8_t)((c>>8)&0x3f);
                            cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
                            b=(uint8_t)((c>>2)&0x3f);
                            cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
                            cnv->charErrorBufferLength=2;
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                        }
                        bits=(uint8_t)((c&3)<<4);
                        base64Counter=2;
                        break;
                    case 2:
                        /* 2 pending bits: write them with bits 15..12, then bits 11..6 and 5..0, nothing is kept */
                        b=(uint8_t)(bits|(c>>12));
                        *target++=TO_BASE64_IMAP(b);
                        if(target<targetLimit) {
                            b=(uint8_t)((c>>6)&0x3f);
                            *target++=TO_BASE64_IMAP(b);
                            if(target<targetLimit) {
                                b=(uint8_t)(c&0x3f);
                                *target++=TO_BASE64_IMAP(b);
                                if(offsets!=NULL) {
                                    *offsets++=sourceIndex;
                                    *offsets++=sourceIndex;
                                    *offsets++=sourceIndex++;
                                }
                            } else {
                                if(offsets!=NULL) {
                                    *offsets++=sourceIndex;
                                    *offsets++=sourceIndex++;
                                }
                                b=(uint8_t)(c&0x3f);
                                cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
                                cnv->charErrorBufferLength=1;
                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                            }
                        } else {
                            if(offsets!=NULL) {
                                *offsets++=sourceIndex++;
                            }
                            b=(uint8_t)((c>>6)&0x3f);
                            cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
                            b=(uint8_t)(c&0x3f);
                            cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
                            cnv->charErrorBufferLength=2;
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                        }
                        bits=0;
                        base64Counter=0;
                        break;
                    default:
                        /* will never occur */
                        break;
                    }
                }
            } else {
                /* target is full */
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }
        }
    }

    if(pArgs->flush && source>=sourceLimit) {
        /* flush remaining bits to the target */
        if(!inDirectMode) {
            if(base64Counter!=0) {
                if(target<targetLimit) {
                    *target++=TO_BASE64_IMAP(bits);
                    if(offsets!=NULL) {
                        *offsets++=sourceIndex-1;
                    }
                } else {
                    /* append: the overflow buffer may already hold bytes from the loop above */
                    cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                }
            }
            /* need to terminate with a minus */
            if(target<targetLimit) {
                *target++=MINUS;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex-1;
                }
            } else {
                cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            }
        }
        /* reset the state for the next conversion */
        cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
    } else {
        /* set the converter state back into UConverter */
        cnv->fromUnicodeStatus=
            (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
            ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
    }

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target=(char *)target;
    pArgs->offsets=offsets;
    return;
}
1427
/*
 * Implementation table for the IMAP mailbox name converter.
 * It registers the converter type UCNV_IMAP_MAILBOX, reuses the UTF-7
 * open and reset functions (_UTF7Open, _UTF7Reset), and installs the IMAP
 * to-Unicode and from-Unicode conversion functions; the same WithOffsets
 * function fills both consecutive slots of each conversion direction.
 * Unlike _UTF7Impl, the slot that holds _UTF7GetName there is NULL here.
 * NOTE(review): the initializer is positional; the meaning of each slot
 * (including the NULL ones) is defined by UConverterImpl, which is declared
 * outside this file - confirm against that declaration before reordering.
 */
static const UConverterImpl _IMAPImpl={
    UCNV_IMAP_MAILBOX,

    NULL,
    NULL,

    _UTF7Open,
    NULL,
    _UTF7Reset,

    _IMAPToUnicodeWithOffsets,
    _IMAPToUnicodeWithOffsets,
    _IMAPFromUnicodeWithOffsets,
    _IMAPFromUnicodeWithOffsets,
    NULL,

    NULL,
    NULL,
    NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
    NULL,
    ucnv_getCompleteUnicodeSet
};
1450
/*
 * Static description of the IMAP mailbox name converter: structure size,
 * the name "IMAP-mailbox-name", and the converter type UCNV_IMAP_MAILBOX.
 * NOTE(review): the initializer is positional; the "1, 4" pair is presumably
 * the minimum/maximum number of bytes per character and the two FALSE values
 * are flags - confirm against the UConverterStaticData declaration.
 */
static const UConverterStaticData _IMAPStaticData={
    sizeof(UConverterStaticData),
    "IMAP-mailbox-name",
    0, /* TODO CCSID for IMAP-mailbox-name */
    UCNV_IBM, UCNV_IMAP_MAILBOX,
    1, 4,
    { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
    FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1463
/*
 * Shared data object for the IMAP mailbox name converter; it ties the static
 * description (_IMAPStaticData) to the implementation table (_IMAPImpl).
 * It has external linkage, so it is presumably referenced from the converter
 * registry elsewhere in the library - confirm against the callers.
 * NOTE(review): the remaining positional fields are defined by
 * UConverterSharedData, which is declared outside this file.
 */
const UConverterSharedData _IMAPData={
    sizeof(UConverterSharedData), ~((uint32_t)0),
    NULL, NULL, &_IMAPStaticData, FALSE, &_IMAPImpl,
    0
};
1469
1470 #endif