icuSources/common/ucnv_u7.c

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2002-2015, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnv_u7.c
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2002jul01
  12 *   created by: Markus W. Scherer
  13 *
  14 *   UTF-7 converter implementation. Used to be in ucnv_utf.c.
  15 */
  16
  17 #include "unicode/utypes.h"
  18
  19 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
  20
  21 #include "unicode/ucnv.h"
  22 #include "ucnv_bld.h"
  23 #include "ucnv_cnv.h"
  24 #include "uassert.h"
  25
  26 /* UTF-7 -------------------------------------------------------------------- */
  27
  28 /*
  29  * UTF-7 is a stateful encoding of Unicode.
  30  * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
  31  * It was intended for use in Internet email systems, using in its bytewise
  32  * encoding only a subset of 7-bit US-ASCII.
  33  * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
  34  * occasionally used.
  35  *
  36  * When converting Unicode to UTF-7, the RFC allows some US-ASCII
  37  * characters to be encoded either directly or in base64. In particular, the
  38  * characters in set O as defined in the RFC (see below) may be encoded
  39  * directly but are not allowed in, e.g., email headers.
  40  * By default, the ICU UTF-7 converter encodes set O directly.
  41  * By choosing the option "version=1", set O will be escaped instead.
  42  * For example:
  43  *     utf7Converter=ucnv_open("UTF-7,version=1");
  44  *
  45  * For details about email headers see RFC 2047.
  46  */
  47
  48 /*
  49  * Tests for US-ASCII characters belonging to character classes
  50  * defined in UTF-7.
  51  *
  52  * Set D (directly encoded characters) consists of the following
  53  * characters: the upper and lower case letters A through Z
  54  * and a through z, the 10 digits 0-9, and the following nine special
  55  * characters (note that "+" and "=" are omitted):
  56  *     '(),-./:?
  57  *
  58  * Set O (optional direct characters) consists of the following
  59  * characters (note that "\" and "~" are omitted):
  60  *     !"#$%&*;<=>@[]^_`{|}
  61  *
  62  * According to the rules in RFC 2152, the byte values for the following
  63  * US-ASCII characters are not used in UTF-7 and are therefore illegal:
  64  * - all C0 control codes except for CR LF TAB
  65  * - BACKSLASH
  66  * - TILDE
  67  * - DEL
  68  * - all codes beyond US-ASCII, i.e. all >127
  69  */
  70 #define inSetD(c) \
  71     ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \
  72      (uint8_t)((c)-48)<10 ||    /* digits */ \
  73      (uint8_t)((c)-39)<3 ||     /* '() */ \
  74      (uint8_t)((c)-44)<4 ||     /* ,-./ */ \
  75      (c)==58 || (c)==63         /* :? */ \
  76     )
  77
  78 #define inSetO(c) \
  79     ((uint8_t)((c)-33)<6 ||         /* !"#$%& */ \
  80      (uint8_t)((c)-59)<4 ||         /* ;<=> */ \
  81      (uint8_t)((c)-93)<4 ||         /* ]^_` */ \
  82      (uint8_t)((c)-123)<3 ||        /* {|} */ \
  83      (c)==42 || (c)==64 || (c)==91  /* *@[ */ \
  84     )
  85
  86 #define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9)
  87 #define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9)
  88
  89 #define PLUS  43
  90 #define MINUS 45
  91 #define BACKSLASH 92
  92 #define TILDE 126
  93
  94 /* legal byte values: all US-ASCII graphic characters from space to before tilde, and CR LF TAB */
  95 #define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
  96
  97 /* encode directly sets D and O and CR LF SP TAB */
  98 static const UBool encodeDirectlyMaximum[128]={
  99  /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
 100     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
 101     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 102
 103     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
 104     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 105
 106     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 107     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
 108
 109     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 110     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
 111 };
 112
 113 /* encode directly set D and CR LF SP TAB but not set O */
 114 static const UBool encodeDirectlyRestricted[128]={
 115  /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
 116     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
 117     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 118
 119     1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
 120     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
 121
 122     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 123     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 124
 125     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 126     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
 127 };
 128
 129 static const uint8_t
 130 toBase64[64]={
 131     /* A-Z */
 132     65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
 133     78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
 134     /* a-z */
 135     97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
 136     110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
 137     /* 0-9 */
 138     48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
 139     /* +/ */
 140     43, 47
 141 };
 142
 143 static const int8_t
 144 fromBase64[128]={
 145     /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
 146     -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
 147     -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
 148
 149     /* general punctuation with + and / and a special value (-2) for - */
 150     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
 151     /* digits */
 152     52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
 153
 154     /* A-Z */
 155     -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
 156     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,
 157
 158     /* a-z */
 159     -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
 160     41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
 161 };
 162
 163 /*
 164  * converter status values:
 165  *
 166  * toUnicodeStatus:
 167  *     24 inDirectMode (boolean)
 168  * 23..16 base64Counter (-1..7)
 169  * 15..0  bits (up to 14 bits incoming base64)
 170  *
 171  * fromUnicodeStatus:
 172  * 31..28 version (0: set O direct  1: set O escaped)
 173  *     24 inDirectMode (boolean)
 174  * 23..16 base64Counter (0..2)
 175  *  7..0  bits (6 bits outgoing base64)
 176  *
 177  */
 178
 179 static void
 180 _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
 181     if(choice<=UCNV_RESET_TO_UNICODE) {
 182         /* reset toUnicode */
 183         cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
 184         cnv->toULength=0;
 185     }
 186     if(choice!=UCNV_RESET_TO_UNICODE) {
 187         /* reset fromUnicode */
 188         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
 189     }
 190 }
 191
 192 static void
 193 _UTF7Open(UConverter *cnv,
 194           UConverterLoadArgs *pArgs,
 195           UErrorCode *pErrorCode) {
 196     if(UCNV_GET_VERSION(cnv)<=1) {
 197         /* TODO(markus): Should just use cnv->options rather than copying the version number. */
 198         cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
 199         _UTF7Reset(cnv, UCNV_RESET_BOTH);
 200     } else {
 201         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 202     }
 203 }
 204
     /*
      * Converts UTF-7 bytes to UTF-16 code units (the toUnicode direction).
      *
      * Reads bytes from pArgs->source up to pArgs->sourceLimit and writes UChars to
      * pArgs->target up to pArgs->targetLimit. If pArgs->offsets is not NULL, one
      * source index is written there for each UChar that is output.
      *
      * The function is a two-state machine (direct mode and Unicode/base64 mode)
      * whose state is loaded from cnv->toUnicodeStatus on entry and stored back on
      * exit, together with the pending input bytes in cnv->toUBytes/toULength,
      * so that a conversion can continue in a later call.
      *
      * Errors reported through *pErrorCode:
      * - U_ILLEGAL_CHAR_FOUND for an illegal byte, for a plus sign followed by
      *   something other than a base64 character or a minus sign, and for a
      *   base64 sequence that ends with an incomplete UChar; the offending bytes
      *   are left in cnv->toUBytes with their count in cnv->toULength
      * - U_BUFFER_OVERFLOW_ERROR when the target is full while input remains
      *
      * On return, pArgs->source, pArgs->target and pArgs->offsets are updated.
      */
 205 static void
 206 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 207                           UErrorCode *pErrorCode) {
 208     UConverter *cnv;
 209     const uint8_t *source, *sourceLimit;
 210     UChar *target;
 211     const UChar *targetLimit;
 212     int32_t *offsets;
 213
          /* bytes of the current, not yet completed input sequence (cnv->toUBytes) and their number */
 214     uint8_t *bytes;
 215     uint8_t byteIndex;
 216
 217     int32_t length, targetCapacity;
 218
 219     /* UTF-7 state */
 220     uint16_t bits;
 221     int8_t base64Counter;
 222     UBool inDirectMode;
 223
          /* fromBase64[] classification of the current byte */
 224     int8_t base64Value;
 225
 226     int32_t sourceIndex, nextSourceIndex;
 227
 228     uint8_t b;
 229     /* set up the local pointers */
 230     cnv=pArgs->converter;
 231
 232     source=(const uint8_t *)pArgs->source;
 233     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
 234     target=pArgs->target;
 235     targetLimit=pArgs->targetLimit;
 236     offsets=pArgs->offsets;
 237     /* get the state machine state */
 238     {
              /* layout: bit 24 inDirectMode, bits 23..16 base64Counter, bits 15..0 bits */
 239         uint32_t status=cnv->toUnicodeStatus;
 240         inDirectMode=(UBool)((status>>24)&1);
 241         base64Counter=(int8_t)(status>>16);
 242         bits=(uint16_t)status;
 243     }
 244     bytes=cnv->toUBytes;
 245     byteIndex=cnv->toULength;
 246
 247     /* sourceIndex=-1 if the current character began in the previous buffer */
 248     sourceIndex=byteIndex==0 ? 0 : -1;
 249     nextSourceIndex=0;
 250
 251     if(inDirectMode) {
 252 directMode:
 253         /*
 254          * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
 255          * with their US-ASCII byte values.
 256          * Backslash and Tilde and most control characters are not allowed in UTF-7.
 257          * A plus sign starts Unicode (or "escape") Mode.
 258          *
 259          * In Direct Mode, only the sourceIndex is used.
 260          */
 261         byteIndex=0;
              /*
               * Each directly encoded byte yields exactly one UChar, so the loop
               * count is the smaller of the remaining input and the remaining
               * target capacity; no per-byte target check is needed.
               */
 262         length=(int32_t)(sourceLimit-source);
 263         targetCapacity=(int32_t)(targetLimit-target);
 264         if(length>targetCapacity) {
 265             length=targetCapacity;
 266         }
 267         while(length>0) {
 268             b=*source++;
 269             if(!isLegalUTF7(b)) {
 270                 /* illegal */
 271                 bytes[0]=b;
 272                 byteIndex=1;
 273                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 274                 break;
 275             } else if(b!=PLUS) {
 276                 /* write directly encoded character */
 277                 *target++=b;
 278                 if(offsets!=NULL) {
 279                     *offsets++=sourceIndex++;
 280                 }
 281             } else /* PLUS */ {
 282                 /* switch to Unicode mode */
 283                 nextSourceIndex=++sourceIndex;
 284                 inDirectMode=FALSE;
 285                 byteIndex=0;
 286                 bits=0;
 287                 base64Counter=-1;
 288                 goto unicodeMode;
 289             }
 290             --length;
 291         }
 292         if(source<sourceLimit && target>=targetLimit) {
 293             /* target is full */
 294             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 295         }
 296     } else {
 297 unicodeMode:
 298         /*
 299          * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
 300          * The base64 sequence ends with any character that is not in the base64 alphabet.
 301          * A terminating minus sign is consumed.
 302          *
 303          * In Unicode Mode, the sourceIndex has the index to the start of the current
 304          * base64 bytes, while nextSourceIndex is precisely parallel to source,
 305          * keeping the index to the following byte.
 306          * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
 307          */
 308         while(source<sourceLimit) {
 309             if(target<targetLimit) {
 310                 bytes[byteIndex++]=b=*source++;
 311                 ++nextSourceIndex;
 312                 base64Value = -3; /* initialize as illegal */
                      /*
                       * The b>=126 test comes first so that fromBase64[], which has
                       * 128 entries, is never indexed with a byte beyond US-ASCII;
                       * for such bytes base64Value keeps its initial -3.
                       */
 313                 if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
 314                     /* either
 315                      * base64Value==-1 for any legal character except base64 and minus sign, or
 316                      * base64Value==-3 for illegal characters:
 317                      * 1. In either case, leave Unicode mode.
 318                      * 2.1. If we ended with an incomplete UChar or none after the +, then
 319                      *      generate an error for the preceding erroneous sequence and deal with
 320                      *      the current (possibly illegal) character next time through.
 321                      * 2.2. Else the current char comes after a complete UChar, which was already
 322                      *      pushed to the output buf, so:
 323                      * 2.2.1. If the current char is legal, just save it for processing next time.
 324                      *        It may be for example, a plus which we need to deal with in direct mode.
 325                      * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
 326                      */
 327                     inDirectMode=TRUE;
 328                     if(base64Counter==-1) {
 329                         /* illegal: + immediately followed by something other than base64 or minus sign */
 330                         /* include the plus sign in the reported sequence, but not the subsequent char */
 331                         --source;
 332                         bytes[0]=PLUS;
 333                         byteIndex=1;
 334                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 335                         break;
 336                     } else if(bits!=0) {
 337                         /* bits are illegally left over, a UChar is incomplete */
 338                         /* don't include current char (legal or illegal) in error seq */
 339                         --source;
 340                         --byteIndex;
 341                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 342                         break;
 343                     } else {
 344                         /* previous UChar was complete */
 345                         if(base64Value==-3) {
 346                             /* current character is illegal, deal with it here */
 347                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 348                             break;
 349                         } else {
 350                             /* un-read the current character in case it is a plus sign */
 351                             --source;
 352                             sourceIndex=nextSourceIndex-1;
 353                             goto directMode;
 354                         }
 355                     }
 356                 } else if(base64Value>=0) {
 357                     /* collect base64 bytes into UChars */
                          /*
                           * base64Counter is the position within a group of 8 base64
                           * characters (48 bits), which holds exactly 3 UChars.
                           * The 3rd, 6th and 8th character of a group (cases 2, 5, 7)
                           * each complete one UChar; in cases 2 and 5 the low bits of
                           * the current character already belong to the next UChar.
                           */
 358                     switch(base64Counter) {
 359                     case -1: /* -1 is immediately after the + */
 360                     case 0:
 361                         bits=base64Value;
 362                         base64Counter=1;
 363                         break;
 364                     case 1:
 365                     case 3:
 366                     case 4:
 367                     case 6:
 368                         bits=(uint16_t)((bits<<6)|base64Value);
 369                         ++base64Counter;
 370                         break;
 371                     case 2:
                              /* 12 collected bits + top 4 of this character; keep its low 2 bits */
 372                         *target++=(UChar)((bits<<4)|(base64Value>>2));
 373                         if(offsets!=NULL) {
 374                             *offsets++=sourceIndex;
 375                             sourceIndex=nextSourceIndex-1;
 376                         }
 377                         bytes[0]=b; /* keep this byte in case an error occurs */
 378                         byteIndex=1;
 379                         bits=(uint16_t)(base64Value&3);
 380                         base64Counter=3;
 381                         break;
 382                     case 5:
                              /* 14 collected bits + top 2 of this character; keep its low 4 bits */
 383                         *target++=(UChar)((bits<<2)|(base64Value>>4));
 384                         if(offsets!=NULL) {
 385                             *offsets++=sourceIndex;
 386                             sourceIndex=nextSourceIndex-1;
 387                         }
 388                         bytes[0]=b; /* keep this byte in case an error occurs */
 389                         byteIndex=1;
 390                         bits=(uint16_t)(base64Value&15);
 391                         base64Counter=6;
 392                         break;
 393                     case 7:
                              /* 10 collected bits + all 6 of this character; the group is complete */
 394                         *target++=(UChar)((bits<<6)|base64Value);
 395                         if(offsets!=NULL) {
 396                             *offsets++=sourceIndex;
 397                             sourceIndex=nextSourceIndex;
 398                         }
 399                         byteIndex=0;
 400                         bits=0;
 401                         base64Counter=0;
 402                         break;
 403                     default:
 404                         /* will never occur */
 405                         break;
 406                     }
 407                 } else /*base64Value==-2*/ {
 408                     /* minus sign terminates the base64 sequence */
 409                     inDirectMode=TRUE;
 410                     if(base64Counter==-1) {
 411                         /* +- i.e. a minus immediately following a plus */
 412                         *target++=PLUS;
 413                         if(offsets!=NULL) {
 414                             *offsets++=sourceIndex-1;
 415                         }
 416                     } else {
 417                         /* absorb the minus and leave the Unicode Mode */
 418                         if(bits!=0) {
 419                             /* bits are illegally left over, a UChar is incomplete */
 420                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 421                             break;
 422                         }
 423                     }
 424                     sourceIndex=nextSourceIndex;
 425                     goto directMode;
 426                 }
 427             } else {
 428                 /* target is full */
 429                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 430                 break;
 431             }
 432         }
 433     }
 434
 435     if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
 436         /*
 437          * if we are in Unicode mode, then the byteIndex might not be 0,
 438          * but that is ok if bits==0
 439          * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
 440          * (not true for IMAP-mailbox-name where we must end in direct mode)
 441          */
 442         byteIndex=0;
 443     }
 444
 445     /* set the converter state back into UConverter */
          /* base64Counter goes through uint8_t so that -1 becomes 0xff and stays within bits 23..16 */
 446     cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
 447     cnv->toULength=byteIndex;
 448
 449     /* write back the updated pointers */
 450     pArgs->source=(const char *)source;
 451     pArgs->target=target;
 452     pArgs->offsets=offsets;
 453     return;
 454 }
 455
     /*
      * Converts UTF-16 code units to UTF-7 bytes (the fromUnicode direction).
      *
      * Reads UChars from pArgs->source up to pArgs->sourceLimit and writes bytes to
      * pArgs->target up to pArgs->targetLimit. If pArgs->offsets is not NULL, one
      * source index is written there for each byte written to the target.
      *
      * In direct mode, characters marked in the encodeDirectly table are copied
      * as single bytes, a plus sign is written as "+-", and any other code unit
      * writes a plus sign and switches to Unicode mode. In Unicode mode, code
      * units are base64-encoded until a directly encodable character is seen.
      * The table is encodeDirectlyMaximum or encodeDirectlyRestricted, depending
      * on the version bits of cnv->fromUnicodeStatus.
      *
      * The only error code set here is U_BUFFER_OVERFLOW_ERROR. When a multi-byte
      * output does not fit completely, the bytes that did not fit are stored in
      * cnv->charErrorBuffer and counted in cnv->charErrorBufferLength.
      *
      * If pArgs->flush is set and all input was consumed, a pending base64
      * sequence is terminated and the state is reset to direct mode; otherwise
      * the state is stored in cnv->fromUnicodeStatus for the next call.
      *
      * On return, pArgs->source, pArgs->target and pArgs->offsets are updated.
      */
 456 static void
 457 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
 458                             UErrorCode *pErrorCode) {
 459     UConverter *cnv;
 460     const UChar *source, *sourceLimit;
 461     uint8_t *target, *targetLimit;
 462     int32_t *offsets;
 463
 464     int32_t length, targetCapacity, sourceIndex;
 465     UChar c;
 466
 467     /* UTF-7 state */
 468     const UBool *encodeDirectly;
 469     uint8_t bits;
 470     int8_t base64Counter;
 471     UBool inDirectMode;
 472
 473     /* set up the local pointers */
 474     cnv=pArgs->converter;
 475
 476     /* set up the local pointers */
 477     source=pArgs->source;
 478     sourceLimit=pArgs->sourceLimit;
 479     target=(uint8_t *)pArgs->target;
 480     targetLimit=(uint8_t *)pArgs->targetLimit;
 481     offsets=pArgs->offsets;
 482
 483     /* get the state machine state */
 484     {
              /* layout: bits 31..28 version, bit 24 inDirectMode, bits 23..16 base64Counter, bits 7..0 bits */
 485         uint32_t status=cnv->fromUnicodeStatus;
              /* status<0x10000000 means that the version bits are 0 */
 486         encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
 487         inDirectMode=(UBool)((status>>24)&1);
 488         base64Counter=(int8_t)(status>>16);
 489         bits=(uint8_t)status;
              /*
               * NOTE(review): bits is used below as an index into toBase64[], for
               * which the exact bound would be a strict "less than"; this
               * assertion also admits the value 64 - confirm whether intended.
               */
 490         U_ASSERT(bits<=sizeof(toBase64)/sizeof(toBase64[0]));
 491     }
 492
 493     /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
 494     sourceIndex=0;
 495
 496     if(inDirectMode) {
 497 directMode:
              /*
               * Every iteration below writes at least one byte, so the loop count
               * is the smaller of the remaining input and the remaining target
               * capacity. Branches that write more than one byte jump back to
               * directMode to recompute both values.
               */
 498         length=(int32_t)(sourceLimit-source);
 499         targetCapacity=(int32_t)(targetLimit-target);
 500         if(length>targetCapacity) {
 501             length=targetCapacity;
 502         }
 503         while(length>0) {
 504             c=*source++;
 505             /* currently always encode CR LF SP TAB directly */
 506             if(c<=127 && encodeDirectly[c]) {
 507                 /* encode directly */
 508                 *target++=(uint8_t)c;
 509                 if(offsets!=NULL) {
 510                     *offsets++=sourceIndex++;
 511                 }
 512             } else if(c==PLUS) {
 513                 /* output +- for + */
 514                 *target++=PLUS;
 515                 if(target<targetLimit) {
 516                     *target++=MINUS;
 517                     if(offsets!=NULL) {
 518                         *offsets++=sourceIndex;
 519                         *offsets++=sourceIndex++;
 520                     }
 521                     /* realign length and targetCapacity */
 522                     goto directMode;
 523                 } else {
 524                     if(offsets!=NULL) {
 525                         *offsets++=sourceIndex++;
 526                     }
                          /* no room for the minus sign: hand it over via the error buffer */
 527                     cnv->charErrorBuffer[0]=MINUS;
 528                     cnv->charErrorBufferLength=1;
 529                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 530                     break;
 531                 }
 532             } else {
 533                 /* un-read this character and switch to Unicode Mode */
 534                 --source;
 535                 *target++=PLUS;
 536                 if(offsets!=NULL) {
 537                     *offsets++=sourceIndex;
 538                 }
 539                 inDirectMode=FALSE;
 540                 base64Counter=0;
 541                 goto unicodeMode;
 542             }
 543             --length;
 544         }
 545         if(source<sourceLimit && target>=targetLimit) {
 546             /* target is full */
 547             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 548         }
 549     } else {
 550 unicodeMode:
 551         while(source<sourceLimit) {
 552             if(target<targetLimit) {
 553                 c=*source++;
 554                 if(c<=127 && encodeDirectly[c]) {
 555                     /* encode directly */
 556                     inDirectMode=TRUE;
 557
 558                     /* trick: back out this character to make this easier */
 559                     --source;
 560
 561                     /* terminate the base64 sequence */
 562                     if(base64Counter!=0) {
 563                         /* write remaining bits for the previous character */
 564                         *target++=toBase64[bits];
 565                         if(offsets!=NULL) {
 566                             *offsets++=sourceIndex-1;
 567                         }
 568                     }
                          /*
                           * fromBase64[c] is -1 for legal characters that are neither a
                           * base64 digit nor the minus sign; only for those can the
                           * explicit terminator be omitted.
                           */
 569                     if(fromBase64[c]!=-1) {
 570                         /* need to terminate with a minus */
 571                         if(target<targetLimit) {
 572                             *target++=MINUS;
 573                             if(offsets!=NULL) {
 574                                 *offsets++=sourceIndex-1;
 575                             }
 576                         } else {
 577                             cnv->charErrorBuffer[0]=MINUS;
 578                             cnv->charErrorBufferLength=1;
 579                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 580                             break;
 581                         }
 582                     }
 583                     goto directMode;
 584                 } else {
 585                     /*
 586                      * base64 this character:
 587                      * Output 2 or 3 base64 bytes for the remaining bits of the previous character
 588                      * and the bits of this character, each implicitly in UTF-16BE.
 589                      *
 590                      * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
 591                      * character to the next. The actual 2 or 4 bits are shifted to the left edge
 592                      * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
 593                      */
                          /*
                           * In each case the first output byte always fits because the
                           * target was checked at the top of the loop; the bytes after
                           * it are checked one by one and go to charErrorBuffer if the
                           * target is full. The state (bits, base64Counter) is updated
                           * in either case.
                           */
 594                     switch(base64Counter) {
 595                     case 0:
                              /* no bits pending: output the top 12 bits, keep the low 4 */
 596                         *target++=toBase64[c>>10];
 597                         if(target<targetLimit) {
 598                             *target++=toBase64[(c>>4)&0x3f];
 599                             if(offsets!=NULL) {
 600                                 *offsets++=sourceIndex;
 601                                 *offsets++=sourceIndex++;
 602                             }
 603                         } else {
 604                             if(offsets!=NULL) {
 605                                 *offsets++=sourceIndex++;
 606                             }
 607                             cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
 608                             cnv->charErrorBufferLength=1;
 609                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 610                         }
 611                         bits=(uint8_t)((c&15)<<2);
 612                         base64Counter=1;
 613                         break;
 614                     case 1:
                              /* 4 bits pending: combine with the top 2, output 12 more, keep the low 2 */
 615                         *target++=toBase64[bits|(c>>14)];
 616                         if(target<targetLimit) {
 617                             *target++=toBase64[(c>>8)&0x3f];
 618                             if(target<targetLimit) {
 619                                 *target++=toBase64[(c>>2)&0x3f];
 620                                 if(offsets!=NULL) {
 621                                     *offsets++=sourceIndex;
 622                                     *offsets++=sourceIndex;
 623                                     *offsets++=sourceIndex++;
 624                                 }
 625                             } else {
 626                                 if(offsets!=NULL) {
 627                                     *offsets++=sourceIndex;
 628                                     *offsets++=sourceIndex++;
 629                                 }
 630                                 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
 631                                 cnv->charErrorBufferLength=1;
 632                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 633                             }
 634                         } else {
 635                             if(offsets!=NULL) {
 636                                 *offsets++=sourceIndex++;
 637                             }
 638                             cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
 639                             cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
 640                             cnv->charErrorBufferLength=2;
 641                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 642                         }
 643                         bits=(uint8_t)((c&3)<<4);
 644                         base64Counter=2;
 645                         break;
 646                     case 2:
                              /* 2 bits pending: combine with the top 4, output the remaining 12, nothing left */
 647                         *target++=toBase64[bits|(c>>12)];
 648                         if(target<targetLimit) {
 649                             *target++=toBase64[(c>>6)&0x3f];
 650                             if(target<targetLimit) {
 651                                 *target++=toBase64[c&0x3f];
 652                                 if(offsets!=NULL) {
 653                                     *offsets++=sourceIndex;
 654                                     *offsets++=sourceIndex;
 655                                     *offsets++=sourceIndex++;
 656                                 }
 657                             } else {
 658                                 if(offsets!=NULL) {
 659                                     *offsets++=sourceIndex;
 660                                     *offsets++=sourceIndex++;
 661                                 }
 662                                 cnv->charErrorBuffer[0]=toBase64[c&0x3f];
 663                                 cnv->charErrorBufferLength=1;
 664                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 665                             }
 666                         } else {
 667                             if(offsets!=NULL) {
 668                                 *offsets++=sourceIndex++;
 669                             }
 670                             cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
 671                             cnv->charErrorBuffer[1]=toBase64[c&0x3f];
 672                             cnv->charErrorBufferLength=2;
 673                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 674                         }
 675                         bits=0;
 676                         base64Counter=0;
 677                         break;
 678                     default:
 679                         /* will never occur */
 680                         break;
 681                     }
 682                 }
 683             } else {
 684                 /* target is full */
 685                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 686                 break;
 687             }
 688         }
 689     }
 690
 691     if(pArgs->flush && source>=sourceLimit) {
 692         /* flush remaining bits to the target */
 693         if(!inDirectMode) {
 694             if (base64Counter!=0) {
 695                 if(target<targetLimit) {
 696                     *target++=toBase64[bits];
 697                     if(offsets!=NULL) {
 698                         *offsets++=sourceIndex-1;
 699                     }
 700                 } else {
                          /* append after whatever the loop above may already have put into the error buffer */
 701                     cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
 702                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 703                 }
 704             }
 705             /* Add final MINUS to terminate unicodeMode */
 706             if(target<targetLimit) {
 707                 *target++=MINUS;
 708                 if(offsets!=NULL) {
 709                     *offsets++=sourceIndex-1;
 710                 }
 711             } else {
 712                 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
 713                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 714             }
 715         }
 716         /* reset the state for the next conversion */
 717         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
 718     } else {
 719         /* set the converter state back into UConverter */
 720         cnv->fromUnicodeStatus=
 721             (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
 722             ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
 723     }
 724
 725     /* write back the updated pointers */
 726     pArgs->source=source;
 727     pArgs->target=(char *)target;
 728     pArgs->offsets=offsets;
 729     return;
 730 }
 731
 732 static const char *
 733 _UTF7GetName(const UConverter *cnv) {
 734     switch(cnv->fromUnicodeStatus>>28) {
 735     case 1:
 736         return "UTF-7,version=1";
 737     default:
 738         return "UTF-7";
 739     }
 740 }
 741
 /*
  * Function table for the UTF-7 converter (UCNV_UTF7).
  * The initializer is positional. Each conversion function is installed in
  * two adjacent slots, presumably the without-offsets and with-offsets
  * entries; NULL slots are unused. NOTE(review): slot meanings are inferred
  * from the function names; confirm against the UConverterImpl declaration.
  */
 742 static const UConverterImpl _UTF7Impl={
 743     UCNV_UTF7,
 744
 745     NULL,
 746     NULL,
 747
 748     _UTF7Open,
 749     NULL,
 750     _UTF7Reset,
 751
 752     _UTF7ToUnicodeWithOffsets,
 753     _UTF7ToUnicodeWithOffsets,
 754     _UTF7FromUnicodeWithOffsets,
 755     _UTF7FromUnicodeWithOffsets,
 756     NULL,
 757
 758     NULL,
 759     _UTF7GetName,
 760     NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
 761     NULL,
 762     ucnv_getCompleteUnicodeSet
 763 };
 764
 /*
  * Static description of the UTF-7 converter: structure size, the name
  * "UTF-7", and the UCNV_IBM / UCNV_UTF7 identifiers.
  * NOTE(review): "1, 4" is presumably the minimum and maximum number of
  * bytes per character, and 0x3f ('?') with length 1 the substitution
  * character; confirm against the UConverterStaticData declaration.
  */
 765 static const UConverterStaticData _UTF7StaticData={
 766     sizeof(UConverterStaticData),
 767     "UTF-7",
 768     0, /* TODO CCSID for UTF-7 */
 769     UCNV_IBM, UCNV_UTF7,
 770     1, 4,
 771     { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
 772     FALSE, FALSE,
 773     0,
 774     0,
 775     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 776 };
 777
 /*
  * Shared data object for the UTF-7 converter; it is not static, so it is
  * visible to other translation units. It ties together the static
  * description (_UTF7StaticData) and the function table (_UTF7Impl).
  * NOTE(review): the meaning of the remaining positional fields (the
  * all-ones value, the NULLs, FALSE and 0) is not visible here; confirm
  * against the UConverterSharedData declaration.
  */
 778 const UConverterSharedData _UTF7Data={
 779     sizeof(UConverterSharedData), ~((uint32_t)0),
 780     NULL, NULL, &_UTF7StaticData, FALSE, &_UTF7Impl,
 781     0
 782 };
 783
 784 /* IMAP mailbox name encoding ----------------------------------------------- */
 785
 786 /*
 787  * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
 788  * http://www.ietf.org/rfc/rfc2060.txt
 789  *
 790  * 5.1.3.  Mailbox International Naming Convention
 791  *
 792  * By convention, international mailbox names are specified using a
 793  * modified version of the UTF-7 encoding described in [UTF-7].  The
 794  * purpose of these modifications is to correct the following problems
 795  * with UTF-7:
 796  *
 797  *    1) UTF-7 uses the "+" character for shifting; this conflicts with
 798  *       the common use of "+" in mailbox names, in particular USENET
 799  *       newsgroup names.
 800  *
 801  *    2) UTF-7's encoding is BASE64 which uses the "/" character; this
 802  *       conflicts with the use of "/" as a popular hierarchy delimiter.
 803  *
 804  *    3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
 805  *       the use of "\" as a popular hierarchy delimiter.
 806  *
 807  *    4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
 808  *       the use of "~" in some servers as a home directory indicator.
 809  *
 810  *    5) UTF-7 permits multiple alternate forms to represent the same
 811  *       string; in particular, printable US-ASCII characters can be
 812  *       represented in encoded form.
 813  *
 814  * In modified UTF-7, printable US-ASCII characters except for "&"
 815  * represent themselves; that is, characters with octet values 0x20-0x25
 816  * and 0x27-0x7e.  The character "&" (0x26) is represented by the two-
 817  * octet sequence "&-".
 818  *
 819  * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
 820  * Unicode 16-bit octets) are represented in modified BASE64, with a
 821  * further modification from [UTF-7] that "," is used instead of "/".
 822  * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
 823  * character which can represent itself.
 824  *
 825  * "&" is used to shift to modified BASE64 and "-" to shift back to US-
 826  * ASCII.  All names start in US-ASCII, and MUST end in US-ASCII (that
 827  * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
 828  * ").
 829  *
 830  * For example, here is a mailbox name which mixes English, Japanese,
 831  * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
 832  */
 833
 834 /*
 835  * For reference: the US-ASCII character classes defined in UTF-7 (RFC 2152).
 836  * The IMAP macros below do not use them: all of 0x20..0x7e is legal, '&' shifts.
 837  *
 838  * Set D (directly encoded characters) consists of the following
 839  * characters: the upper and lower case letters A through Z
 840  * and a through z, the 10 digits 0-9, and the following nine special
 841  * characters (note that "+" and "=" are omitted):
 842  *     '(),-./:?
 843  *
 844  * Set O (optional direct characters) consists of the following
 845  * characters (note that "\" and "~" are omitted):
 846  *     !"#$%&*;<=>@[]^_`{|}
 847  *
 848  * According to the rules in RFC 2152, the byte values for the following
 849  * US-ASCII characters are not used in UTF-7 and are therefore illegal:
 850  * - all C0 control codes except for CR LF TAB
 851  * - BACKSLASH
 852  * - TILDE
 853  * - DEL
 854  * - all codes beyond US-ASCII, i.e. all >127
 855  */
 856
 /*
  * IMAP mailbox names use '&' (not '+') to start a base64 sequence, and
  * ',' (not '/') as the last character of the base64 alphabet.
  */
 #define AMPERSAND 0x26
 #define COMMA 0x2c
 #define SLASH 0x2f

 /* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
 #define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

 /*
  * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
  * (the argument is parenthesized so that expression arguments bind correctly)
  */
 #define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

 /* 6-bit value to base64 character: 63 maps to ',' instead of the UTF-7 table entry */
 #define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
 /* byte to base64 value: ',' is 63, '/' is rejected with -1, all others use the UTF-7 table */
 #define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
 870
 871 /*
 872  * converter status values:
 873  *
 874  * toUnicodeStatus:
 875  *     24 inDirectMode (boolean)
 876  * 23..16 base64Counter (-1..7)
 877  * 15..0  bits (up to 14 bits incoming base64)
 878  *
 879  * fromUnicodeStatus:
 880  *     24 inDirectMode (boolean)
 881  * 23..16 base64Counter (0..2)
 882  *  7..0  bits (6 bits outgoing base64)
 883  *
 884  * ignore bits 31..25
 885  */
 886
 /*
  * Convert IMAP mailbox name bytes (modified UTF-7) to UTF-16.
  *
  * Reads bytes from pArgs->source up to pArgs->sourceLimit and writes UChars
  * to pArgs->target up to pArgs->targetLimit. If pArgs->offsets is not NULL,
  * one source index is written for each UChar written.
  * The mode flag, base64 counter and collected bits are loaded from and
  * stored back into cnv->toUnicodeStatus, and the bytes of the current
  * sequence are kept in cnv->toUBytes with their count in cnv->toULength.
  *
  * *pErrorCode is set to U_ILLEGAL_CHAR_FOUND for illegal input, to
  * U_BUFFER_OVERFLOW_ERROR when the target is full, and to
  * U_TRUNCATED_CHAR_FOUND when flushed input ends in Unicode mode with no
  * pending bytes.
  * On return, the source, target and offsets pointers in pArgs are updated.
  */
 887 static void
 888 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 889                           UErrorCode *pErrorCode) {
 890     UConverter *cnv;
 891     const uint8_t *source, *sourceLimit;
 892     UChar *target;
 893     const UChar *targetLimit;
 894     int32_t *offsets;
 895
 896     uint8_t *bytes;
 897     uint8_t byteIndex;
 898
 899     int32_t length, targetCapacity;
 900
 901     /* UTF-7 state */
 902     uint16_t bits;
 903     int8_t base64Counter;
 904     UBool inDirectMode;
 905
 906     int8_t base64Value;
 907
 908     int32_t sourceIndex, nextSourceIndex;
 909
 910     UChar c;
 911     uint8_t b;
 912
 913     /* set up the local pointers */
 914     cnv=pArgs->converter;
 915
 916     source=(const uint8_t *)pArgs->source;
 917     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
 918     target=pArgs->target;
 919     targetLimit=pArgs->targetLimit;
 920     offsets=pArgs->offsets;
 921     /* get the state machine state */
 922     {
 923         uint32_t status=cnv->toUnicodeStatus;
 924         inDirectMode=(UBool)((status>>24)&1);
 925         base64Counter=(int8_t)(status>>16);
 926         bits=(uint16_t)status;
 927     }
 928     bytes=cnv->toUBytes;
 929     byteIndex=cnv->toULength;
 930
 931     /* sourceIndex=-1 if the current character began in the previous buffer */
 932     sourceIndex=byteIndex==0 ? 0 : -1;
 933     nextSourceIndex=0;
 934
 935     if(inDirectMode) {
 936 directMode:
 937         /*
 938          * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
 939          * with their US-ASCII byte values.
 940          * An ampersand starts Unicode (or "escape") Mode.
 941          *
 942          * In Direct Mode, only the sourceIndex is used.
 943          */
 944         byteIndex=0;
             /*
              * Each input byte yields at most one UChar here, so the loop count
              * is the smaller of the remaining input and the remaining output.
              */
 945         length=(int32_t)(sourceLimit-source);
 946         targetCapacity=(int32_t)(targetLimit-target);
 947         if(length>targetCapacity) {
 948             length=targetCapacity;
 949         }
 950         while(length>0) {
 951             b=*source++;
 952             if(!isLegalIMAP(b)) {
 953                 /* illegal */
 954                 bytes[0]=b;
 955                 byteIndex=1;
 956                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 957                 break;
 958             } else if(b!=AMPERSAND) {
 959                 /* write directly encoded character */
 960                 *target++=b;
 961                 if(offsets!=NULL) {
 962                     *offsets++=sourceIndex++;
 963                 }
 964             } else /* AMPERSAND */ {
 965                 /* switch to Unicode mode */
 966                 nextSourceIndex=++sourceIndex;
 967                 inDirectMode=FALSE;
 968                 byteIndex=0;
 969                 bits=0;
 970                 base64Counter=-1;
 971                 goto unicodeMode;
 972             }
 973             --length;
 974         }
 975         if(source<sourceLimit && target>=targetLimit) {
 976             /* target is full */
 977             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 978         }
 979     } else {
 980 unicodeMode:
 981         /*
 982          * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
 983          * The base64 sequence ends with any character that is not in the base64 alphabet.
 984          * A terminating minus sign is consumed.
 985          * US-ASCII must not be base64-ed.
 986          *
 987          * In Unicode Mode, the sourceIndex has the index to the start of the current
 988          * base64 bytes, while nextSourceIndex is precisely parallel to source,
 989          * keeping the index to the following byte.
 990          * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
 991          */
 992         while(source<sourceLimit) {
 993             if(target<targetLimit) {
                     /* record every consumed byte so that an error can report the sequence */
 994                 bytes[byteIndex++]=b=*source++;
 995                 ++nextSourceIndex;
 996                 if(b>0x7e) {
 997                     /* illegal - test other illegal US-ASCII values by base64Value==-3 */
 998                     inDirectMode=TRUE;
 999                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1000                     break;
1001                 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
1002                     /* collect base64 bytes into UChars */
1003                     switch(base64Counter) {
1004                     case -1: /* -1 is immediately after the & */
1005                     case 0:
1006                         bits=base64Value;
1007                         base64Counter=1;
1008                         break;
                         /* counters 1, 3, 4, 6: not enough bits for a UChar yet, shift in 6 more */
1009                     case 1:
1010                     case 3:
1011                     case 4:
1012                     case 6:
1013                         bits=(uint16_t)((bits<<6)|base64Value);
1014                         ++base64Counter;
1015                         break;
                         /* 12 collected bits + the top 4 bits of this value; the low 2 bits carry over */
1016                     case 2:
1017                         c=(UChar)((bits<<4)|(base64Value>>2));
                             /* a character that can be written directly must not be base64-encoded */
1018                         if(isLegalIMAP(c)) {
1019                             /* illegal */
1020                             inDirectMode=TRUE;
1021                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1022                             goto endloop;
1023                         }
1024                         *target++=c;
1025                         if(offsets!=NULL) {
1026                             *offsets++=sourceIndex;
1027                             sourceIndex=nextSourceIndex-1;
1028                         }
1029                         bytes[0]=b; /* keep this byte in case an error occurs */
1030                         byteIndex=1;
1031                         bits=(uint16_t)(base64Value&3);
1032                         base64Counter=3;
1033                         break;
                         /* 14 collected bits + the top 2 bits of this value; the low 4 bits carry over */
1034                     case 5:
1035                         c=(UChar)((bits<<2)|(base64Value>>4));
1036                         if(isLegalIMAP(c)) {
1037                             /* illegal */
1038                             inDirectMode=TRUE;
1039                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1040                             goto endloop;
1041                         }
1042                         *target++=c;
1043                         if(offsets!=NULL) {
1044                             *offsets++=sourceIndex;
1045                             sourceIndex=nextSourceIndex-1;
1046                         }
1047                         bytes[0]=b; /* keep this byte in case an error occurs */
1048                         byteIndex=1;
1049                         bits=(uint16_t)(base64Value&15);
1050                         base64Counter=6;
1051                         break;
                         /* 10 collected bits + all 6 bits of this value; nothing carries over */
1052                     case 7:
1053                         c=(UChar)((bits<<6)|base64Value);
1054                         if(isLegalIMAP(c)) {
1055                             /* illegal */
1056                             inDirectMode=TRUE;
1057                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1058                             goto endloop;
1059                         }
1060                         *target++=c;
1061                         if(offsets!=NULL) {
1062                             *offsets++=sourceIndex;
1063                             sourceIndex=nextSourceIndex;
1064                         }
1065                         byteIndex=0;
1066                         bits=0;
1067                         base64Counter=0;
1068                         break;
1069                     default:
1070                         /* will never occur */
1071                         break;
1072                     }
1073                 } else if(base64Value==-2) {
1074                     /* minus sign terminates the base64 sequence */
1075                     inDirectMode=TRUE;
1076                     if(base64Counter==-1) {
1077                         /* &- i.e. a minus immediately following an ampersand */
1078                         *target++=AMPERSAND;
1079                         if(offsets!=NULL) {
1080                             *offsets++=sourceIndex-1;
1081                         }
1082                     } else {
1083                         /* absorb the minus and leave the Unicode Mode */
1084                         if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1085                             /* bits are illegally left over, a UChar is incomplete */
1086                             /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1087                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1088                             break;
1089                         }
1090                     }
1091                     sourceIndex=nextSourceIndex;
1092                     goto directMode;
1093                 } else {
1094                     if(base64Counter==-1) {
1095                         /* illegal: & immediately followed by something other than base64 or minus sign */
1096                         /* include the ampersand in the reported sequence */
1097                         --sourceIndex;
1098                         bytes[0]=AMPERSAND;
1099                         bytes[1]=b;
1100                         byteIndex=2;
1101                     }
1102                     /* base64Value==-1 for characters that are illegal only in Unicode mode */
1103                     /* base64Value==-3 for illegal characters */
1104                     /* illegal */
1105                     inDirectMode=TRUE;
1106                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1107                     break;
1108                 }
1109             } else {
1110                 /* target is full */
1111                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1112                 break;
1113             }
1114         }
1115     }
1116 endloop:
1117
1118     /*
1119      * the end of the input stream and detection of truncated input
1120      * are handled by the framework, but here we must check if we are in Unicode
1121      * mode and byteIndex==0 because we must end in direct mode
1122      *
1123      * conditions:
1124      *   successful
1125      *   in Unicode mode and byteIndex==0
1126      *   end of input and no truncated input
1127      */
1128     if( U_SUCCESS(*pErrorCode) &&
1129         !inDirectMode && byteIndex==0 &&
1130         pArgs->flush && source>=sourceLimit
1131     ) {
1132         if(base64Counter==-1) {
1133             /* & at the very end of the input */
1134             /* make the ampersand the reported sequence */
1135             bytes[0]=AMPERSAND;
1136             byteIndex=1;
1137         }
1138         /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1139
1140         inDirectMode=TRUE; /* avoid looping */
1141         *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1142     }
1143
1144     /* set the converter state back into UConverter */
         /* base64Counter goes through uint8_t so that -1 does not sign-extend into the mode bit */
1145     cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1146     cnv->toULength=byteIndex;
1147
1148     /* write back the updated pointers */
1149     pArgs->source=(const char *)source;
1150     pArgs->target=target;
1151     pArgs->offsets=offsets;
1152     return;
1153 }
1154
/*
 * Convert UTF-16 to IMAP mailbox name bytes (modified UTF-7).
 *
 * Code units 0x20..0x7e are written directly, except that '&' is written
 * as "&-"; every other code unit is written as modified base64 between
 * '&' and '-'.
 * The mode flag, base64 counter and leftover bits are loaded from
 * cnv->fromUnicodeStatus. They are stored back unless the call flushes and
 * all input was consumed, in which case the state is reset to direct mode.
 * Bytes that do not fit into the target go to cnv->charErrorBuffer and
 * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR.
 * If pArgs->offsets is not NULL, one source index is written for each byte
 * written to the target.
 * On return, the source, target and offsets pointers in pArgs are updated.
 */
1155 static void
1156 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1157                             UErrorCode *pErrorCode) {
1158     UConverter *cnv;
1159     const UChar *source, *sourceLimit;
1160     uint8_t *target, *targetLimit;
1161     int32_t *offsets;
1162
1163     int32_t length, targetCapacity, sourceIndex;
1164     UChar c;
1165     uint8_t b;
1166
1167     /* UTF-7 state */
1168     uint8_t bits;
1169     int8_t base64Counter;
1170     UBool inDirectMode;
1171
1172     /* set up the local pointers */
1173     cnv=pArgs->converter;
1174
1175     /* set up the local pointers */
1176     source=pArgs->source;
1177     sourceLimit=pArgs->sourceLimit;
1178     target=(uint8_t *)pArgs->target;
1179     targetLimit=(uint8_t *)pArgs->targetLimit;
1180     offsets=pArgs->offsets;
1181
1182     /* get the state machine state */
1183     {
1184         uint32_t status=cnv->fromUnicodeStatus;
1185         inDirectMode=(UBool)((status>>24)&1);
1186         base64Counter=(int8_t)(status>>16);
1187         bits=(uint8_t)status;
1188     }
1189
1190     /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1191     sourceIndex=0;
1192
1193     if(inDirectMode) {
1194 directMode:
             /*
              * The loop count is the smaller of the remaining input and the
              * remaining output, so at least one target byte is available
              * in every iteration.
              */
1195         length=(int32_t)(sourceLimit-source);
1196         targetCapacity=(int32_t)(targetLimit-target);
1197         if(length>targetCapacity) {
1198             length=targetCapacity;
1199         }
1200         while(length>0) {
1201             c=*source++;
1202             /* encode 0x20..0x7e except '&' directly */
1203             if(inSetDIMAP(c)) {
1204                 /* encode directly */
1205                 *target++=(uint8_t)c;
1206                 if(offsets!=NULL) {
1207                     *offsets++=sourceIndex++;
1208                 }
1209             } else if(c==AMPERSAND) {
1210                 /* output &- for & */
1211                 *target++=AMPERSAND;
1212                 if(target<targetLimit) {
1213                     *target++=MINUS;
1214                     if(offsets!=NULL) {
1215                         *offsets++=sourceIndex;
1216                         *offsets++=sourceIndex++;
1217                     }
1218                     /* realign length and targetCapacity */
1219                     goto directMode;
1220                 } else {
1221                     if(offsets!=NULL) {
1222                         *offsets++=sourceIndex++;
1223                     }
                         /* no room for the '-': hand it to the overflow buffer */
1224                     cnv->charErrorBuffer[0]=MINUS;
1225                     cnv->charErrorBufferLength=1;
1226                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1227                     break;
1228                 }
1229             } else {
1230                 /* un-read this character and switch to Unicode Mode */
1231                 --source;
1232                 *target++=AMPERSAND;
1233                 if(offsets!=NULL) {
1234                     *offsets++=sourceIndex;
1235                 }
1236                 inDirectMode=FALSE;
1237                 base64Counter=0;
1238                 goto unicodeMode;
1239             }
1240             --length;
1241         }
1242         if(source<sourceLimit && target>=targetLimit) {
1243             /* target is full */
1244             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1245         }
1246     } else {
1247 unicodeMode:
1248         while(source<sourceLimit) {
1249             if(target<targetLimit) {
1250                 c=*source++;
1251                 if(isLegalIMAP(c)) {
1252                     /* encode directly */
1253                     inDirectMode=TRUE;
1254
1255                     /* trick: back out this character to make this easier */
1256                     --source;
1257
1258                     /* terminate the base64 sequence */
1259                     if(base64Counter!=0) {
1260                         /* write remaining bits for the previous character */
1261                         *target++=TO_BASE64_IMAP(bits);
1262                         if(offsets!=NULL) {
1263                             *offsets++=sourceIndex-1;
1264                         }
1265                     }
1266                     /* need to terminate with a minus */
1267                     if(target<targetLimit) {
1268                         *target++=MINUS;
1269                         if(offsets!=NULL) {
1270                             *offsets++=sourceIndex-1;
1271                         }
1272                     } else {
1273                         cnv->charErrorBuffer[0]=MINUS;
1274                         cnv->charErrorBufferLength=1;
1275                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1276                         break;
1277                     }
1278                     goto directMode;
1279                 } else {
1280                     /*
1281                      * base64 this character:
1282                      * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1283                      * and the bits of this character, each implicitly in UTF-16BE.
1284                      *
1285                      * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1286                      * character to the next. The actual 2 or 4 bits are shifted to the left edge
1287                      * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1288                      */
                         /*
                          * base64Counter cycles 0 -> 1 -> 2 -> 0:
                          * 0 has no leftover bits, 1 has 4 leftover bits, 2 has 2 leftover bits.
                          * In each case the first byte always fits; later bytes go to the
                          * overflow buffer when the target is full.
                          */
1289                     switch(base64Counter) {
1290                     case 0:
1291                         b=(uint8_t)(c>>10);
1292                         *target++=TO_BASE64_IMAP(b);
1293                         if(target<targetLimit) {
1294                             b=(uint8_t)((c>>4)&0x3f);
1295                             *target++=TO_BASE64_IMAP(b);
1296                             if(offsets!=NULL) {
1297                                 *offsets++=sourceIndex;
1298                                 *offsets++=sourceIndex++;
1299                             }
1300                         } else {
1301                             if(offsets!=NULL) {
1302                                 *offsets++=sourceIndex++;
1303                             }
1304                             b=(uint8_t)((c>>4)&0x3f);
1305                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1306                             cnv->charErrorBufferLength=1;
1307                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1308                         }
1309                         bits=(uint8_t)((c&15)<<2);
1310                         base64Counter=1;
1311                         break;
1312                     case 1:
1313                         b=(uint8_t)(bits|(c>>14));
1314                         *target++=TO_BASE64_IMAP(b);
1315                         if(target<targetLimit) {
1316                             b=(uint8_t)((c>>8)&0x3f);
1317                             *target++=TO_BASE64_IMAP(b);
1318                             if(target<targetLimit) {
1319                                 b=(uint8_t)((c>>2)&0x3f);
1320                                 *target++=TO_BASE64_IMAP(b);
1321                                 if(offsets!=NULL) {
1322                                     *offsets++=sourceIndex;
1323                                     *offsets++=sourceIndex;
1324                                     *offsets++=sourceIndex++;
1325                                 }
1326                             } else {
1327                                 if(offsets!=NULL) {
1328                                     *offsets++=sourceIndex;
1329                                     *offsets++=sourceIndex++;
1330                                 }
1331                                 b=(uint8_t)((c>>2)&0x3f);
1332                                 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1333                                 cnv->charErrorBufferLength=1;
1334                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1335                             }
1336                         } else {
1337                             if(offsets!=NULL) {
1338                                 *offsets++=sourceIndex++;
1339                             }
1340                             b=(uint8_t)((c>>8)&0x3f);
1341                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1342                             b=(uint8_t)((c>>2)&0x3f);
1343                             cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1344                             cnv->charErrorBufferLength=2;
1345                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1346                         }
1347                         bits=(uint8_t)((c&3)<<4);
1348                         base64Counter=2;
1349                         break;
1350                     case 2:
1351                         b=(uint8_t)(bits|(c>>12));
1352                         *target++=TO_BASE64_IMAP(b);
1353                         if(target<targetLimit) {
1354                             b=(uint8_t)((c>>6)&0x3f);
1355                             *target++=TO_BASE64_IMAP(b);
1356                             if(target<targetLimit) {
1357                                 b=(uint8_t)(c&0x3f);
1358                                 *target++=TO_BASE64_IMAP(b);
1359                                 if(offsets!=NULL) {
1360                                     *offsets++=sourceIndex;
1361                                     *offsets++=sourceIndex;
1362                                     *offsets++=sourceIndex++;
1363                                 }
1364                             } else {
1365                                 if(offsets!=NULL) {
1366                                     *offsets++=sourceIndex;
1367                                     *offsets++=sourceIndex++;
1368                                 }
1369                                 b=(uint8_t)(c&0x3f);
1370                                 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1371                                 cnv->charErrorBufferLength=1;
1372                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1373                             }
1374                         } else {
1375                             if(offsets!=NULL) {
1376                                 *offsets++=sourceIndex++;
1377                             }
1378                             b=(uint8_t)((c>>6)&0x3f);
1379                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1380                             b=(uint8_t)(c&0x3f);
1381                             cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1382                             cnv->charErrorBufferLength=2;
1383                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1384                         }
1385                         bits=0;
1386                         base64Counter=0;
1387                         break;
1388                     default:
1389                         /* will never occur */
1390                         break;
1391                     }
1392                 }
1393             } else {
1394                 /* target is full */
1395                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1396                 break;
1397             }
1398         }
1399     }
1400
1401     if(pArgs->flush && source>=sourceLimit) {
1402         /* flush remaining bits to the target */
1403         if(!inDirectMode) {
1404             if(base64Counter!=0) {
1405                 if(target<targetLimit) {
1406                     *target++=TO_BASE64_IMAP(bits);
1407                     if(offsets!=NULL) {
1408                         *offsets++=sourceIndex-1;
1409                     }
1410                 } else {
                         /* append after any bytes that the loop above already put into the overflow buffer */
1411                     cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1412                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1413                 }
1414             }
1415             /* need to terminate with a minus */
1416             if(target<targetLimit) {
1417                 *target++=MINUS;
1418                 if(offsets!=NULL) {
1419                     *offsets++=sourceIndex-1;
1420                 }
1421             } else {
1422                 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1423                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1424             }
1425         }
1426         /* reset the state for the next conversion */
1427         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1428     } else {
1429         /* set the converter state back into UConverter */
1430         cnv->fromUnicodeStatus=
1431             (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
1432             ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1433     }
1434
1435     /* write back the updated pointers */
1436     pArgs->source=source;
1437     pArgs->target=(char *)target;
1438     pArgs->offsets=offsets;
1439     return;
1440 }
1441
/*
 * Function table for the IMAP mailbox name converter (UCNV_IMAP_MAILBOX).
 * It reuses _UTF7Open and _UTF7Reset from the UTF-7 converter and installs
 * the IMAP conversion functions, each in two adjacent slots. The slot that
 * holds _UTF7GetName in _UTF7Impl is NULL here.
 * NOTE(review): the initializer is positional and the slot meanings are
 * inferred from the function names; confirm against the UConverterImpl
 * declaration.
 */
1442 static const UConverterImpl _IMAPImpl={
1443     UCNV_IMAP_MAILBOX,
1444
1445     NULL,
1446     NULL,
1447
1448     _UTF7Open,
1449     NULL,
1450     _UTF7Reset,
1451
1452     _IMAPToUnicodeWithOffsets,
1453     _IMAPToUnicodeWithOffsets,
1454     _IMAPFromUnicodeWithOffsets,
1455     _IMAPFromUnicodeWithOffsets,
1456     NULL,
1457
1458     NULL,
1459     NULL,
1460     NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1461     NULL,
1462     ucnv_getCompleteUnicodeSet
1463 };
1464
/*
 * Static description of the IMAP mailbox name converter: structure size,
 * the name "IMAP-mailbox-name", and the UCNV_IBM / UCNV_IMAP_MAILBOX
 * identifiers.
 * NOTE(review): "1, 4" is presumably the minimum and maximum number of
 * bytes per character, and 0x3f ('?') with length 1 the substitution
 * character; confirm against the UConverterStaticData declaration.
 */
1465 static const UConverterStaticData _IMAPStaticData={
1466     sizeof(UConverterStaticData),
1467     "IMAP-mailbox-name",
1468     0, /* TODO CCSID for IMAP-mailbox-name */
1469     UCNV_IBM, UCNV_IMAP_MAILBOX,
1470     1, 4,
1471     { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1472     FALSE, FALSE,
1473     0,
1474     0,
1475     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1476 };
1477
/*
 * Shared data object for the IMAP mailbox name converter; it is not static,
 * so it is visible to other translation units. It ties together the static
 * description (_IMAPStaticData) and the function table (_IMAPImpl).
 * NOTE(review): the meaning of the remaining positional fields (the
 * all-ones value, the NULLs, FALSE and 0) is not visible here; confirm
 * against the UConverterSharedData declaration.
 */
1478 const UConverterSharedData _IMAPData={
1479     sizeof(UConverterSharedData), ~((uint32_t)0),
1480     NULL, NULL, &_IMAPStaticData, FALSE, &_IMAPImpl,
1481     0
1482 };
1483
1484 #endif