icuSources/common/ucnv_u7.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 *   Copyright (C) 2002-2016, International Business Machines
   6 *   Corporation and others.  All Rights Reserved.
   7 **********************************************************************
   8 *   file name:  ucnv_u7.c
   9 *   encoding:   UTF-8
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2002jul01
  14 *   created by: Markus W. Scherer
  15 *
  16 *   UTF-7 converter implementation. Used to be in ucnv_utf.c.
  17 */
  18
  19 #include "unicode/utypes.h"
  20
  21 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
  22
  23 #include "cmemory.h"
  24 #include "unicode/ucnv.h"
  25 #include "ucnv_bld.h"
  26 #include "ucnv_cnv.h"
  27 #include "uassert.h"
  28
  29 /* UTF-7 -------------------------------------------------------------------- */
  30
  31 /*
  32  * UTF-7 is a stateful encoding of Unicode.
  33  * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
  34  * It was intended for use in Internet email systems, using in its bytewise
  35  * encoding only a subset of 7-bit US-ASCII.
  36  * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
  37  * occasionally used.
  38  *
  39  * When converting Unicode to UTF-7, the RFC allows some US-ASCII characters
  40  * to be encoded either directly or in base64. In particular, the characters in set O
  41  * as defined in the RFC (see below) may be encoded directly but are not
  42  * allowed in, e.g., email headers.
  43  * By default, the ICU UTF-7 converter encodes set O directly.
  44  * By choosing the option "version=1", set O will be escaped instead.
  45  * For example:
  46  *     utf7Converter=ucnv_open("UTF-7,version=1");
  47  *
  48  * For details about email headers see RFC 2047.
  49  */
  50
  51 /*
  52  * Tests for US-ASCII characters belonging to character classes
  53  * defined in UTF-7.
  54  *
  55  * Set D (directly encoded characters) consists of the following
  56  * characters: the upper and lower case letters A through Z
  57  * and a through z, the 10 digits 0-9, and the following nine special
  58  * characters (note that "+" and "=" are omitted):
  59  *     '(),-./:?
  60  *
  61  * Set O (optional direct characters) consists of the following
  62  * characters (note that "\" and "~" are omitted):
  63  *     !"#$%&*;<=>@[]^_`{|}
  64  *
  65  * According to the rules in RFC 2152, the byte values for the following
  66  * US-ASCII characters are not used in UTF-7 and are therefore illegal:
  67  * - all C0 control codes except for CR LF TAB
  68  * - BACKSLASH
  69  * - TILDE
  70  * - DEL
  71  * - all codes beyond US-ASCII, i.e. all >127
  72  */
  73 #define inSetD(c) \
  74     ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \
  75      (uint8_t)((c)-48)<10 ||    /* digits */ \
  76      (uint8_t)((c)-39)<3 ||     /* '() */ \
  77      (uint8_t)((c)-44)<4 ||     /* ,-./ */ \
  78      (c)==58 || (c)==63         /* :? */ \
  79     )
  80
     /*
      * inSetD() above and inSetO() below use the same range test:
      * (uint8_t)((c)-lo)<n is true for lo<=c<lo+n, because values below lo
      * wrap around to large unsigned numbers after the cast.
      * Both macros evaluate (c) several times, so the argument must not have
      * side effects. The cast also truncates to 8 bits.
      * NOTE(review): presumably callers only pass byte values (0..255) -- confirm.
      */
  81 #define inSetO(c) \
  82     ((uint8_t)((c)-33)<6 ||         /* !"#$%& */ \
  83      (uint8_t)((c)-59)<4 ||         /* ;<=> */ \
  84      (uint8_t)((c)-93)<4 ||         /* ]^_` */ \
  85      (uint8_t)((c)-123)<3 ||        /* {|} */ \
  86      (c)==42 || (c)==64 || (c)==91  /* *@[ */ \
  87     )
  88
     /* whitespace tests by code point: CR=13, LF=10, TAB=9, SP=32 */
  89 #define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9)
  90 #define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9)
  91
     /* US-ASCII code points of the characters that get special treatment */
  92 #define PLUS  43
  93 #define MINUS 45
  94 #define BACKSLASH 92
  95 #define TILDE 126
  96
  97 /* legal byte values: US-ASCII 32 (space) through 125 ('}') except backslash, plus CR LF TAB; tilde, DEL and values above 127 are rejected */
  98 #define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
  99
 100 /*
      * encode directly sets D and O and CR LF SP TAB
      *
      * Indexed by US-ASCII code (0..127), 16 entries per row; a 1 means the
      * character is written as a single byte instead of being base64-encoded.
      * The 0 entries in the printable range are '+' (0x2b), backslash (0x5c),
      * '~' (0x7e) and DEL (0x7f).
      * Selected by _UTF7FromUnicodeWithOffsets() when the version bits of
      * fromUnicodeStatus are 0.
      */
 101 static const UBool encodeDirectlyMaximum[128]={
 102  /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
 103     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
 104     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 105
 106     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
 107     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 108
 109     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 110     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
 111
 112     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 113     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
 114 };
 115
 116 /*
      * encode directly set D and CR LF SP TAB but not set O
      *
      * Same layout as encodeDirectlyMaximum (index = US-ASCII code, 16 entries
      * per row), but every set O character is 0 here, so it is base64-encoded.
      * Selected by _UTF7FromUnicodeWithOffsets() when the version bits of
      * fromUnicodeStatus are nonzero ("version=1").
      */
 117 static const UBool encodeDirectlyRestricted[128]={
 118  /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
 119     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
 120     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 121
 122     1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
 123     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
 124
 125     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 126     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 127
 128     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 129     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
 130 };
 131
 132 static const uint8_t
     /*
      * Maps a 6-bit value (0..63) to the US-ASCII code of its base64 character:
      * 0..25 -> 'A'..'Z', 26..51 -> 'a'..'z', 52..61 -> '0'..'9', 62 -> '+', 63 -> '/'.
      * Valid indexes are 0..63 only.
      */
 133 toBase64[64]={
 134     /* A-Z */
 135     65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
 136     78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
 137     /* a-z */
 138     97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
 139     110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
 140     /* 0-9 */
 141     48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
 142     /* +/ */
 143     43, 47
 144 };
 145
 146 static const int8_t
     /*
      * Classifies a US-ASCII byte (index 0..127) for the decoder:
      *   0..63  the 6-bit value of a base64 character
      *   -1     legal character that is not base64 and not '-'
      *   -2     the minus sign '-'
      *   -3     illegal byte (C0 controls other than CR LF TAB, backslash, '~', DEL)
      * The table has only 128 entries; callers must range-check the byte
      * before indexing.
      */
 147 fromBase64[128]={
 148     /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
 149     -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
 150     -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
 151
 152     /* general punctuation with + and / and a special value (-2) for - */
 153     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
 154     /* digits */
 155     52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
 156
 157     /* A-Z */
 158     -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
 159     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,
 160
 161     /* a-z */
 162     -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
 163     41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
 164 };
 165
 166 /*
 167  * converter status values:
 168  *
 169  * toUnicodeStatus:
 170  *     24 inDirectMode (boolean)
 171  * 23..16 base64Counter (-1..7)
 172  * 15..0  bits (up to 14 bits incoming base64)
 173  *
 174  * fromUnicodeStatus:
 175  * 31..28 version (0: set O direct  1: set O escaped)
 176  *     24 inDirectMode (boolean)
 177  * 23..16 base64Counter (0..2)
 178  *  7..0  bits (6 bits outgoing base64)
 179  *
 180  */
 181
 182 U_CDECL_BEGIN
 183 static void U_CALLCONV
 184 _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
         /*
          * Puts the selected conversion direction(s) of cnv back into direct mode.
          *
          * cnv:    converter whose toUnicodeStatus/toULength and/or
          *         fromUnicodeStatus are rewritten.
          * choice: selects the direction(s); the two tests below are independent,
          *         so a value can trigger both resets.
          * NOTE(review): presumably UCNV_RESET_BOTH compares lower than
          * UCNV_RESET_TO_UNICODE so that it passes both tests -- confirm against
          * the UConverterResetChoice declaration.
          */
 185     if(choice<=UCNV_RESET_TO_UNICODE) {
 186         /* reset toUnicode */
 187         cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
 188         cnv->toULength=0;
 189     }
 190     if(choice!=UCNV_RESET_TO_UNICODE) {
 191         /* reset fromUnicode */
 192         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
 193     }
 194 }
 195
 196 static void U_CALLCONV
 197 _UTF7Open(UConverter *cnv,
 198           UConverterLoadArgs *pArgs,
 199           UErrorCode *pErrorCode) {
         /*
          * Initializes the UTF-7 state of a newly opened converter.
          *
          * cnv:        converter to initialize.
          * pArgs:      unused.
          * pErrorCode: set to U_ILLEGAL_ARGUMENT_ERROR when the requested
          *             version is greater than 1; otherwise left untouched.
          *
          * For versions 0 and 1 the version number is stored in bits 31..28 of
          * fromUnicodeStatus and both directions are reset.
          */
 200     (void)pArgs;
 201     if(UCNV_GET_VERSION(cnv)<=1) {
 202         /* TODO(markus): Should just use cnv->options rather than copying the version number. */
 203         cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
 204         _UTF7Reset(cnv, UCNV_RESET_BOTH);
 205     } else {
 206         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 207     }
 208 }
 209
 210 static void U_CALLCONV
 211 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 212                           UErrorCode *pErrorCode) {
         /*
          * Converts UTF-7 bytes from pArgs->source into UChars at pArgs->target.
          *
          * pArgs:      supplies converter, source/sourceLimit, target/targetLimit,
          *             offsets and flush. On return source, target and offsets
          *             are advanced past what was consumed and produced.
          *             If offsets is non-NULL, one source index is stored for
          *             each UChar written.
          * pErrorCode: set to U_ILLEGAL_CHAR_FOUND for illegal input and to
          *             U_BUFFER_OVERFLOW_ERROR when the target is full while
          *             input remains.
          *
          * The decoder state (inDirectMode, base64Counter, bits) is loaded from
          * cnv->toUnicodeStatus and stored back before returning. The bytes of
          * the current unfinished or erroneous sequence are kept in
          * cnv->toUBytes, with their count in cnv->toULength.
          */
 213     UConverter *cnv;
 214     const uint8_t *source, *sourceLimit;
 215     UChar *target;
 216     const UChar *targetLimit;
 217     int32_t *offsets;
 218
 219     uint8_t *bytes;
 220     uint8_t byteIndex;
 221
 222     int32_t length, targetCapacity;
 223
 224     /* UTF-7 state */
 225     uint16_t bits;
 226     int8_t base64Counter;
 227     UBool inDirectMode;
 228
 229     int8_t base64Value;
 230
 231     int32_t sourceIndex, nextSourceIndex;
 232
 233     uint8_t b;
 234     /* set up the local pointers */
 235     cnv=pArgs->converter;
 236
 237     source=(const uint8_t *)pArgs->source;
 238     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
 239     target=pArgs->target;
 240     targetLimit=pArgs->targetLimit;
 241     offsets=pArgs->offsets;
 242     /* get the state machine state */
 243     {
 244         uint32_t status=cnv->toUnicodeStatus;
 245         inDirectMode=(UBool)((status>>24)&1);
             /* the cast keeps only bits 23..16; a stored 0xff reads back as -1 */
 246         base64Counter=(int8_t)(status>>16);
 247         bits=(uint16_t)status;
 248     }
 249     bytes=cnv->toUBytes;
 250     byteIndex=cnv->toULength;
 251
 252     /* sourceIndex=-1 if the current character began in the previous buffer */
 253     sourceIndex=byteIndex==0 ? 0 : -1;
 254     nextSourceIndex=0;
 255
 256     if(inDirectMode) {
 257 directMode:
 258         /*
 259          * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
 260          * with their US-ASCII byte values.
 261          * Backslash and Tilde and most control characters are not allowed in UTF-7.
 262          * A plus sign starts Unicode (or "escape") Mode.
 263          *
 264          * In Direct Mode, only the sourceIndex is used.
 265          */
 266         byteIndex=0;
             /* each direct byte yields one UChar, so loop over min(source left, target left) */
 267         length=(int32_t)(sourceLimit-source);
 268         targetCapacity=(int32_t)(targetLimit-target);
 269         if(length>targetCapacity) {
 270             length=targetCapacity;
 271         }
 272         while(length>0) {
 273             b=*source++;
 274             if(!isLegalUTF7(b)) {
 275                 /* illegal */
 276                 bytes[0]=b;
 277                 byteIndex=1;
 278                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 279                 break;
 280             } else if(b!=PLUS) {
 281                 /* write directly encoded character */
 282                 *target++=b;
 283                 if(offsets!=NULL) {
 284                     *offsets++=sourceIndex++;
 285                 }
 286             } else /* PLUS */ {
 287                 /* switch to Unicode mode */
 288                 nextSourceIndex=++sourceIndex;
 289                 inDirectMode=FALSE;
 290                 byteIndex=0;
 291                 bits=0;
 292                 base64Counter=-1;
 293                 goto unicodeMode;
 294             }
 295             --length;
 296         }
 297         if(source<sourceLimit && target>=targetLimit) {
 298             /* target is full */
 299             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 300         }
 301     } else {
 302 unicodeMode:
 303         /*
 304          * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
 305          * The base64 sequence ends with any character that is not in the base64 alphabet.
 306          * A terminating minus sign is consumed.
 307          *
 308          * In Unicode Mode, the sourceIndex has the index to the start of the current
 309          * base64 bytes, while nextSourceIndex is precisely parallel to source,
 310          * keeping the index to the following byte.
 311          * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
 312          */
 313         while(source<sourceLimit) {
 314             if(target<targetLimit) {
 315                 bytes[byteIndex++]=b=*source++;
 316                 ++nextSourceIndex;
 317                 base64Value = -3; /* initialize as illegal */
                     /*
                      * b>=126 is tested first so that fromBase64[] (128 entries) is
                      * never indexed out of range; in that case base64Value keeps
                      * its -3 initializer.
                      */
 318                 if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
 319                     /* either
 320                      * base64Value==-1 for any legal character except base64 and minus sign, or
 321                      * base64Value==-3 for illegal characters:
 322                      * 1. In either case, leave Unicode mode.
 323                      * 2.1. If we ended with an incomplete UChar or none after the +, then
 324                      *      generate an error for the preceding erroneous sequence and deal with
 325                      *      the current (possibly illegal) character next time through.
 326                      * 2.2. Else the current char comes after a complete UChar, which was already
 327                      *      pushed to the output buf, so:
 328                      * 2.2.1. If the current char is legal, just save it for processing next time.
 329                      *        It may be for example, a plus which we need to deal with in direct mode.
 330                      * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
 331                      */
 332                     inDirectMode=TRUE;
 333                     if(base64Counter==-1) {
 334                         /* illegal: + immediately followed by something other than base64 or minus sign */
 335                         /* include the plus sign in the reported sequence, but not the subsequent char */
 336                         --source;
 337                         bytes[0]=PLUS;
 338                         byteIndex=1;
 339                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 340                         break;
 341                     } else if(bits!=0) {
 342                         /* bits are illegally left over, a UChar is incomplete */
 343                         /* don't include current char (legal or illegal) in error seq */
 344                         --source;
 345                         --byteIndex;
 346                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 347                         break;
 348                     } else {
 349                         /* previous UChar was complete */
 350                         if(base64Value==-3) {
 351                             /* current character is illegal, deal with it here */
 352                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 353                             break;
 354                         } else {
 355                             /* un-read the current character in case it is a plus sign */
 356                             --source;
 357                             sourceIndex=nextSourceIndex-1;
 358                             goto directMode;
 359                         }
 360                     }
 361                 } else if(base64Value>=0) {
 362                     /* collect base64 bytes into UChars */
                         /*
                          * base64Counter is the position within a group of 8 base64
                          * characters (8*6 = 48 bits = 3 UChars). A UChar is
                          * completed at positions 2, 5 and 7; the bits of the
                          * current character that do not fit are kept in bits.
                          */
 363                     switch(base64Counter) {
 364                     case -1: /* -1 is immediately after the + */
 365                     case 0:
 366                         bits=base64Value;
 367                         base64Counter=1;
 368                         break;
 369                     case 1:
 370                     case 3:
 371                     case 4:
 372                     case 6:
 373                         bits=(uint16_t)((bits<<6)|base64Value);
 374                         ++base64Counter;
 375                         break;
 376                     case 2:
                             /* 12 collected bits + top 4 bits of this value; keep the low 2 bits */
 377                         *target++=(UChar)((bits<<4)|(base64Value>>2));
 378                         if(offsets!=NULL) {
 379                             *offsets++=sourceIndex;
 380                             sourceIndex=nextSourceIndex-1;
 381                         }
 382                         bytes[0]=b; /* keep this byte in case an error occurs */
 383                         byteIndex=1;
 384                         bits=(uint16_t)(base64Value&3);
 385                         base64Counter=3;
 386                         break;
 387                     case 5:
                             /* 14 collected bits + top 2 bits of this value; keep the low 4 bits */
 388                         *target++=(UChar)((bits<<2)|(base64Value>>4));
 389                         if(offsets!=NULL) {
 390                             *offsets++=sourceIndex;
 391                             sourceIndex=nextSourceIndex-1;
 392                         }
 393                         bytes[0]=b; /* keep this byte in case an error occurs */
 394                         byteIndex=1;
 395                         bits=(uint16_t)(base64Value&15);
 396                         base64Counter=6;
 397                         break;
 398                     case 7:
                             /* 10 collected bits + all 6 bits of this value; nothing is left over */
 399                         *target++=(UChar)((bits<<6)|base64Value);
 400                         if(offsets!=NULL) {
 401                             *offsets++=sourceIndex;
 402                             sourceIndex=nextSourceIndex;
 403                         }
 404                         byteIndex=0;
 405                         bits=0;
 406                         base64Counter=0;
 407                         break;
 408                     default:
 409                         /* will never occur */
 410                         break;
 411                     }
 412                 } else /*base64Value==-2*/ {
 413                     /* minus sign terminates the base64 sequence */
 414                     inDirectMode=TRUE;
 415                     if(base64Counter==-1) {
 416                         /* +- i.e. a minus immediately following a plus */
 417                         *target++=PLUS;
 418                         if(offsets!=NULL) {
 419                             *offsets++=sourceIndex-1;
 420                         }
 421                     } else {
 422                         /* absorb the minus and leave the Unicode Mode */
 423                         if(bits!=0) {
 424                             /* bits are illegally left over, a UChar is incomplete */
 425                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 426                             break;
 427                         }
 428                     }
 429                     sourceIndex=nextSourceIndex;
 430                     goto directMode;
 431                 }
 432             } else {
 433                 /* target is full */
 434                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 435                 break;
 436             }
 437         }
 438     }
 439
 440     if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
 441         /*
 442          * if we are in Unicode mode, then the byteIndex might not be 0,
 443          * but that is ok if bits==0
 444          * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
 445          * (not true for IMAP-mailbox-name where we must end in direct mode)
 446          */
 447         byteIndex=0;
 448     }
 449
 450     /* set the converter state back into UConverter */
         /* base64Counter goes through uint8_t so that -1 occupies only bits 23..16 */
 451     cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
 452     cnv->toULength=byteIndex;
 453
 454     /* write back the updated pointers */
 455     pArgs->source=(const char *)source;
 456     pArgs->target=target;
 457     pArgs->offsets=offsets;
 458     return;
 459 }
 460
 461 static void U_CALLCONV
 462 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
 463                             UErrorCode *pErrorCode) {
         /*
          * Converts UTF-16 code units from pArgs->source into UTF-7 bytes at
          * pArgs->target.
          *
          * pArgs:      supplies converter, source/sourceLimit, target/targetLimit,
          *             offsets and flush. On return source, target and offsets
          *             are advanced. If offsets is non-NULL, one source index is
          *             stored for each byte written to the target.
          * pErrorCode: set to U_BUFFER_OVERFLOW_ERROR when the target is full.
          *             Bytes that were already produced but did not fit are
          *             stored in cnv->charErrorBuffer.
          *
          * In direct mode, characters flagged in the active encodeDirectly[]
          * table are copied as single bytes and '+' is written as "+-". Any
          * other character writes '+' and switches to Unicode mode, where code
          * units are emitted as base64 of their UTF-16BE form. A base64 run is
          * closed with '-' only if the next direct character is itself a base64
          * character or '-' (fromBase64[c]!=-1), and at end of input when flushing.
          *
          * The state is loaded from cnv->fromUnicodeStatus and saved back on
          * return. After a flush at the end of the input it is reset to direct
          * mode with the version bits preserved.
          */
 464     UConverter *cnv;
 465     const UChar *source, *sourceLimit;
 466     uint8_t *target, *targetLimit;
 467     int32_t *offsets;
 468
 469     int32_t length, targetCapacity, sourceIndex;
 470     UChar c;
 471
 472     /* UTF-7 state */
 473     const UBool *encodeDirectly;
 474     uint8_t bits;
 475     int8_t base64Counter;
 476     UBool inDirectMode;
 477
 478     /* get the converter */
 479     cnv=pArgs->converter;
 480
 481     /* set up the local pointers */
 482     source=pArgs->source;
 483     sourceLimit=pArgs->sourceLimit;
 484     target=(uint8_t *)pArgs->target;
 485     targetLimit=(uint8_t *)pArgs->targetLimit;
 486     offsets=pArgs->offsets;
 487
 488     /* get the state machine state */
 489     {
 490         uint32_t status=cnv->fromUnicodeStatus;
             /* version bits 31..28 all zero: set O is encoded directly; otherwise it is escaped */
 491         encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
 492         inDirectMode=(UBool)((status>>24)&1);
 493         base64Counter=(int8_t)(status>>16);
 494         bits=(uint8_t)status;
             /* bits is used as an index into toBase64[], whose valid indexes are 0..length-1 */
 495         U_ASSERT(bits<UPRV_LENGTHOF(toBase64));
 496     }
 497
 498     /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
 499     sourceIndex=0;
 500
 501     if(inDirectMode) {
 502 directMode:
             /* a direct character produces one byte, so loop over min(source left, target left) */
 503         length=(int32_t)(sourceLimit-source);
 504         targetCapacity=(int32_t)(targetLimit-target);
 505         if(length>targetCapacity) {
 506             length=targetCapacity;
 507         }
 508         while(length>0) {
 509             c=*source++;
 510             /* currently always encode CR LF SP TAB directly */
 511             if(c<=127 && encodeDirectly[c]) {
 512                 /* encode directly */
 513                 *target++=(uint8_t)c;
 514                 if(offsets!=NULL) {
 515                     *offsets++=sourceIndex++;
 516                 }
 517             } else if(c==PLUS) {
 518                 /* output +- for + */
 519                 *target++=PLUS;
 520                 if(target<targetLimit) {
 521                     *target++=MINUS;
 522                     if(offsets!=NULL) {
 523                         *offsets++=sourceIndex;
 524                         *offsets++=sourceIndex++;
 525                     }
 526                     /* realign length and targetCapacity */
 527                     goto directMode;
 528                 } else {
 529                     if(offsets!=NULL) {
 530                         *offsets++=sourceIndex++;
 531                     }
 532                     cnv->charErrorBuffer[0]=MINUS;
 533                     cnv->charErrorBufferLength=1;
 534                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 535                     break;
 536                 }
 537             } else {
 538                 /* un-read this character and switch to Unicode Mode */
 539                 --source;
 540                 *target++=PLUS;
 541                 if(offsets!=NULL) {
 542                     *offsets++=sourceIndex;
 543                 }
 544                 inDirectMode=FALSE;
 545                 base64Counter=0;
 546                 goto unicodeMode;
 547             }
 548             --length;
 549         }
 550         if(source<sourceLimit && target>=targetLimit) {
 551             /* target is full */
 552             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 553         }
 554     } else {
 555 unicodeMode:
 556         while(source<sourceLimit) {
 557             if(target<targetLimit) {
 558                 c=*source++;
 559                 if(c<=127 && encodeDirectly[c]) {
 560                     /* encode directly */
 561                     inDirectMode=TRUE;
 562
 563                     /* trick: back out this character to make this easier */
 564                     --source;
 565
 566                     /* terminate the base64 sequence */
 567                     if(base64Counter!=0) {
 568                         /* write remaining bits for the previous character */
 569                         *target++=toBase64[bits];
 570                         if(offsets!=NULL) {
 571                             *offsets++=sourceIndex-1;
 572                         }
 573                     }
 574                     if(fromBase64[c]!=-1) {
 575                         /* need to terminate with a minus */
 576                         if(target<targetLimit) {
 577                             *target++=MINUS;
 578                             if(offsets!=NULL) {
 579                                 *offsets++=sourceIndex-1;
 580                             }
 581                         } else {
 582                             cnv->charErrorBuffer[0]=MINUS;
 583                             cnv->charErrorBufferLength=1;
 584                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 585                             break;
 586                         }
 587                     }
 588                     goto directMode;
 589                 } else {
 590                     /*
 591                      * base64 this character:
 592                      * Output 2 or 3 base64 bytes for the remaining bits of the previous character
 593                      * and the bits of this character, each implicitly in UTF-16BE.
 594                      *
 595                      * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
 596                      * character to the next. The actual 2 or 4 bits are shifted to the left edge
 597                      * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
 598                      */
 599                     switch(base64Counter) {
 600                     case 0:
                             /* no carried bits: emit the top 12 bits, carry the low 4 */
 601                         *target++=toBase64[c>>10];
 602                         if(target<targetLimit) {
 603                             *target++=toBase64[(c>>4)&0x3f];
 604                             if(offsets!=NULL) {
 605                                 *offsets++=sourceIndex;
 606                                 *offsets++=sourceIndex++;
 607                             }
 608                         } else {
 609                             if(offsets!=NULL) {
 610                                 *offsets++=sourceIndex++;
 611                             }
 612                             cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
 613                             cnv->charErrorBufferLength=1;
 614                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 615                         }
 616                         bits=(uint8_t)((c&15)<<2);
 617                         base64Counter=1;
 618                         break;
 619                     case 1:
                             /* 4 carried bits + top 2 bits, then 12 more bits; carry the low 2 */
 620                         *target++=toBase64[bits|(c>>14)];
 621                         if(target<targetLimit) {
 622                             *target++=toBase64[(c>>8)&0x3f];
 623                             if(target<targetLimit) {
 624                                 *target++=toBase64[(c>>2)&0x3f];
 625                                 if(offsets!=NULL) {
 626                                     *offsets++=sourceIndex;
 627                                     *offsets++=sourceIndex;
 628                                     *offsets++=sourceIndex++;
 629                                 }
 630                             } else {
 631                                 if(offsets!=NULL) {
 632                                     *offsets++=sourceIndex;
 633                                     *offsets++=sourceIndex++;
 634                                 }
 635                                 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
 636                                 cnv->charErrorBufferLength=1;
 637                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 638                             }
 639                         } else {
 640                             if(offsets!=NULL) {
 641                                 *offsets++=sourceIndex++;
 642                             }
 643                             cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
 644                             cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
 645                             cnv->charErrorBufferLength=2;
 646                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 647                         }
 648                         bits=(uint8_t)((c&3)<<4);
 649                         base64Counter=2;
 650                         break;
 651                     case 2:
                             /* 2 carried bits + top 4 bits, then the remaining 12 bits; nothing carried */
 652                         *target++=toBase64[bits|(c>>12)];
 653                         if(target<targetLimit) {
 654                             *target++=toBase64[(c>>6)&0x3f];
 655                             if(target<targetLimit) {
 656                                 *target++=toBase64[c&0x3f];
 657                                 if(offsets!=NULL) {
 658                                     *offsets++=sourceIndex;
 659                                     *offsets++=sourceIndex;
 660                                     *offsets++=sourceIndex++;
 661                                 }
 662                             } else {
 663                                 if(offsets!=NULL) {
 664                                     *offsets++=sourceIndex;
 665                                     *offsets++=sourceIndex++;
 666                                 }
 667                                 cnv->charErrorBuffer[0]=toBase64[c&0x3f];
 668                                 cnv->charErrorBufferLength=1;
 669                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 670                             }
 671                         } else {
 672                             if(offsets!=NULL) {
 673                                 *offsets++=sourceIndex++;
 674                             }
 675                             cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
 676                             cnv->charErrorBuffer[1]=toBase64[c&0x3f];
 677                             cnv->charErrorBufferLength=2;
 678                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 679                         }
 680                         bits=0;
 681                         base64Counter=0;
 682                         break;
 683                     default:
 684                         /* will never occur */
 685                         break;
 686                     }
 687                 }
 688             } else {
 689                 /* target is full */
 690                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 691                 break;
 692             }
 693         }
 694     }
 695
 696     if(pArgs->flush && source>=sourceLimit) {
 697         /* flush remaining bits to the target */
 698         if(!inDirectMode) {
 699             if (base64Counter!=0) {
 700                 if(target<targetLimit) {
 701                     *target++=toBase64[bits];
 702                     if(offsets!=NULL) {
 703                         *offsets++=sourceIndex-1;
 704                     }
 705                 } else {
                         /* append: the loop above may already have left bytes in charErrorBuffer */
 706                     cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
 707                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 708                 }
 709             }
 710             /* Add final MINUS to terminate unicodeMode */
 711             if(target<targetLimit) {
 712                 *target++=MINUS;
 713                 if(offsets!=NULL) {
 714                     *offsets++=sourceIndex-1;
 715                 }
 716             } else {
 717                 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
 718                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 719             }
 720         }
 721         /* reset the state for the next conversion */
 722         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
 723     } else {
 724         /* set the converter state back into UConverter */
 725         cnv->fromUnicodeStatus=
 726             (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
 727             ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
 728     }
 729
 730     /* write back the updated pointers */
 731     pArgs->source=source;
 732     pArgs->target=(char *)target;
 733     pArgs->offsets=offsets;
 734     return;
 735 }
 736
 737 static const char * U_CALLCONV
 738 _UTF7GetName(const UConverter *cnv) {
         /*
          * Returns the converter name that matches the version stored in
          * bits 31..28 of cnv->fromUnicodeStatus: "UTF-7,version=1" for
          * version 1, "UTF-7" for every other value.
          * The returned pointer refers to a string literal.
          */
 739     switch(cnv->fromUnicodeStatus>>28) {
 740     case 1:
 741         return "UTF-7,version=1";
 742     default:
 743         return "UTF-7";
 744     }
 745 }
 746 U_CDECL_END
 747
 748 static const UConverterImpl _UTF7Impl={
         /*
          * Function table for the UTF-7 converter.
          * The initializer is positional: the meaning of each slot is fixed by
          * the UConverterImpl declaration, which is not part of this file
          * (presumably ucnv_cnv.h - verify before adding or reordering entries).
          * NULL leaves a slot without an implementation.
          * NOTE(review): the to-Unicode and from-Unicode functions are each
          * listed twice, presumably once for the plain slot and once for the
          * with-offsets slot - confirm against the struct declaration.
          */
 749     UCNV_UTF7,
 750
 751     NULL,
 752     NULL,
 753
 754     _UTF7Open,
 755     NULL,
 756     _UTF7Reset,
 757
 758     _UTF7ToUnicodeWithOffsets,
 759     _UTF7ToUnicodeWithOffsets,
 760     _UTF7FromUnicodeWithOffsets,
 761     _UTF7FromUnicodeWithOffsets,
 762     NULL,
 763
 764     NULL,
 765     _UTF7GetName,
 766     NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
 767     NULL,
 768     ucnv_getCompleteUnicodeSet,
 769
 770     NULL,
 771     NULL
 772 };
 773
 774 static const UConverterStaticData _UTF7StaticData={
         /*
          * Static description of the UTF-7 converter.
          * The initializer is positional; field meanings come from the
          * UConverterStaticData declaration, which is outside this file.
          * NOTE(review): "1, 4" presumably are the minimum and maximum number
          * of bytes per character - confirm against the struct declaration.
          */
 775     sizeof(UConverterStaticData),
 776     "UTF-7",
 777     0, /* TODO CCSID for UTF-7 */
 778     UCNV_IBM, UCNV_UTF7,
 779     1, 4,
 780     { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
 781     FALSE, FALSE,
 782     0,
 783     0,
 784     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 785 };
 786
 787 const UConverterSharedData _UTF7Data=
             /* externally visible UTF-7 converter data: ties _UTF7StaticData to the _UTF7Impl function table */
 788         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
 789
 790 /* IMAP mailbox name encoding ----------------------------------------------- */
 791
 792 /*
 793  * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
 794  * http://www.ietf.org/rfc/rfc2060.txt
 795  *
 796  * 5.1.3.  Mailbox International Naming Convention
 797  *
 798  * By convention, international mailbox names are specified using a
 799  * modified version of the UTF-7 encoding described in [UTF-7].  The
 800  * purpose of these modifications is to correct the following problems
 801  * with UTF-7:
 802  *
 803  *    1) UTF-7 uses the "+" character for shifting; this conflicts with
 804  *       the common use of "+" in mailbox names, in particular USENET
 805  *       newsgroup names.
 806  *
 807  *    2) UTF-7's encoding is BASE64 which uses the "/" character; this
 808  *       conflicts with the use of "/" as a popular hierarchy delimiter.
 809  *
 810  *    3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
 811  *       the use of "\" as a popular hierarchy delimiter.
 812  *
 813  *    4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
 814  *       the use of "~" in some servers as a home directory indicator.
 815  *
 816  *    5) UTF-7 permits multiple alternate forms to represent the same
 817  *       string; in particular, printable US-ASCII characters can be
 818  *       represented in encoded form.
 819  *
 820  * In modified UTF-7, printable US-ASCII characters except for "&"
 821  * represent themselves; that is, characters with octet values 0x20-0x25
 822  * and 0x27-0x7e.  The character "&" (0x26) is represented by the two-
 823  * octet sequence "&-".
 824  *
 825  * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
 826  * Unicode 16-bit octets) are represented in modified BASE64, with a
 827  * further modification from [UTF-7] that "," is used instead of "/".
 828  * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
 829  * character which can represent itself.
 830  *
 831  * "&" is used to shift to modified BASE64 and "-" to shift back to US-
 832  * ASCII.  All names start in US-ASCII, and MUST end in US-ASCII (that
 833  * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
 834  * ").
 835  *
 836  * For example, here is a mailbox name which mixes English, Japanese,
 837  * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
 838  */
 839
 840 /*
 841  * Tests for US-ASCII characters belonging to character classes
 842  * defined in UTF-7.
 843  *
 844  * Set D (directly encoded characters) consists of the following
 845  * characters: the upper and lower case letters A through Z
 846  * and a through z, the 10 digits 0-9, and the following nine special
 847  * characters (note that "+" and "=" are omitted):
 848  *     '(),-./:?
 849  *
 850  * Set O (optional direct characters) consists of the following
 851  * characters (note that "\" and "~" are omitted):
 852  *     !"#$%&*;<=>@[]^_`{|}
 853  *
 854  * According to the rules in RFC 2152, the byte values for the following
 855  * US-ASCII characters are not used in UTF-7 and are therefore illegal:
 856  * - all C0 control codes except for CR LF TAB
 857  * - BACKSLASH
 858  * - TILDE
 859  * - DEL
 860  * - all codes beyond US-ASCII, i.e. all >127
 861  */
 862
 863 /* uses '&' not '+' to start a base64 sequence */
 864 #define AMPERSAND 0x26
     /* ',' takes the place of '/' as the base64 digit with value 63 */
 865 #define COMMA 0x2c
     /* '/' is therefore not a base64 digit here; FROM_BASE64_IMAP() maps it to -1 */
 866 #define SLASH 0x2f
 867
 868 /* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
 869 #define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)
 870
 871 /* direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26 */
     /* the argument is parenthesized so that compound expressions (e.g. x|y) expand correctly */
 872 #define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)
 873
     /*
      * Map a 6-bit value to its base64 byte and back, using the toBase64[] and
      * fromBase64[] tables defined earlier in this file, but with COMMA
      * standing in for the digit 63.
      * Both macros evaluate their argument more than once, so the argument
      * must not have side effects.
      */
 874 #define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
 875 #define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
 876
 877 /*
 878  * converter status values:
 879  *
 880  * toUnicodeStatus:
 881  *     24 inDirectMode (boolean)
 882  * 23..16 base64Counter (-1..7)
 883  * 15..0  bits (up to 14 bits incoming base64)
 884  *
 885  * fromUnicodeStatus:
 886  *     24 inDirectMode (boolean)
 887  * 23..16 base64Counter (0..2)
 888  *  7..0  bits (6 bits outgoing base64)
 889  *
 890  * ignore bits 31..25
 891  */
 892
 893 U_CDECL_BEGIN
 894 static void U_CALLCONV
 895 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 896                           UErrorCode *pErrorCode) {
         /*
          * Converts IMAP mailbox-name bytes (modified UTF-7) to UTF-16.
          *
          * Reads bytes from pArgs->source up to pArgs->sourceLimit and writes
          * UChars to pArgs->target up to pArgs->targetLimit. If pArgs->offsets
          * is not NULL, one source index is written per output UChar.
          *
          * The function is resumable: the mode flag, base64Counter and the
          * collected bits are loaded from cnv->toUnicodeStatus and stored back
          * at exit, and the bytes of the sequence in progress are kept in
          * cnv->toUBytes with their count in cnv->toULength.
          *
          * *pErrorCode is set to
          *   U_ILLEGAL_CHAR_FOUND    for a byte outside 0x20..0x7e, for a
          *                           non-base64 byte in Unicode mode, for a
          *                           base64-encoded character in 0x20..0x7e,
          *                           and for leftover bits before a '-';
          *   U_BUFFER_OVERFLOW_ERROR when the target is full and input remains;
          *   U_TRUNCATED_CHAR_FOUND  when pArgs->flush is set and the input ends
          *                           in Unicode mode with no pending bytes.
          *
          * pArgs->source, pArgs->target and pArgs->offsets are updated on return.
          */
 897     UConverter *cnv;
 898     const uint8_t *source, *sourceLimit;
 899     UChar *target;
 900     const UChar *targetLimit;
 901     int32_t *offsets;
 902
 903     uint8_t *bytes;
 904     uint8_t byteIndex;
 905
 906     int32_t length, targetCapacity;
 907
 908     /* UTF-7 state */
 909     uint16_t bits;
 910     int8_t base64Counter;
 911     UBool inDirectMode;
 912
 913     int8_t base64Value;
 914
 915     int32_t sourceIndex, nextSourceIndex;
 916
 917     UChar c;
 918     uint8_t b;
 919
 920     /* set up the local pointers */
 921     cnv=pArgs->converter;
 922
 923     source=(const uint8_t *)pArgs->source;
 924     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
 925     target=pArgs->target;
 926     targetLimit=pArgs->targetLimit;
 927     offsets=pArgs->offsets;
 928     /* get the state machine state */
 929     {
 930         uint32_t status=cnv->toUnicodeStatus;
 931         inDirectMode=(UBool)((status>>24)&1);
             /* the int8_t cast keeps only status bits 23..16; the 0xff stored at exit for -1 is expected to read back as -1 */
 932         base64Counter=(int8_t)(status>>16);
 933         bits=(uint16_t)status;
 934     }
 935     bytes=cnv->toUBytes;
 936     byteIndex=cnv->toULength;
 937
 938     /* sourceIndex=-1 if the current character began in the previous buffer */
 939     sourceIndex=byteIndex==0 ? 0 : -1;
 940     nextSourceIndex=0;
 941
 942     if(inDirectMode) {
 943 directMode:
 944         /*
 945          * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
 946          * with their US-ASCII byte values.
 947          * An ampersand starts Unicode (or "escape") Mode.
 948          *
 949          * In Direct Mode, only the sourceIndex is used.
 950          */
 951         byteIndex=0;
             /*
              * Each input byte yields at most one UChar here, so limiting the
              * loop count to the smaller of input length and target capacity
              * removes the need for a per-byte capacity check.
              */
 952         length=(int32_t)(sourceLimit-source);
 953         targetCapacity=(int32_t)(targetLimit-target);
 954         if(length>targetCapacity) {
 955             length=targetCapacity;
 956         }
 957         while(length>0) {
 958             b=*source++;
 959             if(!isLegalIMAP(b)) {
 960                 /* illegal */
 961                 bytes[0]=b;
 962                 byteIndex=1;
 963                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 964                 break;
 965             } else if(b!=AMPERSAND) {
 966                 /* write directly encoded character */
 967                 *target++=b;
 968                 if(offsets!=NULL) {
 969                     *offsets++=sourceIndex++;
 970                 }
 971             } else /* AMPERSAND */ {
 972                 /* switch to Unicode mode */
 973                 nextSourceIndex=++sourceIndex;
 974                 inDirectMode=FALSE;
 975                 byteIndex=0;
 976                 bits=0;
 977                 base64Counter=-1;
 978                 goto unicodeMode;
 979             }
 980             --length;
 981         }
 982         if(source<sourceLimit && target>=targetLimit) {
 983             /* target is full */
 984             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 985         }
 986     } else {
 987 unicodeMode:
 988         /*
 989          * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
 990          * The base64 sequence ends with any character that is not in the base64 alphabet.
 991          * A terminating minus sign is consumed.
 992          * US-ASCII must not be base64-ed.
 993          *
 994          * In Unicode Mode, the sourceIndex has the index to the start of the current
 995          * base64 bytes, while nextSourceIndex is precisely parallel to source,
 996          * keeping the index to the following byte.
 997          * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
 998          */
 999         while(source<sourceLimit) {
1000             if(target<targetLimit) {
                     /* record every consumed byte in bytes[] so that the sequence in progress is available if an error occurs */
1001                 bytes[byteIndex++]=b=*source++;
1002                 ++nextSourceIndex;
1003                 if(b>0x7e) {
1004                     /* illegal - test other illegal US-ASCII values by base64Value==-3 */
1005                     inDirectMode=TRUE;
1006                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1007                     break;
1008                 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
1009                     /* collect base64 bytes into UChars */
                         /*
                          * base64Counter counts the base64 bytes seen in the
                          * current group of 8 (8 * 6 bits = 3 UChars); a UChar
                          * is completed at the 3rd, 6th and 8th byte.
                          */
1010                     switch(base64Counter) {
1011                     case -1: /* -1 is immediately after the & */
1012                     case 0:
1013                         bits=base64Value;
1014                         base64Counter=1;
1015                         break;
1016                     case 1:
1017                     case 3:
1018                     case 4:
1019                     case 6:
1020                         bits=(uint16_t)((bits<<6)|base64Value);
1021                         ++base64Counter;
1022                         break;
1023                     case 2:
                             /* 12 collected bits + the top 4 bits of this byte; the low 2 bits carry over */
1024                         c=(UChar)((bits<<4)|(base64Value>>2));
                             /* a character that can be written directly must not be base64-encoded */
1025                         if(isLegalIMAP(c)) {
1026                             /* illegal */
1027                             inDirectMode=TRUE;
1028                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1029                             goto endloop;
1030                         }
1031                         *target++=c;
1032                         if(offsets!=NULL) {
1033                             *offsets++=sourceIndex;
1034                             sourceIndex=nextSourceIndex-1;
1035                         }
1036                         bytes[0]=b; /* keep this byte in case an error occurs */
1037                         byteIndex=1;
1038                         bits=(uint16_t)(base64Value&3);
1039                         base64Counter=3;
1040                         break;
1041                     case 5:
                             /* 14 collected bits + the top 2 bits of this byte; the low 4 bits carry over */
1042                         c=(UChar)((bits<<2)|(base64Value>>4));
1043                         if(isLegalIMAP(c)) {
1044                             /* illegal */
1045                             inDirectMode=TRUE;
1046                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1047                             goto endloop;
1048                         }
1049                         *target++=c;
1050                         if(offsets!=NULL) {
1051                             *offsets++=sourceIndex;
1052                             sourceIndex=nextSourceIndex-1;
1053                         }
1054                         bytes[0]=b; /* keep this byte in case an error occurs */
1055                         byteIndex=1;
1056                         bits=(uint16_t)(base64Value&15);
1057                         base64Counter=6;
1058                         break;
1059                     case 7:
                             /* 10 collected bits + all 6 bits of this byte; nothing carries over */
1060                         c=(UChar)((bits<<6)|base64Value);
1061                         if(isLegalIMAP(c)) {
1062                             /* illegal */
1063                             inDirectMode=TRUE;
1064                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1065                             goto endloop;
1066                         }
1067                         *target++=c;
1068                         if(offsets!=NULL) {
1069                             *offsets++=sourceIndex;
1070                             sourceIndex=nextSourceIndex;
1071                         }
1072                         byteIndex=0;
1073                         bits=0;
1074                         base64Counter=0;
1075                         break;
1076                     default:
1077                         /* will never occur */
1078                         break;
1079                     }
1080                 } else if(base64Value==-2) {
1081                     /* minus sign terminates the base64 sequence */
1082                     inDirectMode=TRUE;
1083                     if(base64Counter==-1) {
1084                         /* &- i.e. a minus immediately following an ampersand */
1085                         *target++=AMPERSAND;
1086                         if(offsets!=NULL) {
1087                             *offsets++=sourceIndex-1;
1088                         }
1089                     } else {
1090                         /* absorb the minus and leave the Unicode Mode */
1091                         if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1092                             /* bits are illegally left over, a UChar is incomplete */
1093                             /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1094                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1095                             break;
1096                         }
1097                     }
1098                     sourceIndex=nextSourceIndex;
1099                     goto directMode;
1100                 } else {
1101                     if(base64Counter==-1) {
1102                         /* illegal: & immediately followed by something other than base64 or minus sign */
1103                         /* include the ampersand in the reported sequence */
1104                         --sourceIndex;
1105                         bytes[0]=AMPERSAND;
1106                         bytes[1]=b;
1107                         byteIndex=2;
1108                     }
1109                     /* base64Value==-1 for characters that are illegal only in Unicode mode */
1110                     /* base64Value==-3 for illegal characters */
1111                     /* illegal */
1112                     inDirectMode=TRUE;
1113                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1114                     break;
1115                 }
1116             } else {
1117                 /* target is full */
1118                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1119                 break;
1120             }
1121         }
1122     }
1123 endloop:
1124
1125     /*
1126      * the end of the input stream and detection of truncated input
1127      * are handled by the framework, but here we must check if we are in Unicode
1128      * mode and byteIndex==0 because we must end in direct mode
1129      *
1130      * conditions:
1131      *   successful
1132      *   in Unicode mode and byteIndex==0
1133      *   end of input and no truncated input
1134      */
1135     if( U_SUCCESS(*pErrorCode) &&
1136         !inDirectMode && byteIndex==0 &&
1137         pArgs->flush && source>=sourceLimit
1138     ) {
1139         if(base64Counter==-1) {
1140             /* & at the very end of the input */
1141             /* make the ampersand the reported sequence */
1142             bytes[0]=AMPERSAND;
1143             byteIndex=1;
1144         }
1145         /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1146
1147         inDirectMode=TRUE; /* avoid looping */
1148         *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1149     }
1150
1151     /* set the converter state back into UConverter */
         /* base64Counter goes through uint8_t so that -1 occupies only bits 23..16 and does not spill into the mode flag */
1152     cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1153     cnv->toULength=byteIndex;
1154
1155     /* write back the updated pointers */
1156     pArgs->source=(const char *)source;
1157     pArgs->target=target;
1158     pArgs->offsets=offsets;
1159     return;
1160 }
1161
1162 static void U_CALLCONV
1163 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1164                             UErrorCode *pErrorCode) {
         /*
          * Converts UTF-16 to IMAP mailbox-name bytes (modified UTF-7).
          *
          * Reads UChars from pArgs->source up to pArgs->sourceLimit and writes
          * bytes to pArgs->target up to pArgs->targetLimit. If pArgs->offsets
          * is not NULL, one source index is written per output byte that goes
          * to the target.
          *
          * Direct mode: 0x20..0x7e except '&' are copied, '&' is written as
          * "&-", and any other code unit writes '&' and switches to Unicode
          * mode.
          * Unicode mode: each code unit is appended to a base64 bit stream;
          * base64Counter (0..2) records whether 0, 4 or 2 bits of the previous
          * code unit are still held, left-aligned, in "bits". A code unit in
          * 0x20..0x7e flushes those bits, writes '-' and returns to direct mode.
          *
          * Bytes that belong to an already started output sequence but do not
          * fit into the target are stored in cnv->charErrorBuffer, and
          * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR.
          *
          * When pArgs->flush is set and all input was consumed, an open base64
          * sequence is terminated and cnv->fromUnicodeStatus is reset to direct
          * mode; otherwise the current state is saved there. In both cases the
          * top 4 bits of cnv->fromUnicodeStatus are preserved.
          *
          * pArgs->source, pArgs->target and pArgs->offsets are updated on return.
          */
1165     UConverter *cnv;
1166     const UChar *source, *sourceLimit;
1167     uint8_t *target, *targetLimit;
1168     int32_t *offsets;
1169
1170     int32_t length, targetCapacity, sourceIndex;
1171     UChar c;
1172     uint8_t b;
1173
1174     /* UTF-7 state */
1175     uint8_t bits;
1176     int8_t base64Counter;
1177     UBool inDirectMode;
1178
1179     /* set up the local pointers */
1180     cnv=pArgs->converter;
1181
1182     /* set up the local pointers */
1183     source=pArgs->source;
1184     sourceLimit=pArgs->sourceLimit;
1185     target=(uint8_t *)pArgs->target;
1186     targetLimit=(uint8_t *)pArgs->targetLimit;
1187     offsets=pArgs->offsets;
1188
1189     /* get the state machine state */
1190     {
1191         uint32_t status=cnv->fromUnicodeStatus;
1192         inDirectMode=(UBool)((status>>24)&1);
1193         base64Counter=(int8_t)(status>>16);
1194         bits=(uint8_t)status;
1195     }
1196
1197     /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1198     sourceIndex=0;
1199
1200     if(inDirectMode) {
1201 directMode:
             /*
              * The loop count is the smaller of input length and target
              * capacity, so every iteration may write one byte unchecked;
              * only a second byte for the same code unit needs its own check.
              */
1202         length=(int32_t)(sourceLimit-source);
1203         targetCapacity=(int32_t)(targetLimit-target);
1204         if(length>targetCapacity) {
1205             length=targetCapacity;
1206         }
1207         while(length>0) {
1208             c=*source++;
1209             /* encode 0x20..0x7e except '&' directly */
1210             if(inSetDIMAP(c)) {
1211                 /* encode directly */
1212                 *target++=(uint8_t)c;
1213                 if(offsets!=NULL) {
1214                     *offsets++=sourceIndex++;
1215                 }
1216             } else if(c==AMPERSAND) {
1217                 /* output &- for & */
1218                 *target++=AMPERSAND;
1219                 if(target<targetLimit) {
1220                     *target++=MINUS;
1221                     if(offsets!=NULL) {
1222                         *offsets++=sourceIndex;
1223                         *offsets++=sourceIndex++;
1224                     }
1225                     /* realign length and targetCapacity */
1226                     goto directMode;
1227                 } else {
1228                     if(offsets!=NULL) {
1229                         *offsets++=sourceIndex++;
1230                     }
1231                     cnv->charErrorBuffer[0]=MINUS;
1232                     cnv->charErrorBufferLength=1;
1233                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1234                     break;
1235                 }
1236             } else {
1237                 /* un-read this character and switch to Unicode Mode */
1238                 --source;
1239                 *target++=AMPERSAND;
1240                 if(offsets!=NULL) {
1241                     *offsets++=sourceIndex;
1242                 }
1243                 inDirectMode=FALSE;
1244                 base64Counter=0;
1245                 goto unicodeMode;
1246             }
1247             --length;
1248         }
1249         if(source<sourceLimit && target>=targetLimit) {
1250             /* target is full */
1251             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1252         }
1253     } else {
1254 unicodeMode:
1255         while(source<sourceLimit) {
1256             if(target<targetLimit) {
1257                 c=*source++;
1258                 if(isLegalIMAP(c)) {
1259                     /* encode directly */
1260                     inDirectMode=TRUE;
1261
1262                     /* trick: back out this character to make this easier */
1263                     --source;
1264
1265                     /* terminate the base64 sequence */
1266                     if(base64Counter!=0) {
1267                         /* write remaining bits for the previous character */
1268                         *target++=TO_BASE64_IMAP(bits);
1269                         if(offsets!=NULL) {
1270                             *offsets++=sourceIndex-1;
1271                         }
1272                     }
1273                     /* need to terminate with a minus */
1274                     if(target<targetLimit) {
1275                         *target++=MINUS;
1276                         if(offsets!=NULL) {
1277                             *offsets++=sourceIndex-1;
1278                         }
1279                     } else {
1280                         cnv->charErrorBuffer[0]=MINUS;
1281                         cnv->charErrorBufferLength=1;
1282                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1283                         break;
1284                     }
1285                     goto directMode;
1286                 } else {
1287                     /*
1288                      * base64 this character:
1289                      * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1290                      * and the bits of this character, each implicitly in UTF-16BE.
1291                      *
1292                      * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1293                      * character to the next. The actual 2 or 4 bits are shifted to the left edge
1294                      * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1295                      */
                         /*
                          * In each case the first byte is written unchecked
                          * (the enclosing if guarantees room for it); the
                          * following bytes go to cnv->charErrorBuffer when the
                          * target is full. No break is needed after such an
                          * overflow: the next loop iteration finds the target
                          * full and stops.
                          */
1296                     switch(base64Counter) {
1297                     case 0:
                             /* nothing held over: write bits 15..10 and 9..4, keep bits 3..0 */
1298                         b=(uint8_t)(c>>10);
1299                         *target++=TO_BASE64_IMAP(b);
1300                         if(target<targetLimit) {
1301                             b=(uint8_t)((c>>4)&0x3f);
1302                             *target++=TO_BASE64_IMAP(b);
1303                             if(offsets!=NULL) {
1304                                 *offsets++=sourceIndex;
1305                                 *offsets++=sourceIndex++;
1306                             }
1307                         } else {
1308                             if(offsets!=NULL) {
1309                                 *offsets++=sourceIndex++;
1310                             }
1311                             b=(uint8_t)((c>>4)&0x3f);
1312                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1313                             cnv->charErrorBufferLength=1;
1314                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1315                         }
1316                         bits=(uint8_t)((c&15)<<2);
1317                         base64Counter=1;
1318                         break;
1319                     case 1:
                             /* 4 bits held over: combine with bits 15..14, write 13..8 and 7..2, keep bits 1..0 */
1320                         b=(uint8_t)(bits|(c>>14));
1321                         *target++=TO_BASE64_IMAP(b);
1322                         if(target<targetLimit) {
1323                             b=(uint8_t)((c>>8)&0x3f);
1324                             *target++=TO_BASE64_IMAP(b);
1325                             if(target<targetLimit) {
1326                                 b=(uint8_t)((c>>2)&0x3f);
1327                                 *target++=TO_BASE64_IMAP(b);
1328                                 if(offsets!=NULL) {
1329                                     *offsets++=sourceIndex;
1330                                     *offsets++=sourceIndex;
1331                                     *offsets++=sourceIndex++;
1332                                 }
1333                             } else {
1334                                 if(offsets!=NULL) {
1335                                     *offsets++=sourceIndex;
1336                                     *offsets++=sourceIndex++;
1337                                 }
1338                                 b=(uint8_t)((c>>2)&0x3f);
1339                                 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1340                                 cnv->charErrorBufferLength=1;
1341                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1342                             }
1343                         } else {
1344                             if(offsets!=NULL) {
1345                                 *offsets++=sourceIndex++;
1346                             }
1347                             b=(uint8_t)((c>>8)&0x3f);
1348                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1349                             b=(uint8_t)((c>>2)&0x3f);
1350                             cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1351                             cnv->charErrorBufferLength=2;
1352                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1353                         }
1354                         bits=(uint8_t)((c&3)<<4);
1355                         base64Counter=2;
1356                         break;
1357                     case 2:
                             /* 2 bits held over: combine with bits 15..12, write 11..6 and 5..0, nothing is kept */
1358                         b=(uint8_t)(bits|(c>>12));
1359                         *target++=TO_BASE64_IMAP(b);
1360                         if(target<targetLimit) {
1361                             b=(uint8_t)((c>>6)&0x3f);
1362                             *target++=TO_BASE64_IMAP(b);
1363                             if(target<targetLimit) {
1364                                 b=(uint8_t)(c&0x3f);
1365                                 *target++=TO_BASE64_IMAP(b);
1366                                 if(offsets!=NULL) {
1367                                     *offsets++=sourceIndex;
1368                                     *offsets++=sourceIndex;
1369                                     *offsets++=sourceIndex++;
1370                                 }
1371                             } else {
1372                                 if(offsets!=NULL) {
1373                                     *offsets++=sourceIndex;
1374                                     *offsets++=sourceIndex++;
1375                                 }
1376                                 b=(uint8_t)(c&0x3f);
1377                                 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1378                                 cnv->charErrorBufferLength=1;
1379                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1380                             }
1381                         } else {
1382                             if(offsets!=NULL) {
1383                                 *offsets++=sourceIndex++;
1384                             }
1385                             b=(uint8_t)((c>>6)&0x3f);
1386                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1387                             b=(uint8_t)(c&0x3f);
1388                             cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1389                             cnv->charErrorBufferLength=2;
1390                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1391                         }
1392                         bits=0;
1393                         base64Counter=0;
1394                         break;
1395                     default:
1396                         /* will never occur */
1397                         break;
1398                     }
1399                 }
1400             } else {
1401                 /* target is full */
1402                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1403                 break;
1404             }
1405         }
1406     }
1407
1408     if(pArgs->flush && source>=sourceLimit) {
1409         /* flush remaining bits to the target */
1410         if(!inDirectMode) {
1411             if(base64Counter!=0) {
1412                 if(target<targetLimit) {
1413                     *target++=TO_BASE64_IMAP(bits);
1414                     if(offsets!=NULL) {
1415                         *offsets++=sourceIndex-1;
1416                     }
1417                 } else {
                         /* appended after any bytes that the conversion loop already placed in charErrorBuffer */
1418                     cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1419                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1420                 }
1421             }
1422             /* need to terminate with a minus */
1423             if(target<targetLimit) {
1424                 *target++=MINUS;
1425                 if(offsets!=NULL) {
1426                     *offsets++=sourceIndex-1;
1427                 }
1428             } else {
1429                 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1430                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1431             }
1432         }
1433         /* reset the state for the next conversion */
1434         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1435     } else {
1436         /* set the converter state back into UConverter */
1437         cnv->fromUnicodeStatus=
1438             (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
1439             ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1440     }
1441
1442     /* write back the updated pointers */
1443     pArgs->source=source;
1444     pArgs->target=(char *)target;
1445     pArgs->offsets=offsets;
1446     return;
1447 }
1448 U_CDECL_END
1449
1450 static const UConverterImpl _IMAPImpl={
         /*
          * Function table for the IMAP mailbox-name converter.
          * The initializer is positional: the meaning of each slot is fixed by
          * the UConverterImpl declaration, which is not part of this file
          * (presumably ucnv_cnv.h - verify before adding or reordering entries).
          * Open and reset reuse the UTF-7 functions; the conversion functions
          * are the IMAP-specific ones. Unlike _UTF7Impl, the slot that holds
          * _UTF7GetName there is NULL here.
          * NOTE(review): each conversion function is listed twice, presumably
          * for the plain and the with-offsets slot - confirm against the
          * struct declaration.
          */
1451     UCNV_IMAP_MAILBOX,
1452
1453     NULL,
1454     NULL,
1455
1456     _UTF7Open,
1457     NULL,
1458     _UTF7Reset,
1459
1460     _IMAPToUnicodeWithOffsets,
1461     _IMAPToUnicodeWithOffsets,
1462     _IMAPFromUnicodeWithOffsets,
1463     _IMAPFromUnicodeWithOffsets,
1464     NULL,
1465
1466     NULL,
1467     NULL,
1468     NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1469     NULL,
1470     ucnv_getCompleteUnicodeSet,
1471     NULL,
1472     NULL
1473 };
1474
1475 static const UConverterStaticData _IMAPStaticData={
         /*
          * Static description of the IMAP mailbox-name converter.
          * The initializer is positional; field meanings come from the
          * UConverterStaticData declaration, which is outside this file.
          * NOTE(review): "1, 4" presumably are the minimum and maximum number
          * of bytes per character - confirm against the struct declaration.
          */
1476     sizeof(UConverterStaticData),
1477     "IMAP-mailbox-name",
1478     0, /* TODO CCSID for IMAP-mailbox-name */
1479     UCNV_IBM, UCNV_IMAP_MAILBOX,
1480     1, 4,
1481     { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1482     FALSE, FALSE,
1483     0,
1484     0,
1485     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1486 };
1487
1488 const UConverterSharedData _IMAPData=
             /* externally visible IMAP mailbox-name converter data: ties _IMAPStaticData to the _IMAPImpl function table */
1489         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl);
1490
1491 #endif