icuSources/common/ucnv_u7.c

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2002-2004, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnv_u7.c
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2002jul01
  12 *   created by: Markus W. Scherer
  13 *
  14 *   UTF-7 converter implementation. Used to be in ucnv_utf.c.
  15 */
  16
  17 #include "unicode/utypes.h"
  18
  19 #if !UCONFIG_NO_CONVERSION
  20
  21 #include "unicode/ucnv.h"
  22 #include "ucnv_bld.h"
  23 #include "ucnv_cnv.h"
  24
  25 /* UTF-7 -------------------------------------------------------------------- */
  26
  27 /*
  28  * UTF-7 is a stateful encoding of Unicode.
  29  * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
  30  * It was intended for use in Internet email systems, using in its bytewise
  31  * encoding only a subset of 7-bit US-ASCII.
  32  * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
  33  * occasionally used.
  34  *
  35  * When converting Unicode to UTF-7, the RFC allows some US-ASCII characters
  36  * to be encoded either directly or in base64. In particular, the characters
  37  * in set O as defined in the RFC (see below) may be encoded directly but are
  38  * not allowed in, e.g., email headers.
  39  * By default, the ICU UTF-7 converter encodes set O directly.
  40  * By choosing the option "version=1", set O will be escaped instead.
  41  * For example:
  42  *     utf7Converter=ucnv_open("UTF-7,version=1");
  43  *
  44  * For details about email headers see RFC 2047.
  45  */
  46
  47 /*
  48  * Tests for US-ASCII characters belonging to character classes
  49  * defined in UTF-7.
  50  *
  51  * Set D (directly encoded characters) consists of the following
  52  * characters: the upper and lower case letters A through Z
  53  * and a through z, the 10 digits 0-9, and the following nine special
  54  * characters (note that "+" and "=" are omitted):
  55  *     '(),-./:?
  56  *
  57  * Set O (optional direct characters) consists of the following
  58  * characters (note that "\" and "~" are omitted):
  59  *     !"#$%&*;<=>@[]^_`{|}
  60  *
  61  * According to the rules in RFC 2152, the byte values for the following
  62  * US-ASCII characters are not used in UTF-7 and are therefore illegal:
  63  * - all C0 control codes except for CR LF TAB
  64  * - BACKSLASH
  65  * - TILDE
  66  * - DEL
  67  * - all codes beyond US-ASCII, i.e. all >127
  68  */
  /*
   * Nonzero if c is a member of RFC 2152 Set D: the US-ASCII letters, the
   * digits, and the nine punctuation characters '(),-./:?
   * Each range test subtracts the first code of the range and compares the
   * result as uint8_t, so codes below the range wrap around to large values.
   * Note: the argument is evaluated more than once.
   */
  #define inSetD(c) ( \
      (uint8_t)((c)-0x61)<26 ||   /* a-z */ \
      (uint8_t)((c)-0x41)<26 ||   /* A-Z */ \
      (uint8_t)((c)-0x30)<10 ||   /* 0-9 */ \
      (uint8_t)((c)-0x27)<3 ||    /* '() */ \
      (uint8_t)((c)-0x2c)<4 ||    /* ,-./ */ \
      (c)==0x3a ||                /* : */ \
      (c)==0x3f                   /* ? */ \
  )
  76
  /*
   * Nonzero if c is a member of RFC 2152 Set O (optional direct characters):
   *     !"#$%&*;<=>@[]^_`{|}
   * Uses the same subtract-and-compare-as-uint8_t range test as inSetD.
   * Note: the argument is evaluated more than once.
   */
  #define inSetO(c) ( \
      (uint8_t)((c)-0x21)<6 ||    /* !"#$%& */ \
      (uint8_t)((c)-0x3b)<4 ||    /* ;<=> */ \
      (uint8_t)((c)-0x5d)<4 ||    /* ]^_` */ \
      (uint8_t)((c)-0x7b)<3 ||    /* {|} */ \
      (c)==0x2a ||                /* * */ \
      (c)==0x40 ||                /* @ */ \
      (c)==0x5b                   /* [ */ \
  )
  84
  /* the three C0 control codes that UTF-7 permits: CR, LF, TAB */
  #define isCRLFTAB(c) ((c)==0x0d || (c)==0x0a || (c)==0x09)
  /* the same three codes plus SPACE */
  #define isCRLFSPTAB(c) ((c)==0x20 || (c)==0x0d || (c)==0x0a || (c)==0x09)

  /* US-ASCII codes of the characters with a special role in UTF-7 */
  #define PLUS  0x2b
  #define MINUS 0x2d
  #define BACKSLASH 0x5c
  #define TILDE 0x7e

  /*
   * A byte is legal in UTF-7 if it lies in 0x20..0x7d (SPACE up to, but not
   * including, TILDE) and is not a BACKSLASH, or if it is CR, LF or TAB.
   */
  #define isLegalUTF7(c) (((uint8_t)((c)-0x20)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
  95
  96 /* encode directly sets D and O and CR LF SP TAB */
  97 static const UBool encodeDirectlyMaximum[128]={
  98  /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
  99     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
 100     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 101
 102     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
 103     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 104
 105     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 106     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
 107
 108     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 109     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
 110 };
 111
 112 /* encode directly set D and CR LF SP TAB but not set O */
 113 static const UBool encodeDirectlyRestricted[128]={
 114  /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
 115     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
 116     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 117
 118     1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
 119     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
 120
 121     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 122     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 123
 124     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 125     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
 126 };
 127
 /*
  * Base64 alphabet: maps a 6-bit value (0..63) to the US-ASCII code of its
  * base64 digit, in the order A-Z, a-z, 0-9, '+', '/'.
  */
 static const uint8_t toBase64[64]={
     /*  0.. 7  A-H       */  65,  66,  67,  68,  69,  70,  71,  72,
     /*  8..15  I-P       */  73,  74,  75,  76,  77,  78,  79,  80,
     /* 16..23  Q-X       */  81,  82,  83,  84,  85,  86,  87,  88,
     /* 24..31  Y-Z, a-f  */  89,  90,  97,  98,  99, 100, 101, 102,
     /* 32..39  g-n       */ 103, 104, 105, 106, 107, 108, 109, 110,
     /* 40..47  o-v       */ 111, 112, 113, 114, 115, 116, 117, 118,
     /* 48..55  w-z, 0-3  */ 119, 120, 121, 122,  48,  49,  50,  51,
     /* 56..63  4-9, +, / */  52,  53,  54,  55,  56,  57,  43,  47
 };
 141
 /*
  * Classifies a US-ASCII byte (0..127) for the base64 decoder:
  *   0..63  value of a base64 digit (A-Z, a-z, 0-9, '+', '/')
  *   -1     legal UTF-7 byte that is not a base64 digit (includes CR LF TAB)
  *   -2     the minus sign
  *   -3     byte that is illegal in UTF-7
  */
 static const int8_t fromBase64[128]={
     /* 0x0_: C0 controls; TAB, LF and CR are legal */
     -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
     /* 0x1_: C0 controls, all illegal */
     -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
     /* 0x2_: punctuation; '+' is digit 62, '-' is -2, '/' is digit 63 */
     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
     /* 0x3_: 0-9 are digits 52..61, then :;<=>? */
     52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
     /* 0x4_: @ then A-O as digits 0..14 */
     -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
     /* 0x5_: P-Z as digits 15..25, then [ \ ] ^ _ with backslash illegal */
     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,
     /* 0x6_: ` then a-o as digits 26..40 */
     -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
     /* 0x7_: p-z as digits 41..51, then { | }, with tilde and DEL illegal */
     41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
 };
 161
 162 /*
 163  * converter status values:
 164  *
 165  * toUnicodeStatus:
 166  *     24 inDirectMode (boolean)
 167  * 23..16 base64Counter (-1..7)
 168  * 15..0  bits (up to 14 bits incoming base64)
 169  *
 170  * fromUnicodeStatus:
 171  * 31..28 version (0: set O direct  1: set O escaped)
 172  *     24 inDirectMode (boolean)
 173  * 23..16 base64Counter (0..2)
 174  *  7..0  bits (6 bits outgoing base64)
 175  *
 176  */
 177
 178 static void
 179 _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
 180     if(choice<=UCNV_RESET_TO_UNICODE) {
 181         /* reset toUnicode */
 182         cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
 183         cnv->toULength=0;
 184     }
 185     if(choice!=UCNV_RESET_TO_UNICODE) {
 186         /* reset fromUnicode */
 187         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
 188     }
 189 }
 190
 191 static void
 192 _UTF7Open(UConverter *cnv,
 193           const char *name,
 194           const char *locale,
 195           uint32_t options,
 196           UErrorCode *pErrorCode) {
 197     if((options&0xf)<=1) {
 198         cnv->fromUnicodeStatus=(options&0xf)<<28;
 199         _UTF7Reset(cnv, UCNV_RESET_BOTH);
 200     } else {
 201         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 202     }
 203 }
 204
 205 static void
 206 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 207                           UErrorCode *pErrorCode) {
         /*
          * Decodes UTF-7 bytes from pArgs->source into UTF-16 code units at
          * pArgs->target. If pArgs->offsets is not NULL, one source index is
          * recorded there for every code unit written.
          *
          * The decoder state (inDirectMode, base64Counter, bits) is unpacked
          * from cnv->toUnicodeStatus on entry and packed back on exit; the bytes
          * of the sequence currently being processed are kept in
          * cnv->toUBytes / cnv->toULength.
          *
          * *pErrorCode is set to U_ILLEGAL_CHAR_FOUND when a byte or sequence is
          * not acceptable, and to U_BUFFER_OVERFLOW_ERROR when the target is full
          * while source bytes remain. pArgs->source, pArgs->target and
          * pArgs->offsets are advanced past what was processed.
          */
 208     UConverter *cnv;
 209     const uint8_t *source, *sourceLimit;
 210     UChar *target;
 211     const UChar *targetLimit;
 212     int32_t *offsets;
 213
 214     uint8_t *bytes;
 215     uint8_t byteIndex;
 216
 217     int32_t length, targetCapacity;
 218
 219     /* UTF-7 state */
         /* bits: base64 payload bits collected but not yet output as a UChar */
 220     uint16_t bits;
         /* base64Counter: position inside the current group of 8 base64 digits; -1 right after the plus */
 221     int8_t base64Counter;
 222     UBool inDirectMode;
 223
         /* base64Value: the fromBase64[] classification of the current byte */
 224     int8_t base64Value;
 225
 226     int32_t sourceIndex, nextSourceIndex;
 227
 228     uint8_t b;
 229     /* set up the local pointers */
 230     cnv=pArgs->converter;
 231
 232     source=(const uint8_t *)pArgs->source;
 233     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
 234     target=pArgs->target;
 235     targetLimit=pArgs->targetLimit;
 236     offsets=pArgs->offsets;
 237     /* get the state machine state */
 238     {
 239         uint32_t status=cnv->toUnicodeStatus;
 240         inDirectMode=(UBool)((status>>24)&1);
 241         base64Counter=(int8_t)(status>>16);
 242         bits=(uint16_t)status;
 243     }
 244     bytes=cnv->toUBytes;
 245     byteIndex=cnv->toULength;
 246
 247     /* sourceIndex=-1 if the current character began in the previous buffer */
 248     sourceIndex=byteIndex==0 ? 0 : -1;
 249     nextSourceIndex=0;
 250
 251     if(inDirectMode) {
 252 directMode:
 253         /*
 254          * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
 255          * with their US-ASCII byte values.
 256          * Backslash and Tilde and most control characters are not allowed in UTF-7.
 257          * A plus sign starts Unicode (or "escape") Mode.
 258          *
 259          * In Direct Mode, only the sourceIndex is used.
 260          */
 261         byteIndex=0;
             /*
              * Every directly encoded byte yields exactly one UChar, so the loop
              * is bounded by the smaller of the remaining source length and the
              * remaining target capacity; no per-byte target check is needed.
              */
 262         length=sourceLimit-source;
 263         targetCapacity=targetLimit-target;
 264         if(length>targetCapacity) {
 265             length=targetCapacity;
 266         }
 267         while(length>0) {
 268             b=*source++;
 269             if(!isLegalUTF7(b)) {
 270                 /* illegal */
 271                 bytes[0]=b;
 272                 byteIndex=1;
 273                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 274                 break;
 275             } else if(b!=PLUS) {
 276                 /* write directly encoded character */
 277                 *target++=b;
 278                 if(offsets!=NULL) {
 279                     *offsets++=sourceIndex++;
 280                 }
 281             } else /* PLUS */ {
 282                 /* switch to Unicode mode */
 283                 nextSourceIndex=++sourceIndex;
 284                 inDirectMode=FALSE;
 285                 byteIndex=0;
 286                 bits=0;
 287                 base64Counter=-1;
 288                 goto unicodeMode;
 289             }
 290             --length;
 291         }
 292         if(source<sourceLimit && target>=targetLimit) {
 293             /* target is full */
 294             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 295         }
 296     } else {
 297 unicodeMode:
 298         /*
 299          * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
 300          * The base64 sequence ends with any character that is not in the base64 alphabet.
 301          * A terminating minus sign is consumed.
 302          *
 303          * In Unicode Mode, the sourceIndex has the index to the start of the current
 304          * base64 bytes, while nextSourceIndex is precisely parallel to source,
 305          * keeping the index to the following byte.
 306          * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
 307          */
 308         while(source<sourceLimit) {
 309             if(target<targetLimit) {
 310                 bytes[byteIndex++]=b=*source++;
 311                 ++nextSourceIndex;
                     /*
                      * fromBase64[] has 128 entries: this test keeps bytes above 127
                      * from being used as an index, and also rejects 126 and 127.
                      */
 312                 if(b>=126) {
 313                     /* illegal - test other illegal US-ASCII values by base64Value==-3 */
 314                     inDirectMode=TRUE;
 315                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 316                     break;
 317                 } else if((base64Value=fromBase64[b])>=0) {
 318                     /* collect base64 bytes into UChars */
                         /*
                          * Eight base64 digits carry 48 bits, i.e. three UChars.
                          * base64Counter is the index of the digit being read within
                          * that group: a UChar becomes complete on digits 2, 5 and 7,
                          * and digits 2 and 5 each carry their low bits (2 and 4 bits)
                          * over into the next UChar.
                          */
 319                     switch(base64Counter) {
 320                     case -1: /* -1 is immediately after the + */
 321                     case 0:
 322                         bits=base64Value;
 323                         base64Counter=1;
 324                         break;
 325                     case 1:
 326                     case 3:
 327                     case 4:
 328                     case 6:
 329                         bits=(uint16_t)((bits<<6)|base64Value);
 330                         ++base64Counter;
 331                         break;
 332                     case 2:
                             /* 12 collected bits plus the top 4 bits of this digit */
 333                         *target++=(UChar)((bits<<4)|(base64Value>>2));
 334                         if(offsets!=NULL) {
 335                             *offsets++=sourceIndex;
 336                             sourceIndex=nextSourceIndex-1;
 337                         }
 338                         bytes[0]=b; /* keep this byte in case an error occurs */
 339                         byteIndex=1;
 340                         bits=(uint16_t)(base64Value&3);
 341                         base64Counter=3;
 342                         break;
 343                     case 5:
                             /* 14 collected bits plus the top 2 bits of this digit */
 344                         *target++=(UChar)((bits<<2)|(base64Value>>4));
 345                         if(offsets!=NULL) {
 346                             *offsets++=sourceIndex;
 347                             sourceIndex=nextSourceIndex-1;
 348                         }
 349                         bytes[0]=b; /* keep this byte in case an error occurs */
 350                         byteIndex=1;
 351                         bits=(uint16_t)(base64Value&15);
 352                         base64Counter=6;
 353                         break;
 354                     case 7:
                             /* 10 collected bits plus all 6 bits of this digit; the group is complete */
 355                         *target++=(UChar)((bits<<6)|base64Value);
 356                         if(offsets!=NULL) {
 357                             *offsets++=sourceIndex;
 358                             sourceIndex=nextSourceIndex;
 359                         }
 360                         byteIndex=0;
 361                         bits=0;
 362                         base64Counter=0;
 363                         break;
 364                     default:
 365                         /* will never occur */
 366                         break;
 367                     }
 368                 } else if(base64Value==-2) {
 369                     /* minus sign terminates the base64 sequence */
 370                     inDirectMode=TRUE;
 371                     if(base64Counter==-1) {
 372                         /* +- i.e. a minus immediately following a plus */
 373                         *target++=PLUS;
 374                         if(offsets!=NULL) {
 375                             *offsets++=sourceIndex-1;
 376                         }
 377                     } else {
 378                         /* absorb the minus and leave the Unicode Mode */
 379                         if(bits!=0) {
 380                             /* bits are illegally left over, a UChar is incomplete */
 381                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 382                             break;
 383                         }
 384                     }
 385                     sourceIndex=nextSourceIndex;
 386                     goto directMode;
 387                 } else if(base64Value==-1) /* for any legal character except base64 and minus sign */ {
 388                     /* leave the Unicode Mode */
 389                     inDirectMode=TRUE;
 390                     if(base64Counter==-1) {
 391                         /* illegal: + immediately followed by something other than base64 or minus sign */
 392                         /* include the plus sign in the reported sequence */
 393                         --sourceIndex;
 394                         bytes[0]=PLUS;
 395                         bytes[1]=b;
 396                         byteIndex=2;
 397                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 398                         break;
 399                     } else if(bits==0) {
 400                         /* un-read the character in case it is a plus sign */
 401                         --source;
 402                         sourceIndex=nextSourceIndex-1;
 403                         goto directMode;
 404                     } else {
 405                         /* bits are illegally left over, a UChar is incomplete */
 406                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 407                         break;
 408                     }
 409                 } else /* base64Value==-3 for illegal characters */ {
 410                     /* illegal */
 411                     inDirectMode=TRUE;
 412                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 413                     break;
 414                 }
 415             } else {
 416                 /* target is full */
 417                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 418                 break;
 419             }
 420         }
 421     }
 422
 423     if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
 424         /*
 425          * if we are in Unicode mode, then the byteIndex might not be 0,
 426          * but that is ok if bits==0
 427          * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
 428          * (not true for IMAP-mailbox-name where we must end in direct mode)
 429          */
 430         byteIndex=0;
 431     }
 432
 433     /* set the converter state back into UConverter */
         /*
          * base64Counter can be -1; going through uint8_t confines it to
          * bits 23..16 so that it cannot spill into the inDirectMode bit.
          */
 434     cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
 435     cnv->toULength=byteIndex;
 436
 437     /* write back the updated pointers */
 438     pArgs->source=(const char *)source;
 439     pArgs->target=target;
 440     pArgs->offsets=offsets;
 441     return;
 442 }
 443
 444 static void
 445 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
 446                             UErrorCode *pErrorCode) {
         /*
          * Encodes UTF-16 code units from pArgs->source as UTF-7 bytes at
          * pArgs->target. If pArgs->offsets is not NULL, one source index is
          * recorded there for every byte written to the target.
          *
          * Code units up to 127 whose encodeDirectly[] entry is nonzero are written
          * as-is. In direct mode a plus sign is written as "+-". Every other code
          * unit is written as base64 digits after a '+'.
          *
          * The encoder state (version, inDirectMode, base64Counter, bits) lives in
          * cnv->fromUnicodeStatus. Output bytes that do not fit into the target
          * are stored in cnv->charErrorBuffer and *pErrorCode is set to
          * U_BUFFER_OVERFLOW_ERROR. When pArgs->flush is set and all source has
          * been consumed, pending bits are written out and the state returns to
          * direct mode.
          */
 447     UConverter *cnv;
 448     const UChar *source, *sourceLimit;
 449     uint8_t *target, *targetLimit;
 450     int32_t *offsets;
 451
 452     int32_t length, targetCapacity, sourceIndex;
 453     UChar c;
 454
 455     /* UTF-7 state */
 456     const UBool *encodeDirectly;
         /* bits: leftover bits of the previous code unit, already shifted to the top of a 6-bit digit */
 457     uint8_t bits;
         /* base64Counter: number of code units (0..2) already encoded in the current group of three */
 458     int8_t base64Counter;
 459     UBool inDirectMode;
 460
 461     /* set up the local pointers */
 462     cnv=pArgs->converter;
 463
 464     /* set up the local pointers */
 465     source=pArgs->source;
 466     sourceLimit=pArgs->sourceLimit;
 467     target=(uint8_t *)pArgs->target;
 468     targetLimit=(uint8_t *)pArgs->targetLimit;
 469     offsets=pArgs->offsets;
 470
 471     /* get the state machine state */
 472     {
 473         uint32_t status=cnv->fromUnicodeStatus;
             /* the version is kept in bits 31..28, so a status below 0x10000000 means version 0 */
 474         encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
 475         inDirectMode=(UBool)((status>>24)&1);
 476         base64Counter=(int8_t)(status>>16);
 477         bits=(uint8_t)status;
 478     }
 479
 480     /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
 481     sourceIndex=0;
 482
 483     if(inDirectMode) {
 484 directMode:
             /*
              * Bound the loop by the smaller of the remaining source length and
              * the remaining target capacity: inside the loop there is always
              * room for at least one output byte.
              */
 485         length=sourceLimit-source;
 486         targetCapacity=targetLimit-target;
 487         if(length>targetCapacity) {
 488             length=targetCapacity;
 489         }
 490         while(length>0) {
 491             c=*source++;
 492             /* currently always encode CR LF SP TAB directly */
 493             if(c<=127 && encodeDirectly[c]) {
 494                 /* encode directly */
 495                 *target++=(uint8_t)c;
 496                 if(offsets!=NULL) {
 497                     *offsets++=sourceIndex++;
 498                 }
 499             } else if(c==PLUS) {
 500                 /* output +- for + */
                     /* only the first of the two bytes is guaranteed to fit; the second is checked */
 501                 *target++=PLUS;
 502                 if(target<targetLimit) {
 503                     *target++=MINUS;
 504                     if(offsets!=NULL) {
 505                         *offsets++=sourceIndex;
 506                         *offsets++=sourceIndex++;
 507                     }
 508                     /* realign length and targetCapacity */
 509                     goto directMode;
 510                 } else {
 511                     if(offsets!=NULL) {
 512                         *offsets++=sourceIndex++;
 513                     }
 514                     cnv->charErrorBuffer[0]=MINUS;
 515                     cnv->charErrorBufferLength=1;
 516                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 517                     break;
 518                 }
 519             } else {
 520                 /* un-read this character and switch to Unicode Mode */
 521                 --source;
 522                 *target++=PLUS;
 523                 if(offsets!=NULL) {
 524                     *offsets++=sourceIndex;
 525                 }
 526                 inDirectMode=FALSE;
 527                 base64Counter=0;
 528                 goto unicodeMode;
 529             }
 530             --length;
 531         }
 532         if(source<sourceLimit && target>=targetLimit) {
 533             /* target is full */
 534             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 535         }
 536     } else {
 537 unicodeMode:
 538         while(source<sourceLimit) {
 539             if(target<targetLimit) {
 540                 c=*source++;
 541                 if(c<=127 && encodeDirectly[c]) {
 542                     /* encode directly */
 543                     inDirectMode=TRUE;
 544
 545                     /* trick: back out this character to make this easier */
 546                     --source;
 547
 548                     /* terminate the base64 sequence */
 549                     if(base64Counter!=0) {
 550                         /* write remaining bits for the previous character */
 551                         *target++=toBase64[bits];
 552                         if(offsets!=NULL) {
 553                             *offsets++=sourceIndex-1;
 554                         }
 555                     }
                         /*
                          * fromBase64[c]!=-1 here means that the upcoming direct character
                          * is itself a base64 digit or a minus sign, so the base64 run has
                          * to be closed with an explicit minus.
                          */
 556                     if(fromBase64[c]!=-1) {
 557                         /* need to terminate with a minus */
 558                         if(target<targetLimit) {
 559                             *target++=MINUS;
 560                             if(offsets!=NULL) {
 561                                 *offsets++=sourceIndex-1;
 562                             }
 563                         } else {
 564                             cnv->charErrorBuffer[0]=MINUS;
 565                             cnv->charErrorBufferLength=1;
 566                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 567                             break;
 568                         }
 569                     }
 570                     goto directMode;
 571                 } else {
 572                     /*
 573                      * base64 this character:
 574                      * Output 2 or 3 base64 bytes for the remaining bits of the previous character
 575                      * and the bits of this character, each implicitly in UTF-16BE.
 576                      *
 577                      * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
 578                      * character to the next. The actual 2 or 4 bits are shifted to the left edge
 579                      * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
 580                      */
                         /*
                          * In each case the first digit always fits (target<targetLimit was
                          * checked above); digits that do not fit go to charErrorBuffer.
                          * The overflow branches do not leave the loop themselves because
                          * the character has been consumed completely; a following
                          * iteration stops at the target-full check.
                          */
 581                     switch(base64Counter) {
 582                     case 0:
                             /* 16 new bits: two digits out, low 4 bits kept */
 583                         *target++=toBase64[c>>10];
 584                         if(target<targetLimit) {
 585                             *target++=toBase64[(c>>4)&0x3f];
 586                             if(offsets!=NULL) {
 587                                 *offsets++=sourceIndex;
 588                                 *offsets++=sourceIndex++;
 589                             }
 590                         } else {
 591                             if(offsets!=NULL) {
 592                                 *offsets++=sourceIndex++;
 593                             }
 594                             cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
 595                             cnv->charErrorBufferLength=1;
 596                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 597                         }
 598                         bits=(uint8_t)((c&15)<<2);
 599                         base64Counter=1;
 600                         break;
 601                     case 1:
                             /* 4 kept bits + 16 new bits: three digits out, low 2 bits kept */
 602                         *target++=toBase64[bits|(c>>14)];
 603                         if(target<targetLimit) {
 604                             *target++=toBase64[(c>>8)&0x3f];
 605                             if(target<targetLimit) {
 606                                 *target++=toBase64[(c>>2)&0x3f];
 607                                 if(offsets!=NULL) {
 608                                     *offsets++=sourceIndex;
 609                                     *offsets++=sourceIndex;
 610                                     *offsets++=sourceIndex++;
 611                                 }
 612                             } else {
 613                                 if(offsets!=NULL) {
 614                                     *offsets++=sourceIndex;
 615                                     *offsets++=sourceIndex++;
 616                                 }
 617                                 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
 618                                 cnv->charErrorBufferLength=1;
 619                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 620                             }
 621                         } else {
 622                             if(offsets!=NULL) {
 623                                 *offsets++=sourceIndex++;
 624                             }
 625                             cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
 626                             cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
 627                             cnv->charErrorBufferLength=2;
 628                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 629                         }
 630                         bits=(uint8_t)((c&3)<<4);
 631                         base64Counter=2;
 632                         break;
 633                     case 2:
                             /* 2 kept bits + 16 new bits: three digits out, nothing kept */
 634                         *target++=toBase64[bits|(c>>12)];
 635                         if(target<targetLimit) {
 636                             *target++=toBase64[(c>>6)&0x3f];
 637                             if(target<targetLimit) {
 638                                 *target++=toBase64[c&0x3f];
 639                                 if(offsets!=NULL) {
 640                                     *offsets++=sourceIndex;
 641                                     *offsets++=sourceIndex;
 642                                     *offsets++=sourceIndex++;
 643                                 }
 644                             } else {
 645                                 if(offsets!=NULL) {
 646                                     *offsets++=sourceIndex;
 647                                     *offsets++=sourceIndex++;
 648                                 }
 649                                 cnv->charErrorBuffer[0]=toBase64[c&0x3f];
 650                                 cnv->charErrorBufferLength=1;
 651                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 652                             }
 653                         } else {
 654                             if(offsets!=NULL) {
 655                                 *offsets++=sourceIndex++;
 656                             }
 657                             cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
 658                             cnv->charErrorBuffer[1]=toBase64[c&0x3f];
 659                             cnv->charErrorBufferLength=2;
 660                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 661                         }
 662                         bits=0;
 663                         base64Counter=0;
 664                         break;
 665                     default:
 666                         /* will never occur */
 667                         break;
 668                     }
 669                 }
 670             } else {
 671                 /* target is full */
 672                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 673                 break;
 674             }
 675         }
 676     }
 677
 678     if(pArgs->flush && source>=sourceLimit) {
 679         /* flush remaining bits to the target */
 680         if(!inDirectMode && base64Counter!=0) {
 681             if(target<targetLimit) {
 682                 *target++=toBase64[bits];
 683                 if(offsets!=NULL) {
 684                     *offsets++=sourceIndex-1;
 685                 }
 686             } else {
                     /* appended after any digits an overflow above already put into the buffer */
 687                 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
 688                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 689             }
 690         }
 691         /* reset the state for the next conversion */
 692         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
 693     } else {
 694         /* set the converter state back into UConverter */
 695         cnv->fromUnicodeStatus=
 696             (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
 697             ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
 698     }
 699
 700     /* write back the updated pointers */
 701     pArgs->source=source;
 702     pArgs->target=(char *)target;
 703     pArgs->offsets=offsets;
 704     return;
 705 }
 706
 707 static const char *
 708 _UTF7GetName(const UConverter *cnv) {
 709     switch(cnv->fromUnicodeStatus>>28) {
 710     case 1:
 711         return "UTF-7,version=1";
 712     default:
 713         return "UTF-7";
 714     }
 715 }
 716
 717 static const UConverterImpl _UTF7Impl={
 718     UCNV_UTF7,
 719
 720     NULL,
 721     NULL,
 722
 723     _UTF7Open,
 724     NULL,
 725     _UTF7Reset,
 726
 727     _UTF7ToUnicodeWithOffsets,
 728     _UTF7ToUnicodeWithOffsets,
 729     _UTF7FromUnicodeWithOffsets,
 730     _UTF7FromUnicodeWithOffsets,
 731     NULL,
 732
 733     NULL,
 734     _UTF7GetName,
 735     NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
 736     NULL,
 737     ucnv_getCompleteUnicodeSet
 738 };
 739
 740 static const UConverterStaticData _UTF7StaticData={
 741     sizeof(UConverterStaticData),
 742     "UTF-7",
 743     0, /* TODO CCSID for UTF-7 */
 744     UCNV_IBM, UCNV_UTF7,
 745     1, 4,
 746     { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
 747     FALSE, FALSE,
 748     0,
 749     0,
 750     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 751 };
 752
 753 const UConverterSharedData _UTF7Data={
 754     sizeof(UConverterSharedData), ~((uint32_t)0),
 755     NULL, NULL, &_UTF7StaticData, FALSE, &_UTF7Impl,
 756     0
 757 };
 758
 759 /* IMAP mailbox name encoding ----------------------------------------------- */
 760
 761 /*
 762  * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
 763  * http://www.ietf.org/rfc/rfc2060.txt
 764  *
 765  * 5.1.3.  Mailbox International Naming Convention
 766  *
 767  * By convention, international mailbox names are specified using a
 768  * modified version of the UTF-7 encoding described in [UTF-7].  The
 769  * purpose of these modifications is to correct the following problems
 770  * with UTF-7:
 771  *
 772  *    1) UTF-7 uses the "+" character for shifting; this conflicts with
 773  *       the common use of "+" in mailbox names, in particular USENET
 774  *       newsgroup names.
 775  *
 776  *    2) UTF-7's encoding is BASE64 which uses the "/" character; this
 777  *       conflicts with the use of "/" as a popular hierarchy delimiter.
 778  *
 779  *    3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
 780  *       the use of "\" as a popular hierarchy delimiter.
 781  *
 782  *    4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
 783  *       the use of "~" in some servers as a home directory indicator.
 784  *
 *    5) UTF-7 permits multiple alternate forms to represent the same
 *       string; in particular, printable US-ASCII characters can be
 *       represented in encoded form.
 788  *
 789  * In modified UTF-7, printable US-ASCII characters except for "&"
 790  * represent themselves; that is, characters with octet values 0x20-0x25
 791  * and 0x27-0x7e.  The character "&" (0x26) is represented by the two-
 792  * octet sequence "&-".
 793  *
 794  * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
 795  * Unicode 16-bit octets) are represented in modified BASE64, with a
 796  * further modification from [UTF-7] that "," is used instead of "/".
 797  * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
 798  * character which can represent itself.
 799  *
 800  * "&" is used to shift to modified BASE64 and "-" to shift back to US-
 801  * ASCII.  All names start in US-ASCII, and MUST end in US-ASCII (that
 802  * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
 803  * ").
 804  *
 805  * For example, here is a mailbox name which mixes English, Japanese,
 806  * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
 807  */
 808
/*
 * Tests for US-ASCII characters belonging to character classes
 * defined in UTF-7.
 *
 * Note: the sets and the list of illegal byte values below describe plain
 * UTF-7 (RFC 2152) and are kept here for reference only. The IMAP mailbox
 * name variant uses simpler rules, see isLegalIMAP() and inSetDIMAP():
 * only the byte values 0x20..0x7e are legal, and all of them except "&"
 * are encoded directly. In particular, "\" and "~" are legal and direct
 * in IMAP mailbox names, while CR, LF and TAB are not.
 *
 * Set D (directly encoded characters) consists of the following
 * characters: the upper and lower case letters A through Z
 * and a through z, the 10 digits 0-9, and the following nine special
 * characters (note that "+" and "=" are omitted):
 *     '(),-./:?
 *
 * Set O (optional direct characters) consists of the following
 * characters (note that "\" and "~" are omitted):
 *     !"#$%&*;<=>@[]^_`{|}
 *
 * According to the rules in RFC 2152, the byte values for the following
 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
 * - all C0 control codes except for CR LF TAB
 * - BACKSLASH
 * - TILDE
 * - DEL
 * - all codes beyond US-ASCII, i.e. all >127
 */
 831
 832 /* uses '&' not '+' to start a base64 sequence */
 833 #define AMPERSAND 0x26
 834 #define COMMA 0x2c
 835 #define SLASH 0x2f
 836
 837 /* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
 838 #define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)
 839
 840 /* direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26 */
 841 #define inSetDIMAP(c) (isLegalIMAP(c) && c!=AMPERSAND)
 842
 843 #define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
 844 #define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
 845
 846 /*
 847  * converter status values:
 848  *
 849  * toUnicodeStatus:
 850  *     24 inDirectMode (boolean)
 851  * 23..16 base64Counter (-1..7)
 852  * 15..0  bits (up to 14 bits incoming base64)
 853  *
 854  * fromUnicodeStatus:
 855  *     24 inDirectMode (boolean)
 856  * 23..16 base64Counter (0..2)
 857  *  7..0  bits (6 bits outgoing base64)
 858  *
 859  * ignore bits 31..25
 860  */
 861
/*
 * Converts IMAP mailbox name bytes (modified UTF-7) to UTF-16 code units.
 *
 * Reads bytes from pArgs->source up to pArgs->sourceLimit and writes UChars
 * to pArgs->target up to pArgs->targetLimit. If pArgs->offsets is not NULL,
 * one source index is written per output UChar.
 * The decoder state (inDirectMode, base64Counter, bits) is loaded from and
 * stored back into cnv->toUnicodeStatus; the bytes of the sequence in
 * progress are kept in cnv->toUBytes/cnv->toULength.
 *
 * On return pArgs->source, pArgs->target and pArgs->offsets are advanced.
 * *pErrorCode is set to
 * - U_ILLEGAL_CHAR_FOUND for an illegal byte, an illegal base64 sequence,
 *   or a base64-encoded character that should have been written directly,
 * - U_BUFFER_OVERFLOW_ERROR when the target is full and input remains,
 * - U_TRUNCATED_CHAR_FOUND when flushing at the end of the input while
 *   still in Unicode (base64) mode.
 */
static void
_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                          UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit;
    UChar *target;
    const UChar *targetLimit;
    int32_t *offsets;

    /* input bytes of the sequence in progress (cnv->toUBytes) and their count */
    uint8_t *bytes;
    uint8_t byteIndex;

    int32_t length, targetCapacity;

    /* UTF-7 state */
    uint16_t bits;
    int8_t base64Counter;
    UBool inDirectMode;

    int8_t base64Value;

    int32_t sourceIndex, nextSourceIndex;

    UChar c;
    uint8_t b;

    /* set up the local pointers */
    cnv=pArgs->converter;

    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    target=pArgs->target;
    targetLimit=pArgs->targetLimit;
    offsets=pArgs->offsets;
    /* get the state machine state */
    {
        /* layout: bit 24 inDirectMode, bits 23..16 base64Counter, bits 15..0 bits */
        uint32_t status=cnv->toUnicodeStatus;
        inDirectMode=(UBool)((status>>24)&1);
        base64Counter=(int8_t)(status>>16);
        bits=(uint16_t)status;
    }
    bytes=cnv->toUBytes;
    byteIndex=cnv->toULength;

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex=byteIndex==0 ? 0 : -1;
    nextSourceIndex=0;

    if(inDirectMode) {
directMode:
        /*
         * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
         * with their US-ASCII byte values.
         * An ampersand starts Unicode (or "escape") Mode.
         *
         * In Direct Mode, only the sourceIndex is used.
         */
        byteIndex=0;
        /*
         * Each input byte produces at most one UChar here, so limiting the
         * loop count to the smaller of input length and target capacity
         * makes a per-byte target check unnecessary.
         */
        length=sourceLimit-source;
        targetCapacity=targetLimit-target;
        if(length>targetCapacity) {
            length=targetCapacity;
        }
        while(length>0) {
            b=*source++;
            if(!isLegalIMAP(b)) {
                /* illegal */
                /* store the offending byte as the error sequence */
                bytes[0]=b;
                byteIndex=1;
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                break;
            } else if(b!=AMPERSAND) {
                /* write directly encoded character */
                *target++=b;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex++;
                }
            } else /* AMPERSAND */ {
                /* switch to Unicode mode */
                nextSourceIndex=++sourceIndex;
                inDirectMode=FALSE;
                byteIndex=0;
                bits=0;
                /* -1 marks "immediately after the ampersand" */
                base64Counter=-1;
                goto unicodeMode;
            }
            --length;
        }
        /* the loop count was cut short by the target capacity */
        if(source<sourceLimit && target>=targetLimit) {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        }
    } else {
unicodeMode:
        /*
         * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
         * The base64 sequence ends with any character that is not in the base64 alphabet.
         * A terminating minus sign is consumed.
         * US-ASCII must not be base64-ed.
         *
         * In Unicode Mode, the sourceIndex has the index to the start of the current
         * base64 bytes, while nextSourceIndex is precisely parallel to source,
         * keeping the index to the following byte.
         * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
         */
        while(source<sourceLimit) {
            if(target<targetLimit) {
                /* remember every consumed byte so that it can be reported on error */
                bytes[byteIndex++]=b=*source++;
                ++nextSourceIndex;
                if(b>0x7e) {
                    /* illegal - test other illegal US-ASCII values by base64Value==-3 */
                    inDirectMode=TRUE;
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                    break;
                } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
                    /* collect base64 bytes into UChars */
                    /*
                     * base64Counter counts the base64 bytes within a group of
                     * 8 bytes (48 bits = 3 UChars); a UChar is completed at
                     * the counter values 2, 5 and 7.
                     */
                    switch(base64Counter) {
                    case -1: /* -1 is immediately after the & */
                    case 0:
                        bits=base64Value;
                        base64Counter=1;
                        break;
                    case 1:
                    case 3:
                    case 4:
                    case 6:
                        /* not enough bits for a UChar yet: append all 6 bits */
                        bits=(uint16_t)((bits<<6)|base64Value);
                        ++base64Counter;
                        break;
                    case 2:
                        /* 12 collected bits + the upper 4 bits of this byte */
                        c=(UChar)((bits<<4)|(base64Value>>2));
                        /* a character that could be written directly must not be base64-encoded */
                        if(isLegalIMAP(c)) {
                            /* illegal */
                            inDirectMode=TRUE;
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                            goto endloop;
                        }
                        *target++=c;
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex;
                            /* the next UChar starts within the current base64 byte */
                            sourceIndex=nextSourceIndex-1;
                        }
                        bytes[0]=b; /* keep this byte in case an error occurs */
                        byteIndex=1;
                        /* carry over the lower 2 bits */
                        bits=(uint16_t)(base64Value&3);
                        base64Counter=3;
                        break;
                    case 5:
                        /* 14 collected bits + the upper 2 bits of this byte */
                        c=(UChar)((bits<<2)|(base64Value>>4));
                        /* a character that could be written directly must not be base64-encoded */
                        if(isLegalIMAP(c)) {
                            /* illegal */
                            inDirectMode=TRUE;
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                            goto endloop;
                        }
                        *target++=c;
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex;
                            /* the next UChar starts within the current base64 byte */
                            sourceIndex=nextSourceIndex-1;
                        }
                        bytes[0]=b; /* keep this byte in case an error occurs */
                        byteIndex=1;
                        /* carry over the lower 4 bits */
                        bits=(uint16_t)(base64Value&15);
                        base64Counter=6;
                        break;
                    case 7:
                        /* 10 collected bits + all 6 bits of this byte; the group of 8 is complete */
                        c=(UChar)((bits<<6)|base64Value);
                        /* a character that could be written directly must not be base64-encoded */
                        if(isLegalIMAP(c)) {
                            /* illegal */
                            inDirectMode=TRUE;
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                            goto endloop;
                        }
                        *target++=c;
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex;
                            /* the next UChar starts with the next base64 byte */
                            sourceIndex=nextSourceIndex;
                        }
                        byteIndex=0;
                        bits=0;
                        base64Counter=0;
                        break;
                    default:
                        /* will never occur */
                        break;
                    }
                } else if(base64Value==-2) {
                    /* minus sign terminates the base64 sequence */
                    inDirectMode=TRUE;
                    if(base64Counter==-1) {
                        /* &- i.e. a minus immediately following an ampersand */
                        *target++=AMPERSAND;
                        if(offsets!=NULL) {
                            /* the offset of the ampersand, one byte before the minus */
                            *offsets++=sourceIndex-1;
                        }
                    } else {
                        /* absorb the minus and leave the Unicode Mode */
                        if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
                            /* bits are illegally left over, a UChar is incomplete */
                            /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                            break;
                        }
                    }
                    sourceIndex=nextSourceIndex;
                    goto directMode;
                } else {
                    if(base64Counter==-1) {
                        /* illegal: & immediately followed by something other than base64 or minus sign */
                        /* include the ampersand in the reported sequence */
                        --sourceIndex;
                        bytes[0]=AMPERSAND;
                        bytes[1]=b;
                        byteIndex=2;
                    }
                    /* base64Value==-1 for characters that are illegal only in Unicode mode */
                    /* base64Value==-3 for illegal characters */
                    /* illegal */
                    inDirectMode=TRUE;
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                    break;
                }
            } else {
                /* target is full */
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }
        }
    }
endloop:

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but here we must check if we are in Unicode
     * mode and byteIndex==0 because we must end in direct mode
     *
     * conditions:
     *   successful
     *   in Unicode mode and byteIndex==0
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*pErrorCode) &&
        !inDirectMode && byteIndex==0 &&
        pArgs->flush && source>=sourceLimit
    ) {
        if(base64Counter==-1) {
            /* & at the very end of the input */
            /* make the ampersand the reported sequence */
            bytes[0]=AMPERSAND;
            byteIndex=1;
        }
        /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */

        inDirectMode=TRUE; /* avoid looping */
        *pErrorCode=U_TRUNCATED_CHAR_FOUND;
    }

    /* set the converter state back into UConverter */
    /* base64Counter goes through uint8_t so that -1 does not set the bits above bit 23 */
    cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
    cnv->toULength=byteIndex;

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    pArgs->offsets=offsets;
    return;
}
1129
/*
 * Converts UTF-16 code units to IMAP mailbox name bytes (modified UTF-7).
 *
 * Reads UChars from pArgs->source up to pArgs->sourceLimit and writes bytes
 * to pArgs->target up to pArgs->targetLimit. If pArgs->offsets is not NULL,
 * one source index is written per output byte that goes to the target.
 * The encoder state (inDirectMode, base64Counter, bits) is loaded from and
 * stored back into cnv->fromUnicodeStatus.
 *
 * Characters 0x20..0x7e are written directly, except that '&' is written
 * as "&-". All other code units are written as modified base64 between
 * '&' and '-'.
 *
 * When the target fills up in the middle of a multi-byte output, the
 * remaining bytes are stored in cnv->charErrorBuffer and *pErrorCode is set
 * to U_BUFFER_OVERFLOW_ERROR.
 * With pArgs->flush set and all input consumed, an open base64 sequence is
 * terminated and the state is reset to direct mode.
 *
 * MINUS and the toBase64 table used by TO_BASE64_IMAP() are defined
 * elsewhere in this file.
 */
static void
_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                            UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source, *sourceLimit;
    uint8_t *target, *targetLimit;
    int32_t *offsets;

    int32_t length, targetCapacity, sourceIndex;
    UChar c;
    uint8_t b;

    /* UTF-7 state */
    uint8_t bits;
    int8_t base64Counter;
    UBool inDirectMode;

    /* set up the local pointers */
    cnv=pArgs->converter;

    /* set up the local pointers */
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target=(uint8_t *)pArgs->target;
    targetLimit=(uint8_t *)pArgs->targetLimit;
    offsets=pArgs->offsets;

    /* get the state machine state */
    {
        /* layout: bit 24 inDirectMode, bits 23..16 base64Counter, bits 7..0 bits */
        uint32_t status=cnv->fromUnicodeStatus;
        inDirectMode=(UBool)((status>>24)&1);
        base64Counter=(int8_t)(status>>16);
        bits=(uint8_t)status;
    }

    /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
    sourceIndex=0;

    if(inDirectMode) {
directMode:
        /*
         * Every input UChar produces at least one byte, so the loop count is
         * limited to the smaller of input length and target capacity; this
         * guarantees room for the first output byte of each character.
         */
        length=sourceLimit-source;
        targetCapacity=targetLimit-target;
        if(length>targetCapacity) {
            length=targetCapacity;
        }
        while(length>0) {
            c=*source++;
            /* encode 0x20..0x7e except '&' directly */
            if(inSetDIMAP(c)) {
                /* encode directly */
                *target++=(uint8_t)c;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex++;
                }
            } else if(c==AMPERSAND) {
                /* output &- for & */
                *target++=AMPERSAND;
                if(target<targetLimit) {
                    *target++=MINUS;
                    if(offsets!=NULL) {
                        *offsets++=sourceIndex;
                        *offsets++=sourceIndex++;
                    }
                    /* realign length and targetCapacity */
                    /* (two bytes were written for one UChar, so the precomputed count is stale) */
                    goto directMode;
                } else {
                    if(offsets!=NULL) {
                        *offsets++=sourceIndex++;
                    }
                    /* no room for the minus: hand it over via the error buffer */
                    cnv->charErrorBuffer[0]=MINUS;
                    cnv->charErrorBufferLength=1;
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
            } else {
                /* un-read this character and switch to Unicode Mode */
                --source;
                /* the ampersand opens the base64 sequence */
                *target++=AMPERSAND;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }
                inDirectMode=FALSE;
                base64Counter=0;
                goto unicodeMode;
            }
            --length;
        }
        /* the loop count was cut short by the target capacity */
        if(source<sourceLimit && target>=targetLimit) {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        }
    } else {
unicodeMode:
        while(source<sourceLimit) {
            if(target<targetLimit) {
                c=*source++;
                if(isLegalIMAP(c)) {
                    /* encode directly */
                    inDirectMode=TRUE;

                    /* trick: back out this character to make this easier */
                    /* (the direct mode loop will read and encode it, including '&') */
                    --source;

                    /* terminate the base64 sequence */
                    if(base64Counter!=0) {
                        /* write remaining bits for the previous character */
                        *target++=TO_BASE64_IMAP(bits);
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex-1;
                        }
                    }
                    /* need to terminate with a minus */
                    if(target<targetLimit) {
                        *target++=MINUS;
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex-1;
                        }
                    } else {
                        cnv->charErrorBuffer[0]=MINUS;
                        cnv->charErrorBufferLength=1;
                        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                        break;
                    }
                    goto directMode;
                } else {
                    /*
                     * base64 this character:
                     * Output 2 or 3 base64 bytes for the remaining bits of the previous character
                     * and the bits of this character, each implicitly in UTF-16BE.
                     *
                     * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
                     * character to the next. The actual 2 or 4 bits are shifted to the left edge
                     * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
                     */
                    /*
                     * In each case, bytes that do not fit into the target
                     * any more are stored in cnv->charErrorBuffer; the state
                     * is updated as if they had been written.
                     */
                    switch(base64Counter) {
                    case 0:
                        /* no bits pending: write bits 15..10 and 9..4, keep bits 3..0 */
                        b=(uint8_t)(c>>10);
                        *target++=TO_BASE64_IMAP(b);
                        if(target<targetLimit) {
                            b=(uint8_t)((c>>4)&0x3f);
                            *target++=TO_BASE64_IMAP(b);
                            if(offsets!=NULL) {
                                *offsets++=sourceIndex;
                                *offsets++=sourceIndex++;
                            }
                        } else {
                            if(offsets!=NULL) {
                                *offsets++=sourceIndex++;
                            }
                            b=(uint8_t)((c>>4)&0x3f);
                            cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
                            cnv->charErrorBufferLength=1;
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                        }
                        bits=(uint8_t)((c&15)<<2);
                        base64Counter=1;
                        break;
                    case 1:
                        /* 4 bits pending: write pending+bits 15..14, then 13..8 and 7..2, keep bits 1..0 */
                        b=(uint8_t)(bits|(c>>14));
                        *target++=TO_BASE64_IMAP(b);
                        if(target<targetLimit) {
                            b=(uint8_t)((c>>8)&0x3f);
                            *target++=TO_BASE64_IMAP(b);
                            if(target<targetLimit) {
                                b=(uint8_t)((c>>2)&0x3f);
                                *target++=TO_BASE64_IMAP(b);
                                if(offsets!=NULL) {
                                    *offsets++=sourceIndex;
                                    *offsets++=sourceIndex;
                                    *offsets++=sourceIndex++;
                                }
                            } else {
                                if(offsets!=NULL) {
                                    *offsets++=sourceIndex;
                                    *offsets++=sourceIndex++;
                                }
                                b=(uint8_t)((c>>2)&0x3f);
                                cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
                                cnv->charErrorBufferLength=1;
                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                            }
                        } else {
                            if(offsets!=NULL) {
                                *offsets++=sourceIndex++;
                            }
                            b=(uint8_t)((c>>8)&0x3f);
                            cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
                            b=(uint8_t)((c>>2)&0x3f);
                            cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
                            cnv->charErrorBufferLength=2;
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                        }
                        bits=(uint8_t)((c&3)<<4);
                        base64Counter=2;
                        break;
                    case 2:
                        /* 2 bits pending: write pending+bits 15..12, then 11..6 and 5..0, nothing is left over */
                        b=(uint8_t)(bits|(c>>12));
                        *target++=TO_BASE64_IMAP(b);
                        if(target<targetLimit) {
                            b=(uint8_t)((c>>6)&0x3f);
                            *target++=TO_BASE64_IMAP(b);
                            if(target<targetLimit) {
                                b=(uint8_t)(c&0x3f);
                                *target++=TO_BASE64_IMAP(b);
                                if(offsets!=NULL) {
                                    *offsets++=sourceIndex;
                                    *offsets++=sourceIndex;
                                    *offsets++=sourceIndex++;
                                }
                            } else {
                                if(offsets!=NULL) {
                                    *offsets++=sourceIndex;
                                    *offsets++=sourceIndex++;
                                }
                                b=(uint8_t)(c&0x3f);
                                cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
                                cnv->charErrorBufferLength=1;
                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                            }
                        } else {
                            if(offsets!=NULL) {
                                *offsets++=sourceIndex++;
                            }
                            b=(uint8_t)((c>>6)&0x3f);
                            cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
                            b=(uint8_t)(c&0x3f);
                            cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
                            cnv->charErrorBufferLength=2;
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                        }
                        bits=0;
                        base64Counter=0;
                        break;
                    default:
                        /* will never occur */
                        break;
                    }
                }
            } else {
                /* target is full */
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }
        }
    }

    if(pArgs->flush && source>=sourceLimit) {
        /* flush remaining bits to the target */
        if(!inDirectMode) {
            if(base64Counter!=0) {
                if(target<targetLimit) {
                    *target++=TO_BASE64_IMAP(bits);
                    if(offsets!=NULL) {
                        *offsets++=sourceIndex-1;
                    }
                } else {
                    /* append after any bytes that the loop above already put into the error buffer */
                    cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                }
            }
            /* need to terminate with a minus */
            if(target<targetLimit) {
                *target++=MINUS;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex-1;
                }
            } else {
                cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            }
        }
        /* reset the state for the next conversion */
        cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
    } else {
        /* set the converter state back into UConverter */
        cnv->fromUnicodeStatus=
            (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
            ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
    }

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target=(char *)target;
    pArgs->offsets=offsets;
    return;
}
1416
1417 static const UConverterImpl _IMAPImpl={
1418     UCNV_IMAP_MAILBOX,
1419
1420     NULL,
1421     NULL,
1422
1423     _UTF7Open,
1424     NULL,
1425     _UTF7Reset,
1426
1427     _IMAPToUnicodeWithOffsets,
1428     _IMAPToUnicodeWithOffsets,
1429     _IMAPFromUnicodeWithOffsets,
1430     _IMAPFromUnicodeWithOffsets,
1431     NULL,
1432
1433     NULL,
1434     NULL,
1435     NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1436     NULL,
1437     ucnv_getCompleteUnicodeSet
1438 };
1439
1440 static const UConverterStaticData _IMAPStaticData={
1441     sizeof(UConverterStaticData),
1442     "IMAP-mailbox-name",
1443     0, /* TODO CCSID for IMAP-mailbox-name */
1444     UCNV_IBM, UCNV_IMAP_MAILBOX,
1445     1, 4,
1446     { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1447     FALSE, FALSE,
1448     0,
1449     0,
1450     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1451 };
1452
1453 const UConverterSharedData _IMAPData={
1454     sizeof(UConverterSharedData), ~((uint32_t)0),
1455     NULL, NULL, &_IMAPStaticData, FALSE, &_IMAPImpl,
1456     0
1457 };
1458
1459 #endif