]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnv_u7.c
ICU-57131.0.1.tar.gz
[apple/icu.git] / icuSources / common / ucnv_u7.c
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
2ca993e8 3* Copyright (C) 2002-2016, International Business Machines
b75a7d8f
A
4* Corporation and others. All Rights Reserved.
5**********************************************************************
6* file name: ucnv_u7.c
7* encoding: US-ASCII
8* tab size: 8 (not used)
9* indentation:4
10*
11* created on: 2002jul01
12* created by: Markus W. Scherer
13*
14* UTF-7 converter implementation. Used to be in ucnv_utf.c.
15*/
16
17#include "unicode/utypes.h"
374ca955 18
b331163b 19#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
374ca955 20
2ca993e8 21#include "cmemory.h"
b75a7d8f 22#include "unicode/ucnv.h"
b75a7d8f
A
23#include "ucnv_bld.h"
24#include "ucnv_cnv.h"
4388f060 25#include "uassert.h"
b75a7d8f
A
26
27/* UTF-7 -------------------------------------------------------------------- */
28
b75a7d8f
A
29/*
30 * UTF-7 is a stateful encoding of Unicode.
31 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
32 * It was intended for use in Internet email systems, using in its bytewise
33 * encoding only a subset of 7-bit US-ASCII.
34 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
35 * occasionally used.
36 *
37 * For converting Unicode to UTF-7, the RFC allows to encode some US-ASCII
38 * characters directly or in base64. Especially, the characters in set O
39 * as defined in the RFC (see below) may be encoded directly but are not
40 * allowed in, e.g., email headers.
41 * By default, the ICU UTF-7 converter encodes set O directly.
42 * By choosing the option "version=1", set O will be escaped instead.
43 * For example:
44 * utf7Converter=ucnv_open("UTF-7,version=1");
45 *
46 * For details about email headers see RFC 2047.
47 */
48
49/*
50 * Tests for US-ASCII characters belonging to character classes
51 * defined in UTF-7.
52 *
53 * Set D (directly encoded characters) consists of the following
54 * characters: the upper and lower case letters A through Z
55 * and a through z, the 10 digits 0-9, and the following nine special
56 * characters (note that "+" and "=" are omitted):
57 * '(),-./:?
58 *
59 * Set O (optional direct characters) consists of the following
60 * characters (note that "\" and "~" are omitted):
61 * !"#$%&*;<=>@[]^_`{|}
62 *
63 * According to the rules in RFC 2152, the byte values for the following
64 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
65 * - all C0 control codes except for CR LF TAB
66 * - BACKSLASH
67 * - TILDE
68 * - DEL
69 * - all codes beyond US-ASCII, i.e. all >127
70 */
/*
 * Set D: letters, digits and the nine specials '(),-./:?
 * All constants are US-ASCII code points written as numbers (not character
 * literals) so that the tests do not depend on the compiler's charset.
 * Each range test uses the unsigned-wraparound trick (uint8_t)(c-first)<count.
 */
#define inSetD(c) \
    ((uint8_t)((c)-0x61)<26 || (uint8_t)((c)-0x41)<26 || /* a-z, A-Z */ \
     (uint8_t)((c)-0x30)<10 ||                           /* 0-9 */ \
     (uint8_t)((c)-0x27)<3 ||                            /* '() */ \
     (uint8_t)((c)-0x2c)<4 ||                            /* ,-./ */ \
     (c)==0x3a || (c)==0x3f                              /* :? */ \
    )

/* Set O: the optional direct characters !"#$%&*;<=>@[]^_`{|} */
#define inSetO(c) \
    ((uint8_t)((c)-0x21)<6 ||                   /* !"#$%& */ \
     (uint8_t)((c)-0x3b)<4 ||                   /* ;<=> */ \
     (uint8_t)((c)-0x5d)<4 ||                   /* ]^_` */ \
     (uint8_t)((c)-0x7b)<3 ||                   /* {|} */ \
     (c)==0x2a || (c)==0x40 || (c)==0x5b        /* *@[ */ \
    )

/* CR, LF, TAB -- and the same set with SPACE added */
#define isCRLFTAB(c) ((c)==0x0d || (c)==0x0a || (c)==0x09)
#define isCRLFSPTAB(c) ((c)==0x20 || (c)==0x0d || (c)==0x0a || (c)==0x09)

/* US-ASCII code points of the characters with a special role in UTF-7 */
#define PLUS 0x2b
#define MINUS 0x2d
#define BACKSLASH 0x5c
#define TILDE 0x7e

/*
 * Legal UTF-7 byte values: 0x20..0x7d (i.e. space up to, but not including,
 * tilde) except for backslash, plus CR LF TAB.
 */
#define isLegalUTF7(c) (((uint8_t)((c)-0x20)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
97
98/* encode directly sets D and O and CR LF SP TAB */
99static const UBool encodeDirectlyMaximum[128]={
100 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
101 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
102 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103
104 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
105 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
106
107 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
108 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
109
110 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
111 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
112};
113
114/* encode directly set D and CR LF SP TAB but not set O */
115static const UBool encodeDirectlyRestricted[128]={
116 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
117 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
119
120 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
121 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
122
123 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
124 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
125
126 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
127 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
128};
129
/*
 * Maps a 6-bit value (0..63) to the US-ASCII code of its base64 digit:
 * 0..25 -> A-Z, 26..51 -> a-z, 52..61 -> 0-9, 62 -> '+', 63 -> '/'.
 */
static const uint8_t
toBase64[64]={
    /* A-Z (values 0..25) */
    65, 66, 67, 68, 69, 70, 71, 72,
    73, 74, 75, 76, 77, 78, 79, 80,
    81, 82, 83, 84, 85, 86, 87, 88,
    89, 90,
    /* a-z (values 26..51) */
    97, 98, 99, 100, 101, 102, 103, 104,
    105, 106, 107, 108, 109, 110, 111, 112,
    113, 114, 115, 116, 117, 118, 119, 120,
    121, 122,
    /* 0-9 (values 52..61) */
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
    /* + and / (values 62, 63) */
    43, 47
};
143
/*
 * Classifies a US-ASCII byte for the base64 decoder:
 *   0..63  the 6-bit value of a base64 digit
 *   -1     a legal character that is not a base64 digit
 *   -2     the minus sign
 *   -3     a byte value that is illegal in UTF-7
 */
static const int8_t
fromBase64[128]={
    /* 0x00..0x1f: C0 controls; TAB, LF and CR are legal (-1), the rest illegal (-3) */
    -3, -3, -3, -3, -3, -3, -3, -3,   -3, -1, -1, -3, -3, -1, -3, -3,
    -3, -3, -3, -3, -3, -3, -3, -3,   -3, -3, -3, -3, -3, -3, -3, -3,

    /* 0x20..0x2f: punctuation; '+' is 62, '-' is the special -2, '/' is 63 */
    -1, -1, -1, -1, -1, -1, -1, -1,   -1, -1, -1, 62, -1, -2, -1, 63,
    /* 0x30..0x3f: digits 0-9 are 52..61 */
    52, 53, 54, 55, 56, 57, 58, 59,   60, 61, -1, -1, -1, -1, -1, -1,

    /* 0x40..0x5f: A-Z are 0..25; backslash (0x5c) is illegal */
    -1,  0,  1,  2,  3,  4,  5,  6,    7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22,   23, 24, 25, -1, -3, -1, -1, -1,

    /* 0x60..0x7f: a-z are 26..51; tilde (0x7e) and DEL (0x7f) are illegal */
    -1, 26, 27, 28, 29, 30, 31, 32,   33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 45, 46, 47, 48,   49, 50, 51, -1, -1, -1, -3, -3
};
163
164/*
165 * converter status values:
166 *
167 * toUnicodeStatus:
168 * 24 inDirectMode (boolean)
169 * 23..16 base64Counter (-1..7)
170 * 15..0 bits (up to 14 bits incoming base64)
171 *
172 * fromUnicodeStatus:
173 * 31..28 version (0: set O direct 1: set O escaped)
174 * 24 inDirectMode (boolean)
175 * 23..16 base64Counter (0..2)
176 * 7..0 bits (6 bits outgoing base64)
177 *
178 */
179
180static void
181_UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
182 if(choice<=UCNV_RESET_TO_UNICODE) {
183 /* reset toUnicode */
184 cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
185 cnv->toULength=0;
186 }
187 if(choice!=UCNV_RESET_TO_UNICODE) {
188 /* reset fromUnicode */
189 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
190 }
191}
192
193static void
194_UTF7Open(UConverter *cnv,
729e4ab9 195 UConverterLoadArgs *pArgs,
b75a7d8f 196 UErrorCode *pErrorCode) {
729e4ab9
A
197 if(UCNV_GET_VERSION(cnv)<=1) {
198 /* TODO(markus): Should just use cnv->options rather than copying the version number. */
199 cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
b75a7d8f
A
200 _UTF7Reset(cnv, UCNV_RESET_BOTH);
201 } else {
202 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
203 }
204}
205
206static void
207_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
208 UErrorCode *pErrorCode) {
209 UConverter *cnv;
210 const uint8_t *source, *sourceLimit;
211 UChar *target;
212 const UChar *targetLimit;
213 int32_t *offsets;
214
215 uint8_t *bytes;
216 uint8_t byteIndex;
217
218 int32_t length, targetCapacity;
219
220 /* UTF-7 state */
221 uint16_t bits;
222 int8_t base64Counter;
223 UBool inDirectMode;
224
225 int8_t base64Value;
226
227 int32_t sourceIndex, nextSourceIndex;
228
229 uint8_t b;
230 /* set up the local pointers */
231 cnv=pArgs->converter;
232
233 source=(const uint8_t *)pArgs->source;
234 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
235 target=pArgs->target;
236 targetLimit=pArgs->targetLimit;
237 offsets=pArgs->offsets;
238 /* get the state machine state */
239 {
240 uint32_t status=cnv->toUnicodeStatus;
241 inDirectMode=(UBool)((status>>24)&1);
242 base64Counter=(int8_t)(status>>16);
243 bits=(uint16_t)status;
244 }
245 bytes=cnv->toUBytes;
246 byteIndex=cnv->toULength;
247
248 /* sourceIndex=-1 if the current character began in the previous buffer */
249 sourceIndex=byteIndex==0 ? 0 : -1;
250 nextSourceIndex=0;
251
b75a7d8f
A
252 if(inDirectMode) {
253directMode:
254 /*
255 * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
256 * with their US-ASCII byte values.
257 * Backslash and Tilde and most control characters are not allowed in UTF-7.
258 * A plus sign starts Unicode (or "escape") Mode.
259 *
260 * In Direct Mode, only the sourceIndex is used.
261 */
262 byteIndex=0;
73c04bcf
A
263 length=(int32_t)(sourceLimit-source);
264 targetCapacity=(int32_t)(targetLimit-target);
b75a7d8f
A
265 if(length>targetCapacity) {
266 length=targetCapacity;
267 }
268 while(length>0) {
269 b=*source++;
270 if(!isLegalUTF7(b)) {
271 /* illegal */
272 bytes[0]=b;
273 byteIndex=1;
374ca955
A
274 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
275 break;
b75a7d8f
A
276 } else if(b!=PLUS) {
277 /* write directly encoded character */
278 *target++=b;
279 if(offsets!=NULL) {
280 *offsets++=sourceIndex++;
281 }
282 } else /* PLUS */ {
283 /* switch to Unicode mode */
284 nextSourceIndex=++sourceIndex;
285 inDirectMode=FALSE;
286 byteIndex=0;
287 bits=0;
288 base64Counter=-1;
289 goto unicodeMode;
290 }
291 --length;
292 }
293 if(source<sourceLimit && target>=targetLimit) {
294 /* target is full */
295 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
296 }
297 } else {
298unicodeMode:
299 /*
300 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
301 * The base64 sequence ends with any character that is not in the base64 alphabet.
302 * A terminating minus sign is consumed.
303 *
304 * In Unicode Mode, the sourceIndex has the index to the start of the current
305 * base64 bytes, while nextSourceIndex is precisely parallel to source,
306 * keeping the index to the following byte.
307 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
308 */
309 while(source<sourceLimit) {
310 if(target<targetLimit) {
311 bytes[byteIndex++]=b=*source++;
312 ++nextSourceIndex;
729e4ab9
A
313 base64Value = -3; /* initialize as illegal */
314 if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
315 /* either
316 * base64Value==-1 for any legal character except base64 and minus sign, or
317 * base64Value==-3 for illegal characters:
318 * 1. In either case, leave Unicode mode.
319 * 2.1. If we ended with an incomplete UChar or none after the +, then
320 * generate an error for the preceding erroneous sequence and deal with
321 * the current (possibly illegal) character next time through.
322 * 2.2. Else the current char comes after a complete UChar, which was already
323 * pushed to the output buf, so:
324 * 2.2.1. If the current char is legal, just save it for processing next time.
325 * It may be for example, a plus which we need to deal with in direct mode.
326 * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
327 */
b75a7d8f 328 inDirectMode=TRUE;
729e4ab9
A
329 if(base64Counter==-1) {
330 /* illegal: + immediately followed by something other than base64 or minus sign */
331 /* include the plus sign in the reported sequence, but not the subsequent char */
332 --source;
333 bytes[0]=PLUS;
334 byteIndex=1;
335 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
336 break;
337 } else if(bits!=0) {
338 /* bits are illegally left over, a UChar is incomplete */
339 /* don't include current char (legal or illegal) in error seq */
340 --source;
341 --byteIndex;
342 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
343 break;
344 } else {
345 /* previous UChar was complete */
4388f060 346 if(base64Value==-3) {
729e4ab9
A
347 /* current character is illegal, deal with it here */
348 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
349 break;
350 } else {
351 /* un-read the current character in case it is a plus sign */
352 --source;
353 sourceIndex=nextSourceIndex-1;
354 goto directMode;
355 }
356 }
357 } else if(base64Value>=0) {
b75a7d8f
A
358 /* collect base64 bytes into UChars */
359 switch(base64Counter) {
360 case -1: /* -1 is immediately after the + */
361 case 0:
362 bits=base64Value;
363 base64Counter=1;
364 break;
365 case 1:
366 case 3:
367 case 4:
368 case 6:
369 bits=(uint16_t)((bits<<6)|base64Value);
370 ++base64Counter;
371 break;
372 case 2:
373 *target++=(UChar)((bits<<4)|(base64Value>>2));
374 if(offsets!=NULL) {
375 *offsets++=sourceIndex;
376 sourceIndex=nextSourceIndex-1;
377 }
378 bytes[0]=b; /* keep this byte in case an error occurs */
379 byteIndex=1;
380 bits=(uint16_t)(base64Value&3);
381 base64Counter=3;
382 break;
383 case 5:
384 *target++=(UChar)((bits<<2)|(base64Value>>4));
385 if(offsets!=NULL) {
386 *offsets++=sourceIndex;
387 sourceIndex=nextSourceIndex-1;
388 }
389 bytes[0]=b; /* keep this byte in case an error occurs */
390 byteIndex=1;
391 bits=(uint16_t)(base64Value&15);
392 base64Counter=6;
393 break;
394 case 7:
395 *target++=(UChar)((bits<<6)|base64Value);
396 if(offsets!=NULL) {
397 *offsets++=sourceIndex;
398 sourceIndex=nextSourceIndex;
399 }
400 byteIndex=0;
401 bits=0;
402 base64Counter=0;
403 break;
404 default:
405 /* will never occur */
406 break;
407 }
729e4ab9 408 } else /*base64Value==-2*/ {
b75a7d8f
A
409 /* minus sign terminates the base64 sequence */
410 inDirectMode=TRUE;
411 if(base64Counter==-1) {
412 /* +- i.e. a minus immediately following a plus */
413 *target++=PLUS;
414 if(offsets!=NULL) {
415 *offsets++=sourceIndex-1;
416 }
417 } else {
418 /* absorb the minus and leave the Unicode Mode */
419 if(bits!=0) {
420 /* bits are illegally left over, a UChar is incomplete */
374ca955
A
421 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
422 break;
b75a7d8f
A
423 }
424 }
425 sourceIndex=nextSourceIndex;
426 goto directMode;
b75a7d8f
A
427 }
428 } else {
429 /* target is full */
430 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
431 break;
432 }
433 }
434 }
b75a7d8f 435
374ca955
A
436 if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
437 /*
438 * if we are in Unicode mode, then the byteIndex might not be 0,
439 * but that is ok if bits==0
440 * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
441 * (not true for IMAP-mailbox-name where we must end in direct mode)
442 */
443 byteIndex=0;
b75a7d8f
A
444 }
445
374ca955
A
446 /* set the converter state back into UConverter */
447 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
448 cnv->toULength=byteIndex;
449
b75a7d8f
A
450 /* write back the updated pointers */
451 pArgs->source=(const char *)source;
452 pArgs->target=target;
453 pArgs->offsets=offsets;
454 return;
b75a7d8f
A
455}
456
457static void
458_UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
459 UErrorCode *pErrorCode) {
460 UConverter *cnv;
461 const UChar *source, *sourceLimit;
462 uint8_t *target, *targetLimit;
463 int32_t *offsets;
464
465 int32_t length, targetCapacity, sourceIndex;
466 UChar c;
467
468 /* UTF-7 state */
469 const UBool *encodeDirectly;
470 uint8_t bits;
471 int8_t base64Counter;
472 UBool inDirectMode;
473
474 /* set up the local pointers */
475 cnv=pArgs->converter;
476
477 /* set up the local pointers */
478 source=pArgs->source;
479 sourceLimit=pArgs->sourceLimit;
480 target=(uint8_t *)pArgs->target;
481 targetLimit=(uint8_t *)pArgs->targetLimit;
482 offsets=pArgs->offsets;
483
484 /* get the state machine state */
485 {
486 uint32_t status=cnv->fromUnicodeStatus;
487 encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
488 inDirectMode=(UBool)((status>>24)&1);
489 base64Counter=(int8_t)(status>>16);
490 bits=(uint8_t)status;
2ca993e8 491 U_ASSERT(bits<=UPRV_LENGTHOF(toBase64));
b75a7d8f
A
492 }
493
494 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
495 sourceIndex=0;
496
497 if(inDirectMode) {
498directMode:
73c04bcf
A
499 length=(int32_t)(sourceLimit-source);
500 targetCapacity=(int32_t)(targetLimit-target);
b75a7d8f
A
501 if(length>targetCapacity) {
502 length=targetCapacity;
503 }
504 while(length>0) {
505 c=*source++;
506 /* currently always encode CR LF SP TAB directly */
507 if(c<=127 && encodeDirectly[c]) {
508 /* encode directly */
509 *target++=(uint8_t)c;
510 if(offsets!=NULL) {
511 *offsets++=sourceIndex++;
512 }
513 } else if(c==PLUS) {
514 /* output +- for + */
515 *target++=PLUS;
516 if(target<targetLimit) {
517 *target++=MINUS;
518 if(offsets!=NULL) {
519 *offsets++=sourceIndex;
520 *offsets++=sourceIndex++;
521 }
522 /* realign length and targetCapacity */
523 goto directMode;
524 } else {
525 if(offsets!=NULL) {
526 *offsets++=sourceIndex++;
527 }
528 cnv->charErrorBuffer[0]=MINUS;
529 cnv->charErrorBufferLength=1;
530 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
531 break;
532 }
533 } else {
534 /* un-read this character and switch to Unicode Mode */
535 --source;
536 *target++=PLUS;
537 if(offsets!=NULL) {
538 *offsets++=sourceIndex;
539 }
540 inDirectMode=FALSE;
541 base64Counter=0;
542 goto unicodeMode;
543 }
544 --length;
545 }
546 if(source<sourceLimit && target>=targetLimit) {
547 /* target is full */
548 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
549 }
550 } else {
551unicodeMode:
552 while(source<sourceLimit) {
553 if(target<targetLimit) {
554 c=*source++;
555 if(c<=127 && encodeDirectly[c]) {
556 /* encode directly */
557 inDirectMode=TRUE;
558
559 /* trick: back out this character to make this easier */
560 --source;
561
562 /* terminate the base64 sequence */
563 if(base64Counter!=0) {
564 /* write remaining bits for the previous character */
565 *target++=toBase64[bits];
566 if(offsets!=NULL) {
567 *offsets++=sourceIndex-1;
568 }
569 }
570 if(fromBase64[c]!=-1) {
571 /* need to terminate with a minus */
572 if(target<targetLimit) {
573 *target++=MINUS;
574 if(offsets!=NULL) {
575 *offsets++=sourceIndex-1;
576 }
577 } else {
578 cnv->charErrorBuffer[0]=MINUS;
579 cnv->charErrorBufferLength=1;
580 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
581 break;
582 }
583 }
584 goto directMode;
585 } else {
586 /*
587 * base64 this character:
588 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
589 * and the bits of this character, each implicitly in UTF-16BE.
590 *
591 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
592 * character to the next. The actual 2 or 4 bits are shifted to the left edge
593 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
594 */
595 switch(base64Counter) {
596 case 0:
597 *target++=toBase64[c>>10];
598 if(target<targetLimit) {
599 *target++=toBase64[(c>>4)&0x3f];
600 if(offsets!=NULL) {
601 *offsets++=sourceIndex;
602 *offsets++=sourceIndex++;
603 }
604 } else {
605 if(offsets!=NULL) {
606 *offsets++=sourceIndex++;
607 }
608 cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
609 cnv->charErrorBufferLength=1;
610 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
611 }
612 bits=(uint8_t)((c&15)<<2);
613 base64Counter=1;
614 break;
615 case 1:
616 *target++=toBase64[bits|(c>>14)];
617 if(target<targetLimit) {
618 *target++=toBase64[(c>>8)&0x3f];
619 if(target<targetLimit) {
620 *target++=toBase64[(c>>2)&0x3f];
621 if(offsets!=NULL) {
622 *offsets++=sourceIndex;
623 *offsets++=sourceIndex;
624 *offsets++=sourceIndex++;
625 }
626 } else {
627 if(offsets!=NULL) {
628 *offsets++=sourceIndex;
629 *offsets++=sourceIndex++;
630 }
631 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
632 cnv->charErrorBufferLength=1;
633 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
634 }
635 } else {
636 if(offsets!=NULL) {
637 *offsets++=sourceIndex++;
638 }
639 cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
640 cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
641 cnv->charErrorBufferLength=2;
642 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
643 }
644 bits=(uint8_t)((c&3)<<4);
645 base64Counter=2;
646 break;
647 case 2:
648 *target++=toBase64[bits|(c>>12)];
649 if(target<targetLimit) {
650 *target++=toBase64[(c>>6)&0x3f];
651 if(target<targetLimit) {
652 *target++=toBase64[c&0x3f];
653 if(offsets!=NULL) {
654 *offsets++=sourceIndex;
655 *offsets++=sourceIndex;
656 *offsets++=sourceIndex++;
657 }
658 } else {
659 if(offsets!=NULL) {
660 *offsets++=sourceIndex;
661 *offsets++=sourceIndex++;
662 }
663 cnv->charErrorBuffer[0]=toBase64[c&0x3f];
664 cnv->charErrorBufferLength=1;
665 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
666 }
667 } else {
668 if(offsets!=NULL) {
669 *offsets++=sourceIndex++;
670 }
671 cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
672 cnv->charErrorBuffer[1]=toBase64[c&0x3f];
673 cnv->charErrorBufferLength=2;
674 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
675 }
676 bits=0;
677 base64Counter=0;
678 break;
679 default:
680 /* will never occur */
681 break;
682 }
683 }
684 } else {
685 /* target is full */
686 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
687 break;
688 }
689 }
690 }
691
692 if(pArgs->flush && source>=sourceLimit) {
693 /* flush remaining bits to the target */
4388f060
A
694 if(!inDirectMode) {
695 if (base64Counter!=0) {
696 if(target<targetLimit) {
697 *target++=toBase64[bits];
698 if(offsets!=NULL) {
699 *offsets++=sourceIndex-1;
700 }
701 } else {
702 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
703 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
704 }
705 }
706 /* Add final MINUS to terminate unicodeMode */
b75a7d8f 707 if(target<targetLimit) {
4388f060 708 *target++=MINUS;
b75a7d8f
A
709 if(offsets!=NULL) {
710 *offsets++=sourceIndex-1;
711 }
712 } else {
4388f060 713 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
b75a7d8f
A
714 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
715 }
716 }
717 /* reset the state for the next conversion */
718 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
719 } else {
720 /* set the converter state back into UConverter */
721 cnv->fromUnicodeStatus=
722 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
723 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
724 }
725
726 /* write back the updated pointers */
727 pArgs->source=source;
728 pArgs->target=(char *)target;
729 pArgs->offsets=offsets;
730 return;
731}
732
733static const char *
734_UTF7GetName(const UConverter *cnv) {
735 switch(cnv->fromUnicodeStatus>>28) {
736 case 1:
737 return "UTF-7,version=1";
738 default:
739 return "UTF-7";
740 }
741}
742
743static const UConverterImpl _UTF7Impl={
744 UCNV_UTF7,
745
746 NULL,
747 NULL,
748
749 _UTF7Open,
750 NULL,
751 _UTF7Reset,
752
753 _UTF7ToUnicodeWithOffsets,
754 _UTF7ToUnicodeWithOffsets,
755 _UTF7FromUnicodeWithOffsets,
756 _UTF7FromUnicodeWithOffsets,
374ca955 757 NULL,
b75a7d8f
A
758
759 NULL,
760 _UTF7GetName,
761 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
762 NULL,
763 ucnv_getCompleteUnicodeSet
764};
765
766static const UConverterStaticData _UTF7StaticData={
767 sizeof(UConverterStaticData),
768 "UTF-7",
769 0, /* TODO CCSID for UTF-7 */
770 UCNV_IBM, UCNV_UTF7,
771 1, 4,
772 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
773 FALSE, FALSE,
774 0,
775 0,
776 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
777};
778
2ca993e8
A
779const UConverterSharedData _UTF7Data=
780 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
b75a7d8f
A
781
782/* IMAP mailbox name encoding ----------------------------------------------- */
783
784/*
785 * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
786 * http://www.ietf.org/rfc/rfc2060.txt
787 *
788 * 5.1.3. Mailbox International Naming Convention
789 *
790 * By convention, international mailbox names are specified using a
791 * modified version of the UTF-7 encoding described in [UTF-7]. The
792 * purpose of these modifications is to correct the following problems
793 * with UTF-7:
794 *
795 * 1) UTF-7 uses the "+" character for shifting; this conflicts with
796 * the common use of "+" in mailbox names, in particular USENET
797 * newsgroup names.
798 *
799 * 2) UTF-7's encoding is BASE64 which uses the "/" character; this
800 * conflicts with the use of "/" as a popular hierarchy delimiter.
801 *
802 * 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
803 * the use of "\" as a popular hierarchy delimiter.
804 *
805 * 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
806 * the use of "~" in some servers as a home directory indicator.
807 *
808 * 5) UTF-7 permits multiple alternate forms to represent the same
809 * string; in particular, printable US-ASCII chararacters can be
810 * represented in encoded form.
811 *
812 * In modified UTF-7, printable US-ASCII characters except for "&"
813 * represent themselves; that is, characters with octet values 0x20-0x25
814 * and 0x27-0x7e. The character "&" (0x26) is represented by the two-
815 * octet sequence "&-".
816 *
817 * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
818 * Unicode 16-bit octets) are represented in modified BASE64, with a
819 * further modification from [UTF-7] that "," is used instead of "/".
820 * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
821 * character which can represent itself.
822 *
823 * "&" is used to shift to modified BASE64 and "-" to shift back to US-
824 * ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that
825 * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
826 * ").
827 *
828 * For example, here is a mailbox name which mixes English, Japanese,
829 * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
830 */
831
832/*
833 * For reference: the US-ASCII character classes defined in standard UTF-7
834 * (RFC 2152). The modified IMAP rules are implemented by the macros below.
835 *
836 * Set D (directly encoded characters) consists of the following
837 * characters: the upper and lower case letters A through Z
838 * and a through z, the 10 digits 0-9, and the following nine special
839 * characters (note that "+" and "=" are omitted):
840 * '(),-./:?
841 *
842 * Set O (optional direct characters) consists of the following
843 * characters (note that "\" and "~" are omitted):
844 * !"#$%&*;<=>@[]^_`{|}
845 *
846 * According to the rules in RFC 2152, the byte values for the following
847 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
848 * - all C0 control codes except for CR LF TAB
849 * - BACKSLASH
850 * - TILDE
851 * - DEL
852 * - all codes beyond US-ASCII, i.e. all >127
853 */
854
/* uses '&' not '+' to start a base64 sequence */
#define AMPERSAND 0x26
#define COMMA 0x2c
#define SLASH 0x2f

/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/*
 * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
 * (the argument is parenthesized everywhere so that expression arguments
 * such as x|y are compared as a whole)
 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

/* modified base64: value 63 maps to ',' instead of '/', and '/' is not a base64 digit */
#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
868
869/*
870 * converter status values:
871 *
872 * toUnicodeStatus:
873 * 24 inDirectMode (boolean)
874 * 23..16 base64Counter (-1..7)
875 * 15..0 bits (up to 14 bits incoming base64)
876 *
877 * fromUnicodeStatus:
878 * 24 inDirectMode (boolean)
879 * 23..16 base64Counter (0..2)
880 * 7..0 bits (6 bits outgoing base64)
881 *
882 * ignore bits 31..25
883 */
884
885static void
886_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
887 UErrorCode *pErrorCode) {
888 UConverter *cnv;
889 const uint8_t *source, *sourceLimit;
890 UChar *target;
891 const UChar *targetLimit;
892 int32_t *offsets;
893
894 uint8_t *bytes;
895 uint8_t byteIndex;
896
897 int32_t length, targetCapacity;
898
899 /* UTF-7 state */
900 uint16_t bits;
901 int8_t base64Counter;
902 UBool inDirectMode;
903
904 int8_t base64Value;
905
906 int32_t sourceIndex, nextSourceIndex;
907
908 UChar c;
909 uint8_t b;
910
911 /* set up the local pointers */
912 cnv=pArgs->converter;
913
914 source=(const uint8_t *)pArgs->source;
915 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
916 target=pArgs->target;
917 targetLimit=pArgs->targetLimit;
918 offsets=pArgs->offsets;
919 /* get the state machine state */
920 {
921 uint32_t status=cnv->toUnicodeStatus;
922 inDirectMode=(UBool)((status>>24)&1);
923 base64Counter=(int8_t)(status>>16);
924 bits=(uint16_t)status;
925 }
926 bytes=cnv->toUBytes;
927 byteIndex=cnv->toULength;
928
929 /* sourceIndex=-1 if the current character began in the previous buffer */
930 sourceIndex=byteIndex==0 ? 0 : -1;
931 nextSourceIndex=0;
932
b75a7d8f
A
933 if(inDirectMode) {
934directMode:
935 /*
936 * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
937 * with their US-ASCII byte values.
938 * An ampersand starts Unicode (or "escape") Mode.
939 *
940 * In Direct Mode, only the sourceIndex is used.
941 */
942 byteIndex=0;
73c04bcf
A
943 length=(int32_t)(sourceLimit-source);
944 targetCapacity=(int32_t)(targetLimit-target);
b75a7d8f
A
945 if(length>targetCapacity) {
946 length=targetCapacity;
947 }
948 while(length>0) {
949 b=*source++;
950 if(!isLegalIMAP(b)) {
951 /* illegal */
952 bytes[0]=b;
953 byteIndex=1;
374ca955
A
954 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
955 break;
b75a7d8f
A
956 } else if(b!=AMPERSAND) {
957 /* write directly encoded character */
958 *target++=b;
959 if(offsets!=NULL) {
960 *offsets++=sourceIndex++;
961 }
962 } else /* AMPERSAND */ {
963 /* switch to Unicode mode */
964 nextSourceIndex=++sourceIndex;
965 inDirectMode=FALSE;
966 byteIndex=0;
967 bits=0;
968 base64Counter=-1;
969 goto unicodeMode;
970 }
971 --length;
972 }
973 if(source<sourceLimit && target>=targetLimit) {
974 /* target is full */
975 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
976 }
977 } else {
978unicodeMode:
979 /*
980 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
981 * The base64 sequence ends with any character that is not in the base64 alphabet.
982 * A terminating minus sign is consumed.
983 * US-ASCII must not be base64-ed.
984 *
985 * In Unicode Mode, the sourceIndex has the index to the start of the current
986 * base64 bytes, while nextSourceIndex is precisely parallel to source,
987 * keeping the index to the following byte.
988 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
989 */
990 while(source<sourceLimit) {
991 if(target<targetLimit) {
992 bytes[byteIndex++]=b=*source++;
993 ++nextSourceIndex;
994 if(b>0x7e) {
995 /* illegal - test other illegal US-ASCII values by base64Value==-3 */
996 inDirectMode=TRUE;
374ca955
A
997 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
998 break;
b75a7d8f
A
999 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
1000 /* collect base64 bytes into UChars */
1001 switch(base64Counter) {
1002 case -1: /* -1 is immediately after the & */
1003 case 0:
1004 bits=base64Value;
1005 base64Counter=1;
1006 break;
1007 case 1:
1008 case 3:
1009 case 4:
1010 case 6:
1011 bits=(uint16_t)((bits<<6)|base64Value);
1012 ++base64Counter;
1013 break;
1014 case 2:
1015 c=(UChar)((bits<<4)|(base64Value>>2));
1016 if(isLegalIMAP(c)) {
1017 /* illegal */
1018 inDirectMode=TRUE;
374ca955
A
1019 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1020 goto endloop;
b75a7d8f
A
1021 }
1022 *target++=c;
1023 if(offsets!=NULL) {
1024 *offsets++=sourceIndex;
1025 sourceIndex=nextSourceIndex-1;
1026 }
1027 bytes[0]=b; /* keep this byte in case an error occurs */
1028 byteIndex=1;
1029 bits=(uint16_t)(base64Value&3);
1030 base64Counter=3;
1031 break;
1032 case 5:
1033 c=(UChar)((bits<<2)|(base64Value>>4));
1034 if(isLegalIMAP(c)) {
1035 /* illegal */
1036 inDirectMode=TRUE;
374ca955
A
1037 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1038 goto endloop;
b75a7d8f
A
1039 }
1040 *target++=c;
1041 if(offsets!=NULL) {
1042 *offsets++=sourceIndex;
1043 sourceIndex=nextSourceIndex-1;
1044 }
1045 bytes[0]=b; /* keep this byte in case an error occurs */
1046 byteIndex=1;
1047 bits=(uint16_t)(base64Value&15);
1048 base64Counter=6;
1049 break;
1050 case 7:
1051 c=(UChar)((bits<<6)|base64Value);
1052 if(isLegalIMAP(c)) {
1053 /* illegal */
1054 inDirectMode=TRUE;
374ca955
A
1055 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1056 goto endloop;
b75a7d8f
A
1057 }
1058 *target++=c;
1059 if(offsets!=NULL) {
1060 *offsets++=sourceIndex;
1061 sourceIndex=nextSourceIndex;
1062 }
1063 byteIndex=0;
1064 bits=0;
1065 base64Counter=0;
1066 break;
1067 default:
1068 /* will never occur */
1069 break;
1070 }
1071 } else if(base64Value==-2) {
1072 /* minus sign terminates the base64 sequence */
1073 inDirectMode=TRUE;
1074 if(base64Counter==-1) {
1075 /* &- i.e. a minus immediately following an ampersand */
1076 *target++=AMPERSAND;
1077 if(offsets!=NULL) {
1078 *offsets++=sourceIndex-1;
1079 }
1080 } else {
1081 /* absorb the minus and leave the Unicode Mode */
1082 if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1083 /* bits are illegally left over, a UChar is incomplete */
1084 /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
374ca955
A
1085 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1086 break;
b75a7d8f
A
1087 }
1088 }
1089 sourceIndex=nextSourceIndex;
1090 goto directMode;
1091 } else {
1092 if(base64Counter==-1) {
1093 /* illegal: & immediately followed by something other than base64 or minus sign */
1094 /* include the ampersand in the reported sequence */
1095 --sourceIndex;
1096 bytes[0]=AMPERSAND;
1097 bytes[1]=b;
1098 byteIndex=2;
1099 }
1100 /* base64Value==-1 for characters that are illegal only in Unicode mode */
1101 /* base64Value==-3 for illegal characters */
1102 /* illegal */
1103 inDirectMode=TRUE;
374ca955
A
1104 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1105 break;
b75a7d8f
A
1106 }
1107 } else {
1108 /* target is full */
1109 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1110 break;
1111 }
1112 }
1113 }
1114endloop:
1115
374ca955
A
1116 /*
1117 * the end of the input stream and detection of truncated input
1118 * are handled by the framework, but here we must check if we are in Unicode
1119 * mode and byteIndex==0 because we must end in direct mode
1120 *
1121 * conditions:
1122 * successful
1123 * in Unicode mode and byteIndex==0
1124 * end of input and no truncated input
1125 */
1126 if( U_SUCCESS(*pErrorCode) &&
1127 !inDirectMode && byteIndex==0 &&
1128 pArgs->flush && source>=sourceLimit
1129 ) {
1130 if(base64Counter==-1) {
1131 /* & at the very end of the input */
1132 /* make the ampersand the reported sequence */
1133 bytes[0]=AMPERSAND;
1134 byteIndex=1;
b75a7d8f 1135 }
374ca955
A
1136 /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1137
1138 inDirectMode=TRUE; /* avoid looping */
1139 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
b75a7d8f
A
1140 }
1141
374ca955
A
1142 /* set the converter state back into UConverter */
1143 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1144 cnv->toULength=byteIndex;
1145
b75a7d8f
A
1146 /* write back the updated pointers */
1147 pArgs->source=(const char *)source;
1148 pArgs->target=target;
1149 pArgs->offsets=offsets;
1150 return;
b75a7d8f
A
1151}
1152
/*
 * Convert UTF-16 code units from pArgs->source into bytes at pArgs->target
 * for the IMAP mailbox name converter, optionally recording in pArgs->offsets
 * the source index that produced each output byte.
 *
 * The converter alternates between two modes:
 * - direct mode: code units accepted by inSetDIMAP() are copied as single
 *   bytes, '&' is written as "&-", and any other code unit writes '&' and
 *   switches to Unicode mode;
 * - Unicode mode: code units are written as base64 digits via TO_BASE64_IMAP();
 *   a code unit accepted by isLegalIMAP() first flushes pending bits, writes
 *   '-' and switches back to direct mode.
 *
 * State is carried between calls in cnv->fromUnicodeStatus:
 * bit 24 = inDirectMode, bits 23..16 = base64Counter, bits 7..0 = bits;
 * the top four bits are preserved unchanged.
 *
 * When the target is full, *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR and
 * output bytes that were already computed but do not fit are stored in
 * cnv->charErrorBuffer.
 * When pArgs->flush is set and all source has been consumed, an open base64
 * sequence is terminated and the state is reset to direct mode.
 *
 * @param pArgs      conversion arguments; source, target and offsets are
 *                   updated on return
 * @param pErrorCode receives U_BUFFER_OVERFLOW_ERROR when the target is full
 */
1153static void
1154_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1155 UErrorCode *pErrorCode) {
1156 UConverter *cnv;
1157 const UChar *source, *sourceLimit;
1158 uint8_t *target, *targetLimit;
1159 int32_t *offsets;
1160
1161 int32_t length, targetCapacity, sourceIndex;
1162 UChar c;
1163 uint8_t b;
1164
1165 /* UTF-7 state */
1166 uint8_t bits;
1167 int8_t base64Counter;
1168 UBool inDirectMode;
1169
1170 /* set up the local pointers */
1171 cnv=pArgs->converter;
1172
1173 /* set up the local pointers */
1174 source=pArgs->source;
1175 sourceLimit=pArgs->sourceLimit;
1176 target=(uint8_t *)pArgs->target;
1177 targetLimit=(uint8_t *)pArgs->targetLimit;
1178 offsets=pArgs->offsets;
1179
1180 /* get the state machine state */
1181 {
/* unpack: bit 24 is the mode flag, bits 23..16 the counter, bits 7..0 the pending bits */
1182 uint32_t status=cnv->fromUnicodeStatus;
1183 inDirectMode=(UBool)((status>>24)&1);
1184 base64Counter=(int8_t)(status>>16);
1185 bits=(uint8_t)status;
1186 }
1187
1188 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1189 sourceIndex=0;
1190
1191 if(inDirectMode) {
1192directMode:
73c04bcf
A
/*
 * length is capped at the remaining target capacity, so the one-byte direct
 * writes in the loop below need no per-byte limit check
 */
1193 length=(int32_t)(sourceLimit-source);
1194 targetCapacity=(int32_t)(targetLimit-target);
b75a7d8f
A
1195 if(length>targetCapacity) {
1196 length=targetCapacity;
1197 }
1198 while(length>0) {
1199 c=*source++;
1200 /* encode 0x20..0x7e except '&' directly */
1201 if(inSetDIMAP(c)) {
1202 /* encode directly */
1203 *target++=(uint8_t)c;
1204 if(offsets!=NULL) {
1205 *offsets++=sourceIndex++;
1206 }
1207 } else if(c==AMPERSAND) {
1208 /* output &- for & */
/* this writes two bytes for one code unit, hence the explicit limit check */
1209 *target++=AMPERSAND;
1210 if(target<targetLimit) {
1211 *target++=MINUS;
1212 if(offsets!=NULL) {
1213 *offsets++=sourceIndex;
1214 *offsets++=sourceIndex++;
1215 }
1216 /* realign length and targetCapacity */
1217 goto directMode;
1218 } else {
/* the '-' does not fit: park it in the error buffer and report overflow */
1219 if(offsets!=NULL) {
1220 *offsets++=sourceIndex++;
1221 }
1222 cnv->charErrorBuffer[0]=MINUS;
1223 cnv->charErrorBufferLength=1;
1224 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1225 break;
1226 }
1227 } else {
1228 /* un-read this character and switch to Unicode Mode */
1229 --source;
1230 *target++=AMPERSAND;
1231 if(offsets!=NULL) {
1232 *offsets++=sourceIndex;
1233 }
1234 inDirectMode=FALSE;
1235 base64Counter=0;
1236 goto unicodeMode;
1237 }
1238 --length;
1239 }
1240 if(source<sourceLimit && target>=targetLimit) {
1241 /* target is full */
1242 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1243 }
1244 } else {
1245unicodeMode:
1246 while(source<sourceLimit) {
1247 if(target<targetLimit) {
1248 c=*source++;
1249 if(isLegalIMAP(c)) {
1250 /* encode directly */
1251 inDirectMode=TRUE;
1252
1253 /* trick: back out this character to make this easier */
1254 --source;
1255
1256 /* terminate the base64 sequence */
1257 if(base64Counter!=0) {
1258 /* write remaining bits for the previous character */
/* this byte always fits because target<targetLimit was checked above */
1259 *target++=TO_BASE64_IMAP(bits);
1260 if(offsets!=NULL) {
1261 *offsets++=sourceIndex-1;
1262 }
1263 }
1264 /* need to terminate with a minus */
1265 if(target<targetLimit) {
1266 *target++=MINUS;
1267 if(offsets!=NULL) {
1268 *offsets++=sourceIndex-1;
1269 }
1270 } else {
1271 cnv->charErrorBuffer[0]=MINUS;
1272 cnv->charErrorBufferLength=1;
1273 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1274 break;
1275 }
1276 goto directMode;
1277 } else {
1278 /*
1279 * base64 this character:
1280 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1281 * and the bits of this character, each implicitly in UTF-16BE.
1282 *
1283 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1284 * character to the next. The actual 2 or 4 bits are shifted to the left edge
1285 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1286 */
/*
 * In every case the first output byte fits because target<targetLimit was
 * checked above; later bytes that do not fit go to cnv->charErrorBuffer.
 */
1287 switch(base64Counter) {
1288 case 0:
/* nothing pending: emit bits 15..10 and 9..4 of c, keep bits 3..0 */
1289 b=(uint8_t)(c>>10);
1290 *target++=TO_BASE64_IMAP(b);
1291 if(target<targetLimit) {
1292 b=(uint8_t)((c>>4)&0x3f);
1293 *target++=TO_BASE64_IMAP(b);
1294 if(offsets!=NULL) {
1295 *offsets++=sourceIndex;
1296 *offsets++=sourceIndex++;
1297 }
1298 } else {
1299 if(offsets!=NULL) {
1300 *offsets++=sourceIndex++;
1301 }
1302 b=(uint8_t)((c>>4)&0x3f);
1303 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1304 cnv->charErrorBufferLength=1;
1305 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1306 }
1307 bits=(uint8_t)((c&15)<<2);
1308 base64Counter=1;
1309 break;
1310 case 1:
/* 4 bits pending: join them with bits 15..14 of c, emit 13..8 and 7..2, keep bits 1..0 */
1311 b=(uint8_t)(bits|(c>>14));
1312 *target++=TO_BASE64_IMAP(b);
1313 if(target<targetLimit) {
1314 b=(uint8_t)((c>>8)&0x3f);
1315 *target++=TO_BASE64_IMAP(b);
1316 if(target<targetLimit) {
1317 b=(uint8_t)((c>>2)&0x3f);
1318 *target++=TO_BASE64_IMAP(b);
1319 if(offsets!=NULL) {
1320 *offsets++=sourceIndex;
1321 *offsets++=sourceIndex;
1322 *offsets++=sourceIndex++;
1323 }
1324 } else {
1325 if(offsets!=NULL) {
1326 *offsets++=sourceIndex;
1327 *offsets++=sourceIndex++;
1328 }
1329 b=(uint8_t)((c>>2)&0x3f);
1330 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1331 cnv->charErrorBufferLength=1;
1332 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1333 }
1334 } else {
1335 if(offsets!=NULL) {
1336 *offsets++=sourceIndex++;
1337 }
1338 b=(uint8_t)((c>>8)&0x3f);
1339 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1340 b=(uint8_t)((c>>2)&0x3f);
1341 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1342 cnv->charErrorBufferLength=2;
1343 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1344 }
1345 bits=(uint8_t)((c&3)<<4);
1346 base64Counter=2;
1347 break;
1348 case 2:
/* 2 bits pending: join them with bits 15..12 of c, emit 11..6 and 5..0, nothing is left over */
1349 b=(uint8_t)(bits|(c>>12));
1350 *target++=TO_BASE64_IMAP(b);
1351 if(target<targetLimit) {
1352 b=(uint8_t)((c>>6)&0x3f);
1353 *target++=TO_BASE64_IMAP(b);
1354 if(target<targetLimit) {
1355 b=(uint8_t)(c&0x3f);
1356 *target++=TO_BASE64_IMAP(b);
1357 if(offsets!=NULL) {
1358 *offsets++=sourceIndex;
1359 *offsets++=sourceIndex;
1360 *offsets++=sourceIndex++;
1361 }
1362 } else {
1363 if(offsets!=NULL) {
1364 *offsets++=sourceIndex;
1365 *offsets++=sourceIndex++;
1366 }
1367 b=(uint8_t)(c&0x3f);
1368 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1369 cnv->charErrorBufferLength=1;
1370 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1371 }
1372 } else {
1373 if(offsets!=NULL) {
1374 *offsets++=sourceIndex++;
1375 }
1376 b=(uint8_t)((c>>6)&0x3f);
1377 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1378 b=(uint8_t)(c&0x3f);
1379 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1380 cnv->charErrorBufferLength=2;
1381 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1382 }
1383 bits=0;
1384 base64Counter=0;
1385 break;
1386 default:
1387 /* will never occur */
1388 break;
1389 }
1390 }
1391 } else {
1392 /* target is full */
1393 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1394 break;
1395 }
1396 }
1397 }
1398
1399 if(pArgs->flush && source>=sourceLimit) {
1400 /* flush remaining bits to the target */
1401 if(!inDirectMode) {
1402 if(base64Counter!=0) {
1403 if(target<targetLimit) {
1404 *target++=TO_BASE64_IMAP(bits);
1405 if(offsets!=NULL) {
1406 *offsets++=sourceIndex-1;
1407 }
1408 } else {
/* append after any bytes the loop above already stored in the error buffer */
1409 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1410 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1411 }
1412 }
1413 /* need to terminate with a minus */
1414 if(target<targetLimit) {
1415 *target++=MINUS;
1416 if(offsets!=NULL) {
1417 *offsets++=sourceIndex-1;
1418 }
1419 } else {
1420 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1421 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1422 }
1423 }
1424 /* reset the state for the next conversion */
1425 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1426 } else {
1427 /* set the converter state back into UConverter */
1428 cnv->fromUnicodeStatus=
1429 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
1430 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1431 }
1432
1433 /* write back the updated pointers */
1434 pArgs->source=source;
1435 pArgs->target=(char *)target;
1436 pArgs->offsets=offsets;
1437 return;
1438}
1439
/*
 * Implementation table for the IMAP-mailbox-name converter
 * (type UCNV_IMAP_MAILBOX).
 * It reuses _UTF7Open and _UTF7Reset, lists the same offsets-aware function
 * twice for each conversion direction, and ends with
 * ucnv_getCompleteUnicodeSet; every remaining entry is NULL.
 * NOTE(review): entries are positional and their meaning comes from the
 * UConverterImpl declaration, which is not visible here - confirm against
 * that declaration before adding or reordering entries.
 */
1440static const UConverterImpl _IMAPImpl={
1441 UCNV_IMAP_MAILBOX,
1442
1443 NULL,
1444 NULL,
1445
1446 _UTF7Open,
1447 NULL,
1448 _UTF7Reset,
1449
1450 _IMAPToUnicodeWithOffsets,
1451 _IMAPToUnicodeWithOffsets,
1452 _IMAPFromUnicodeWithOffsets,
1453 _IMAPFromUnicodeWithOffsets,
374ca955 1454 NULL,
b75a7d8f
A
1455
1456 NULL,
1457 NULL,
1458 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1459 NULL,
1460 ucnv_getCompleteUnicodeSet
1461};
1462
/*
 * Static descriptor for the converter named "IMAP-mailbox-name".
 * The initializer is positional: structure size, name, a zero CCSID
 * placeholder (see the TODO below), UCNV_IBM and UCNV_IMAP_MAILBOX, followed
 * by numeric settings and a zeroed reserved area.
 * NOTE(review): "1, 4" presumably are the minimum and maximum number of bytes
 * per character, and the remaining FALSE/0 values other flags - confirm
 * against the UConverterStaticData declaration, which is not visible here.
 */
1463static const UConverterStaticData _IMAPStaticData={
1464 sizeof(UConverterStaticData),
1465 "IMAP-mailbox-name",
374ca955 1466 0, /* TODO CCSID for IMAP-mailbox-name */
b75a7d8f
A
1467 UCNV_IBM, UCNV_IMAP_MAILBOX,
1468 1, 4,
1469 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1470 FALSE, FALSE,
1471 0,
1472 0,
1473 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1474};
1475
2ca993e8
A
/*
 * Shared-data object for the IMAP-mailbox-name converter, built from the
 * static descriptor and the implementation table defined above it in this
 * file. It is not declared static, so it has external linkage.
 */
1476const UConverterSharedData _IMAPData=
1477 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl);
374ca955
A
1478
1479#endif