]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucnv_u7.c
ICU-461.13.tar.gz
[apple/icu.git] / icuSources / common / ucnv_u7.c
1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u7.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
13 *
14 * UTF-7 converter implementation. Used to be in ucnv_utf.c.
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_CONVERSION
20
21 #include "unicode/ucnv.h"
22 #include "ucnv_bld.h"
23 #include "ucnv_cnv.h"
24
25 /* UTF-7 -------------------------------------------------------------------- */
26
27 /*
28 * UTF-7 is a stateful encoding of Unicode.
29 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
30 * It was intended for use in Internet email systems, using in its bytewise
31 * encoding only a subset of 7-bit US-ASCII.
32 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
33 * occasionally used.
34 *
35 * When converting Unicode to UTF-7, the RFC allows some US-ASCII characters
36 * to be encoded either directly or in base64. In particular, the characters in
37 * set O as defined in the RFC (see below) may be encoded directly but are not
38 * allowed in, e.g., email headers.
39 * By default, the ICU UTF-7 converter encodes set O directly.
40 * By choosing the option "version=1", set O will be escaped instead.
41 * For example:
42 * utf7Converter=ucnv_open("UTF-7,version=1");
43 *
44 * For details about email headers see RFC 2047.
45 */
46
47 /*
48 * Tests for US-ASCII characters belonging to character classes
49 * defined in UTF-7.
50 *
51 * Set D (directly encoded characters) consists of the following
52 * characters: the upper and lower case letters A through Z
53 * and a through z, the 10 digits 0-9, and the following nine special
54 * characters (note that "+" and "=" are omitted):
55 * '(),-./:?
56 *
57 * Set O (optional direct characters) consists of the following
58 * characters (note that "\" and "~" are omitted):
59 * !"#$%&*;<=>@[]^_`{|}
60 *
61 * According to the rules in RFC 2152, the byte values for the following
62 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
63 * - all C0 control codes except for CR LF TAB
64 * - BACKSLASH
65 * - TILDE
66 * - DEL
67 * - all codes beyond US-ASCII, i.e. all >127
68 */
/*
 * Non-zero if c is a member of UTF-7 Set D: the letters A-Z and a-z,
 * the digits 0-9, and the nine special characters '(),-./:?
 * Numeric code values are used rather than character literals.
 * Each range test reduces (c)-base to uint8_t before comparing.
 * Note: c is evaluated more than once.
 */
#define inSetD(c) ( \
    (c)==0x3a || (c)==0x3f ||       /* : ? */ \
    (uint8_t)((c)-0x27)<3 ||        /* ' ( ) */ \
    (uint8_t)((c)-0x2c)<4 ||        /* , - . / */ \
    (uint8_t)((c)-0x30)<10 ||       /* 0-9 */ \
    (uint8_t)((c)-0x41)<26 ||       /* A-Z */ \
    (uint8_t)((c)-0x61)<26          /* a-z */ \
)
76
/*
 * Non-zero if c is a member of UTF-7 Set O (optional direct characters):
 * !"#$%&*;<=>@[]^_`{|}
 * Numeric code values are used rather than character literals.
 * Each range test reduces (c)-base to uint8_t before comparing.
 * Note: c is evaluated more than once.
 */
#define inSetO(c) ( \
    (c)==0x2a || (c)==0x40 || (c)==0x5b ||  /* * @ [ */ \
    (uint8_t)((c)-0x7b)<3 ||                /* { | } */ \
    (uint8_t)((c)-0x5d)<4 ||                /* ] ^ _ ` */ \
    (uint8_t)((c)-0x3b)<4 ||                /* ; < = > */ \
    (uint8_t)((c)-0x21)<6                   /* ! " # $ % & */ \
)
84
/* TAB (9), LF (10) or CR (13) */
#define isCRLFTAB(c) ((c)==9 || (c)==10 || (c)==13)
/* TAB, LF, CR or SPACE (32) */
#define isCRLFSPTAB(c) (isCRLFTAB(c) || (c)==32)

/* US-ASCII code values of the characters with special meaning in UTF-7 */
#define PLUS 0x2b
#define MINUS 0x2d
#define BACKSLASH 0x5c
#define TILDE 0x7e

/*
 * Legal byte values in UTF-7: CR, LF, TAB, and the US-ASCII characters from
 * space (32) up to but not including tilde (126), with backslash excluded.
 * Note: c is evaluated more than once.
 */
#define isLegalUTF7(c) (isCRLFTAB(c) || ((c)!=BACKSLASH && (uint8_t)((c)-0x20)<94))
95
96 /* encode directly sets D and O and CR LF SP TAB */
97 static const UBool encodeDirectlyMaximum[128]={
98 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
99 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101
102 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
103 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
104
105 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
106 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
107
108 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
109 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
110 };
111
112 /* encode directly set D and CR LF SP TAB but not set O */
113 static const UBool encodeDirectlyRestricted[128]={
114 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
115 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
117
118 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
119 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
120
121 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
122 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
123
124 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
125 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
126 };
127
/*
 * Maps a 6-bit value (0..63) to the US-ASCII code value of its base64
 * character: A-Z, a-z, 0-9, '+', '/'.
 */
static const uint8_t toBase64[64]={
    /*  0.. 7: A-H     */  65,  66,  67,  68,  69,  70,  71,  72,
    /*  8..15: I-P     */  73,  74,  75,  76,  77,  78,  79,  80,
    /* 16..23: Q-X     */  81,  82,  83,  84,  85,  86,  87,  88,
    /* 24..31: Y-Z a-f */  89,  90,  97,  98,  99, 100, 101, 102,
    /* 32..39: g-n     */ 103, 104, 105, 106, 107, 108, 109, 110,
    /* 40..47: o-v     */ 111, 112, 113, 114, 115, 116, 117, 118,
    /* 48..55: w-z 0-3 */ 119, 120, 121, 122,  48,  49,  50,  51,
    /* 56..63: 4-9 + / */  52,  53,  54,  55,  56,  57,  43,  47
};
141
/*
 * Classifies a US-ASCII byte (0..127) for base64 decoding:
 *   0..63  the 6-bit value of a base64 character
 *   -1     a legal character that is not base64 (this includes CR, LF, TAB)
 *   -2     the minus sign
 *   -3     an illegal byte
 */
static const int8_t fromBase64[128]={
    /* 0x00..0x0f: C0 controls; TAB, LF, CR are legal */
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
    /* 0x10..0x1f: C0 controls, all illegal */
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
    /* 0x20..0x2f: punctuation; '+' is 62, '-' is -2, '/' is 63 */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
    /* 0x30..0x3f: digits 0-9 are 52..61 */
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
    /* 0x40..0x4f: '@', then A-O are 0..14 */
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    /* 0x50..0x5f: P-Z are 15..25; backslash is illegal */
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,
    /* 0x60..0x6f: '`', then a-o are 26..40 */
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    /* 0x70..0x7f: p-z are 41..51; tilde and DEL are illegal */
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
};
161
162 /*
163 * converter status values:
164 *
165 * toUnicodeStatus:
166 * 24 inDirectMode (boolean)
167 * 23..16 base64Counter (-1..7)
168 * 15..0 bits (up to 14 bits incoming base64)
169 *
170 * fromUnicodeStatus:
171 * 31..28 version (0: set O direct 1: set O escaped)
172 * 24 inDirectMode (boolean)
173 * 23..16 base64Counter (0..2)
174 * 7..0 bits (6 bits outgoing base64)
175 *
176 */
177
178 static void
179 _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
180 if(choice<=UCNV_RESET_TO_UNICODE) {
181 /* reset toUnicode */
182 cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
183 cnv->toULength=0;
184 }
185 if(choice!=UCNV_RESET_TO_UNICODE) {
186 /* reset fromUnicode */
187 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
188 }
189 }
190
191 static void
192 _UTF7Open(UConverter *cnv,
193 UConverterLoadArgs *pArgs,
194 UErrorCode *pErrorCode) {
195 if(UCNV_GET_VERSION(cnv)<=1) {
196 /* TODO(markus): Should just use cnv->options rather than copying the version number. */
197 cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
198 _UTF7Reset(cnv, UCNV_RESET_BOTH);
199 } else {
200 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
201 }
202 }
203
/*
 * Convert UTF-7 bytes from pArgs->source into UChars at pArgs->target.
 *
 * The function is a two-state machine (direct mode / Unicode mode) whose
 * state (inDirectMode, base64Counter, bits) is loaded from and stored back
 * into cnv->toUnicodeStatus; bytes of a pending sequence are kept in
 * cnv->toUBytes with their count in cnv->toULength.
 * If pArgs->offsets is not NULL, one source offset is written per output UChar.
 *
 * On return, pArgs->source, pArgs->target and pArgs->offsets are advanced.
 * *pErrorCode is set to
 * - U_ILLEGAL_CHAR_FOUND for an illegal byte, a '+' followed by a byte that is
 *   neither base64 nor '-', or a base64 sequence ending with leftover bits
 * - U_BUFFER_OVERFLOW_ERROR when the target is full while input remains
 */
204 static void
205 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
206 UErrorCode *pErrorCode) {
207 UConverter *cnv;
208 const uint8_t *source, *sourceLimit;
209 UChar *target;
210 const UChar *targetLimit;
211 int32_t *offsets;
212
213 uint8_t *bytes;
214 uint8_t byteIndex;
215
216 int32_t length, targetCapacity;
217
218 /* UTF-7 state */
219 uint16_t bits;
220 int8_t base64Counter;
221 UBool inDirectMode;
222
223 int8_t base64Value;
224
225 int32_t sourceIndex, nextSourceIndex;
226
227 uint8_t b;
228 /* set up the local pointers */
229 cnv=pArgs->converter;
230
231 source=(const uint8_t *)pArgs->source;
232 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
233 target=pArgs->target;
234 targetLimit=pArgs->targetLimit;
235 offsets=pArgs->offsets;
236 /* get the state machine state */
237 {
238 uint32_t status=cnv->toUnicodeStatus;
239 inDirectMode=(UBool)((status>>24)&1);
240 base64Counter=(int8_t)(status>>16);
241 bits=(uint16_t)status;
242 }
243 bytes=cnv->toUBytes;
244 byteIndex=cnv->toULength;
245
246 /* sourceIndex=-1 if the current character began in the previous buffer */
247 sourceIndex=byteIndex==0 ? 0 : -1;
248 nextSourceIndex=0;
249
250 if(inDirectMode) {
251 directMode:
252 /*
253 * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
254 * with their US-ASCII byte values.
255 * Backslash and Tilde and most control characters are not allowed in UTF-7.
256 * A plus sign starts Unicode (or "escape") Mode.
257 *
258 * In Direct Mode, only the sourceIndex is used.
259 */
260 byteIndex=0;
/*
 * Each direct-mode byte produces at most one UChar, so the loop count is
 * limited to the smaller of the remaining input and the remaining output.
 */
261 length=(int32_t)(sourceLimit-source);
262 targetCapacity=(int32_t)(targetLimit-target);
263 if(length>targetCapacity) {
264 length=targetCapacity;
265 }
266 while(length>0) {
267 b=*source++;
268 if(!isLegalUTF7(b)) {
269 /* illegal */
270 bytes[0]=b;
271 byteIndex=1;
272 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
273 break;
274 } else if(b!=PLUS) {
275 /* write directly encoded character */
276 *target++=b;
277 if(offsets!=NULL) {
278 *offsets++=sourceIndex++;
279 }
280 } else /* PLUS */ {
281 /* switch to Unicode mode */
282 nextSourceIndex=++sourceIndex;
283 inDirectMode=FALSE;
284 byteIndex=0;
285 bits=0;
286 base64Counter=-1;
287 goto unicodeMode;
288 }
289 --length;
290 }
291 if(source<sourceLimit && target>=targetLimit) {
292 /* target is full */
293 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
294 }
295 } else {
296 unicodeMode:
297 /*
298 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
299 * The base64 sequence ends with any character that is not in the base64 alphabet.
300 * A terminating minus sign is consumed.
301 *
302 * In Unicode Mode, the sourceIndex has the index to the start of the current
303 * base64 bytes, while nextSourceIndex is precisely parallel to source,
304 * keeping the index to the following byte.
305 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
306 */
307 while(source<sourceLimit) {
308 if(target<targetLimit) {
309 bytes[byteIndex++]=b=*source++;
310 ++nextSourceIndex;
311 base64Value = -3; /* initialize as illegal */
/* b>=126 is tested first, so fromBase64[] is only indexed with b<126 */
312 if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
313 /* either
314 * base64Value==-1 for any legal character except base64 and minus sign, or
315 * base64Value==-3 for illegal characters:
316 * 1. In either case, leave Unicode mode.
317 * 2.1. If we ended with an incomplete UChar or none after the +, then
318 * generate an error for the preceding erroneous sequence and deal with
319 * the current (possibly illegal) character next time through.
320 * 2.2. Else the current char comes after a complete UChar, which was already
321 * pushed to the output buf, so:
322 * 2.2.1. If the current char is legal, just save it for processing next time.
323 * It may be for example, a plus which we need to deal with in direct mode.
324 * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
325 */
326 inDirectMode=TRUE;
327 if(base64Counter==-1) {
328 /* illegal: + immediately followed by something other than base64 or minus sign */
329 /* include the plus sign in the reported sequence, but not the subsequent char */
330 --source;
331 bytes[0]=PLUS;
332 byteIndex=1;
333 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
334 break;
335 } else if(bits!=0) {
336 /* bits are illegally left over, a UChar is incomplete */
337 /* don't include current char (legal or illegal) in error seq */
338 --source;
339 --byteIndex;
340 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
341 break;
342 } else {
343 /* previous UChar was complete */
344 if (base64Value==-3) {
345 /* current character is illegal, deal with it here */
346 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
347 break;
348 } else {
349 /* un-read the current character in case it is a plus sign */
350 --source;
351 sourceIndex=nextSourceIndex-1;
352 goto directMode;
353 }
354 }
355 } else if(base64Value>=0) {
356 /* collect base64 bytes into UChars */
/*
 * base64Counter counts the base64 characters consumed in the current group
 * of 8 (8 characters * 6 bits = 48 bits = 3 UChars); a UChar is completed
 * in the cases 2, 5 and 7.
 */
357 switch(base64Counter) {
358 case -1: /* -1 is immediately after the + */
359 case 0:
360 bits=base64Value;
361 base64Counter=1;
362 break;
363 case 1:
364 case 3:
365 case 4:
366 case 6:
367 bits=(uint16_t)((bits<<6)|base64Value);
368 ++base64Counter;
369 break;
370 case 2:
371 *target++=(UChar)((bits<<4)|(base64Value>>2));
372 if(offsets!=NULL) {
373 *offsets++=sourceIndex;
374 sourceIndex=nextSourceIndex-1;
375 }
376 bytes[0]=b; /* keep this byte in case an error occurs */
377 byteIndex=1;
378 bits=(uint16_t)(base64Value&3);
379 base64Counter=3;
380 break;
381 case 5:
382 *target++=(UChar)((bits<<2)|(base64Value>>4));
383 if(offsets!=NULL) {
384 *offsets++=sourceIndex;
385 sourceIndex=nextSourceIndex-1;
386 }
387 bytes[0]=b; /* keep this byte in case an error occurs */
388 byteIndex=1;
389 bits=(uint16_t)(base64Value&15);
390 base64Counter=6;
391 break;
392 case 7:
393 *target++=(UChar)((bits<<6)|base64Value);
394 if(offsets!=NULL) {
395 *offsets++=sourceIndex;
396 sourceIndex=nextSourceIndex;
397 }
398 byteIndex=0;
399 bits=0;
400 base64Counter=0;
401 break;
402 default:
403 /* will never occur */
404 break;
405 }
406 } else /*base64Value==-2*/ {
407 /* minus sign terminates the base64 sequence */
408 inDirectMode=TRUE;
409 if(base64Counter==-1) {
410 /* +- i.e. a minus immediately following a plus */
411 *target++=PLUS;
412 if(offsets!=NULL) {
413 *offsets++=sourceIndex-1;
414 }
415 } else {
416 /* absorb the minus and leave the Unicode Mode */
417 if(bits!=0) {
418 /* bits are illegally left over, a UChar is incomplete */
419 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
420 break;
421 }
422 }
423 sourceIndex=nextSourceIndex;
424 goto directMode;
425 }
426 } else {
427 /* target is full */
428 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
429 break;
430 }
431 }
432 }
433
434 if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
435 /*
436 * if we are in Unicode mode, then the byteIndex might not be 0,
437 * but that is ok if bits==0
438 * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
439 * (not true for IMAP-mailbox-name where we must end in direct mode)
440 */
441 byteIndex=0;
442 }
443
444 /* set the converter state back into UConverter */
/* base64Counter may be -1; it is reduced to uint8_t so that it occupies only bits 23..16 */
445 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
446 cnv->toULength=byteIndex;
447
448 /* write back the updated pointers */
449 pArgs->source=(const char *)source;
450 pArgs->target=target;
451 pArgs->offsets=offsets;
452 return;
453 }
454
/*
 * Convert UChars from pArgs->source into UTF-7 bytes at pArgs->target.
 *
 * In direct mode, a UChar <=127 that is flagged in the selected
 * encodeDirectly table is written as a single byte, '+' is written as "+-",
 * and any other UChar writes a '+' and switches to Unicode mode.
 * In Unicode mode, UChars are written as base64; a directly encodable UChar
 * flushes the pending bits, adds a '-' where fromBase64[] requires one, and
 * returns to direct mode.
 *
 * State (version, inDirectMode, base64Counter, bits) is kept in
 * cnv->fromUnicodeStatus. Output bytes that do not fit into the target are
 * stored in cnv->charErrorBuffer and *pErrorCode is set to
 * U_BUFFER_OVERFLOW_ERROR. If pArgs->offsets is not NULL, one source offset
 * is written per byte written to the target.
 * On return, pArgs->source, pArgs->target and pArgs->offsets are advanced.
 */
455 static void
456 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
457 UErrorCode *pErrorCode) {
458 UConverter *cnv;
459 const UChar *source, *sourceLimit;
460 uint8_t *target, *targetLimit;
461 int32_t *offsets;
462
463 int32_t length, targetCapacity, sourceIndex;
464 UChar c;
465
466 /* UTF-7 state */
467 const UBool *encodeDirectly;
468 uint8_t bits;
469 int8_t base64Counter;
470 UBool inDirectMode;
471
472 /* set up the local pointers */
473 cnv=pArgs->converter;
474
475 /* set up the local pointers */
476 source=pArgs->source;
477 sourceLimit=pArgs->sourceLimit;
478 target=(uint8_t *)pArgs->target;
479 targetLimit=(uint8_t *)pArgs->targetLimit;
480 offsets=pArgs->offsets;
481
482 /* get the state machine state */
483 {
484 uint32_t status=cnv->fromUnicodeStatus;
/* status<0x10000000 means the version in bits 31..28 is 0: set O is encoded directly */
485 encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
486 inDirectMode=(UBool)((status>>24)&1);
487 base64Counter=(int8_t)(status>>16);
488 bits=(uint8_t)status;
489 }
490
491 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
492 sourceIndex=0;
493
494 if(inDirectMode) {
495 directMode:
/*
 * The loop count is the smaller of the remaining input and the remaining
 * output, so the first byte written per iteration always fits.
 */
496 length=(int32_t)(sourceLimit-source);
497 targetCapacity=(int32_t)(targetLimit-target);
498 if(length>targetCapacity) {
499 length=targetCapacity;
500 }
501 while(length>0) {
502 c=*source++;
503 /* currently always encode CR LF SP TAB directly */
504 if(c<=127 && encodeDirectly[c]) {
505 /* encode directly */
506 *target++=(uint8_t)c;
507 if(offsets!=NULL) {
508 *offsets++=sourceIndex++;
509 }
510 } else if(c==PLUS) {
511 /* output +- for + */
512 *target++=PLUS;
513 if(target<targetLimit) {
514 *target++=MINUS;
515 if(offsets!=NULL) {
516 *offsets++=sourceIndex;
517 *offsets++=sourceIndex++;
518 }
519 /* realign length and targetCapacity */
520 goto directMode;
521 } else {
522 if(offsets!=NULL) {
523 *offsets++=sourceIndex++;
524 }
/* no room for the '-': store it in the error buffer */
525 cnv->charErrorBuffer[0]=MINUS;
526 cnv->charErrorBufferLength=1;
527 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
528 break;
529 }
530 } else {
531 /* un-read this character and switch to Unicode Mode */
532 --source;
533 *target++=PLUS;
534 if(offsets!=NULL) {
535 *offsets++=sourceIndex;
536 }
537 inDirectMode=FALSE;
538 base64Counter=0;
539 goto unicodeMode;
540 }
541 --length;
542 }
543 if(source<sourceLimit && target>=targetLimit) {
544 /* target is full */
545 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
546 }
547 } else {
548 unicodeMode:
549 while(source<sourceLimit) {
550 if(target<targetLimit) {
551 c=*source++;
552 if(c<=127 && encodeDirectly[c]) {
553 /* encode directly */
554 inDirectMode=TRUE;
555
556 /* trick: back out this character to make this easier */
557 --source;
558
559 /* terminate the base64 sequence */
560 if(base64Counter!=0) {
561 /* write remaining bits for the previous character */
562 *target++=toBase64[bits];
563 if(offsets!=NULL) {
564 *offsets++=sourceIndex-1;
565 }
566 }
/* fromBase64[c]!=-1 here means c is a base64 character or the minus sign */
567 if(fromBase64[c]!=-1) {
568 /* need to terminate with a minus */
569 if(target<targetLimit) {
570 *target++=MINUS;
571 if(offsets!=NULL) {
572 *offsets++=sourceIndex-1;
573 }
574 } else {
575 cnv->charErrorBuffer[0]=MINUS;
576 cnv->charErrorBufferLength=1;
577 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
578 break;
579 }
580 }
581 goto directMode;
582 } else {
583 /*
584 * base64 this character:
585 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
586 * and the bits of this character, each implicitly in UTF-16BE.
587 *
588 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
589 * character to the next. The actual 2 or 4 bits are shifted to the left edge
590 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
591 */
/*
 * In each case the first byte fits because target<targetLimit was checked
 * above; the following bytes go to cnv->charErrorBuffer if the target is full.
 */
592 switch(base64Counter) {
593 case 0:
594 *target++=toBase64[c>>10];
595 if(target<targetLimit) {
596 *target++=toBase64[(c>>4)&0x3f];
597 if(offsets!=NULL) {
598 *offsets++=sourceIndex;
599 *offsets++=sourceIndex++;
600 }
601 } else {
602 if(offsets!=NULL) {
603 *offsets++=sourceIndex++;
604 }
605 cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
606 cnv->charErrorBufferLength=1;
607 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
608 }
609 bits=(uint8_t)((c&15)<<2);
610 base64Counter=1;
611 break;
612 case 1:
613 *target++=toBase64[bits|(c>>14)];
614 if(target<targetLimit) {
615 *target++=toBase64[(c>>8)&0x3f];
616 if(target<targetLimit) {
617 *target++=toBase64[(c>>2)&0x3f];
618 if(offsets!=NULL) {
619 *offsets++=sourceIndex;
620 *offsets++=sourceIndex;
621 *offsets++=sourceIndex++;
622 }
623 } else {
624 if(offsets!=NULL) {
625 *offsets++=sourceIndex;
626 *offsets++=sourceIndex++;
627 }
628 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
629 cnv->charErrorBufferLength=1;
630 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
631 }
632 } else {
633 if(offsets!=NULL) {
634 *offsets++=sourceIndex++;
635 }
636 cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
637 cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
638 cnv->charErrorBufferLength=2;
639 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
640 }
641 bits=(uint8_t)((c&3)<<4);
642 base64Counter=2;
643 break;
644 case 2:
645 *target++=toBase64[bits|(c>>12)];
646 if(target<targetLimit) {
647 *target++=toBase64[(c>>6)&0x3f];
648 if(target<targetLimit) {
649 *target++=toBase64[c&0x3f];
650 if(offsets!=NULL) {
651 *offsets++=sourceIndex;
652 *offsets++=sourceIndex;
653 *offsets++=sourceIndex++;
654 }
655 } else {
656 if(offsets!=NULL) {
657 *offsets++=sourceIndex;
658 *offsets++=sourceIndex++;
659 }
660 cnv->charErrorBuffer[0]=toBase64[c&0x3f];
661 cnv->charErrorBufferLength=1;
662 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
663 }
664 } else {
665 if(offsets!=NULL) {
666 *offsets++=sourceIndex++;
667 }
668 cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
669 cnv->charErrorBuffer[1]=toBase64[c&0x3f];
670 cnv->charErrorBufferLength=2;
671 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
672 }
673 bits=0;
674 base64Counter=0;
675 break;
676 default:
677 /* will never occur */
678 break;
679 }
680 }
681 } else {
682 /* target is full */
683 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
684 break;
685 }
686 }
687 }
688
689 if(pArgs->flush && source>=sourceLimit) {
690 /* flush remaining bits to the target */
691 if(!inDirectMode && base64Counter!=0) {
692 if(target<targetLimit) {
693 *target++=toBase64[bits];
694 if(offsets!=NULL) {
695 *offsets++=sourceIndex-1;
696 }
697 } else {
/* append after any bytes that the conversion loop already put into the error buffer */
698 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
699 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
700 }
701 }
702 /* reset the state for the next conversion */
703 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
704 } else {
705 /* set the converter state back into UConverter */
706 cnv->fromUnicodeStatus=
707 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
708 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
709 }
710
711 /* write back the updated pointers */
712 pArgs->source=source;
713 pArgs->target=(char *)target;
714 pArgs->offsets=offsets;
715 return;
716 }
717
718 static const char *
719 _UTF7GetName(const UConverter *cnv) {
720 switch(cnv->fromUnicodeStatus>>28) {
721 case 1:
722 return "UTF-7,version=1";
723 default:
724 return "UTF-7";
725 }
726 }
727
/*
 * Implementation table for the UTF-7 converter.
 * The initializers are positional; the same function is supplied twice for
 * each conversion direction.
 * NOTE(review): presumably the paired slots are the plain and the
 * with-offsets entry points of UConverterImpl - confirm against its
 * declaration, which is not visible here.
 */
728 static const UConverterImpl _UTF7Impl={
729 UCNV_UTF7,
730
731 NULL,
732 NULL,
733
734 _UTF7Open,
735 NULL,
736 _UTF7Reset,
737
738 _UTF7ToUnicodeWithOffsets,
739 _UTF7ToUnicodeWithOffsets,
740 _UTF7FromUnicodeWithOffsets,
741 _UTF7FromUnicodeWithOffsets,
742 NULL,
743
744 NULL,
745 _UTF7GetName,
746 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
747 NULL,
748 ucnv_getCompleteUnicodeSet
749 };
750
/*
 * Static converter description for UTF-7, named "UTF-7" and typed UCNV_UTF7.
 * The initializers are positional.
 * NOTE(review): presumably "1, 4" are the minimum and maximum bytes per
 * character - confirm against the UConverterStaticData declaration, which
 * is not visible here.
 */
751 static const UConverterStaticData _UTF7StaticData={
752 sizeof(UConverterStaticData),
753 "UTF-7",
754 0, /* TODO CCSID for UTF-7 */
755 UCNV_IBM, UCNV_UTF7,
756 1, 4,
757 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
758 FALSE, FALSE,
759 0,
760 0,
761 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
762 };
763
/*
 * Shared converter data for UTF-7; this is the only non-static definition in
 * this part of the file. It ties together _UTF7StaticData and _UTF7Impl.
 * The initializers are positional.
 * NOTE(review): the meaning of the remaining fields (for example the
 * all-ones second value) depends on the UConverterSharedData declaration,
 * which is not visible here - confirm there.
 */
764 const UConverterSharedData _UTF7Data={
765 sizeof(UConverterSharedData), ~((uint32_t)0),
766 NULL, NULL, &_UTF7StaticData, FALSE, &_UTF7Impl,
767 0
768 };
769
770 /* IMAP mailbox name encoding ----------------------------------------------- */
771
772 /*
773 * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
774 * http://www.ietf.org/rfc/rfc2060.txt
775 *
776 * 5.1.3. Mailbox International Naming Convention
777 *
778 * By convention, international mailbox names are specified using a
779 * modified version of the UTF-7 encoding described in [UTF-7]. The
780 * purpose of these modifications is to correct the following problems
781 * with UTF-7:
782 *
783 * 1) UTF-7 uses the "+" character for shifting; this conflicts with
784 * the common use of "+" in mailbox names, in particular USENET
785 * newsgroup names.
786 *
787 * 2) UTF-7's encoding is BASE64 which uses the "/" character; this
788 * conflicts with the use of "/" as a popular hierarchy delimiter.
789 *
790 * 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
791 * the use of "\" as a popular hierarchy delimiter.
792 *
793 * 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
794 * the use of "~" in some servers as a home directory indicator.
795 *
796 * 5) UTF-7 permits multiple alternate forms to represent the same
797 * string; in particular, printable US-ASCII characters can be
798 * represented in encoded form.
799 *
800 * In modified UTF-7, printable US-ASCII characters except for "&"
801 * represent themselves; that is, characters with octet values 0x20-0x25
802 * and 0x27-0x7e. The character "&" (0x26) is represented by the two-
803 * octet sequence "&-".
804 *
805 * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
806 * Unicode 16-bit octets) are represented in modified BASE64, with a
807 * further modification from [UTF-7] that "," is used instead of "/".
808 * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
809 * character which can represent itself.
810 *
811 * "&" is used to shift to modified BASE64 and "-" to shift back to US-
812 * ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that
813 * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
814 * ").
815 *
816 * For example, here is a mailbox name which mixes English, Japanese,
817 * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
818 */
819
820 /*
821 * Tests for US-ASCII characters belonging to character classes
822 * defined in UTF-7.
823 *
824 * Set D (directly encoded characters) consists of the following
825 * characters: the upper and lower case letters A through Z
826 * and a through z, the 10 digits 0-9, and the following nine special
827 * characters (note that "+" and "=" are omitted):
828 * '(),-./:?
829 *
830 * Set O (optional direct characters) consists of the following
831 * characters (note that "\" and "~" are omitted):
832 * !"#$%&*;<=>@[]^_`{|}
833 *
834 * According to the rules in RFC 2152, the byte values for the following
835 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
836 * - all C0 control codes except for CR LF TAB
837 * - BACKSLASH
838 * - TILDE
839 * - DEL
840 * - all codes beyond US-ASCII, i.e. all >127
841 */
842
/*
 * US-ASCII code values with special meaning in the IMAP mailbox name encoding:
 * '&' (not '+') starts a base64 sequence, and ',' replaces '/' in the
 * base64 alphabet.
 */
#define AMPERSAND 0x26
#define COMMA 0x2c
#define SLASH 0x2f

/*
 * All function-like macros below evaluate their argument more than once;
 * do not pass expressions with side effects.
 */

/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/*
 * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
 * (the argument is parenthesized so that an expression such as a|b is
 * compared as a whole)
 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

/* 6-bit value n (0..63) -> modified-base64 byte: like toBase64[] except that 63 maps to ',' */
#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
/*
 * byte c -> modified-base64 classification: like fromBase64[] except that
 * ',' yields 63 and '/' yields -1 (not a base64 character here);
 * c must be a valid index into fromBase64[]
 */
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
856
857 /*
858 * converter status values:
859 *
860 * toUnicodeStatus:
861 * 24 inDirectMode (boolean)
862 * 23..16 base64Counter (-1..7)
863 * 15..0 bits (up to 14 bits incoming base64)
864 *
865 * fromUnicodeStatus:
866 * 24 inDirectMode (boolean)
867 * 23..16 base64Counter (0..2)
868 * 7..0 bits (6 bits outgoing base64)
869 *
870 * ignore bits 31..25
871 */
872
/*
 * Convert IMAP-mailbox-name (modified UTF-7) bytes from pArgs->source into
 * UChars at pArgs->target.
 *
 * Same two-state structure as the UTF-7 toUnicode function above, with these
 * differences visible in the code below:
 * - '&' instead of '+' starts Unicode mode, and "&-" yields '&'
 * - ',' instead of '/' is the base64 character with value 63
 * - a decoded UChar in 0x20..0x7e is rejected
 * - any byte that is neither base64 nor '-' is an error in Unicode mode
 * - a terminating '-' is rejected when bits are left over or when
 *   base64Counter is not 0, 3 or 6
 * - ending the input (with flush) in Unicode mode with no pending bytes
 *   sets U_TRUNCATED_CHAR_FOUND
 *
 * State is kept in cnv->toUnicodeStatus, cnv->toUBytes and cnv->toULength.
 * If pArgs->offsets is not NULL, one source offset is written per output UChar.
 * Other errors are U_ILLEGAL_CHAR_FOUND and U_BUFFER_OVERFLOW_ERROR.
 * On return, pArgs->source, pArgs->target and pArgs->offsets are advanced.
 */
873 static void
874 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
875 UErrorCode *pErrorCode) {
876 UConverter *cnv;
877 const uint8_t *source, *sourceLimit;
878 UChar *target;
879 const UChar *targetLimit;
880 int32_t *offsets;
881
882 uint8_t *bytes;
883 uint8_t byteIndex;
884
885 int32_t length, targetCapacity;
886
887 /* UTF-7 state */
888 uint16_t bits;
889 int8_t base64Counter;
890 UBool inDirectMode;
891
892 int8_t base64Value;
893
894 int32_t sourceIndex, nextSourceIndex;
895
896 UChar c;
897 uint8_t b;
898
899 /* set up the local pointers */
900 cnv=pArgs->converter;
901
902 source=(const uint8_t *)pArgs->source;
903 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
904 target=pArgs->target;
905 targetLimit=pArgs->targetLimit;
906 offsets=pArgs->offsets;
907 /* get the state machine state */
908 {
909 uint32_t status=cnv->toUnicodeStatus;
910 inDirectMode=(UBool)((status>>24)&1);
911 base64Counter=(int8_t)(status>>16);
912 bits=(uint16_t)status;
913 }
914 bytes=cnv->toUBytes;
915 byteIndex=cnv->toULength;
916
917 /* sourceIndex=-1 if the current character began in the previous buffer */
918 sourceIndex=byteIndex==0 ? 0 : -1;
919 nextSourceIndex=0;
920
921 if(inDirectMode) {
922 directMode:
923 /*
924 * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
925 * with their US-ASCII byte values.
926 * An ampersand starts Unicode (or "escape") Mode.
927 *
928 * In Direct Mode, only the sourceIndex is used.
929 */
930 byteIndex=0;
/*
 * Each direct-mode byte produces at most one UChar, so the loop count is
 * limited to the smaller of the remaining input and the remaining output.
 */
931 length=(int32_t)(sourceLimit-source);
932 targetCapacity=(int32_t)(targetLimit-target);
933 if(length>targetCapacity) {
934 length=targetCapacity;
935 }
936 while(length>0) {
937 b=*source++;
938 if(!isLegalIMAP(b)) {
939 /* illegal */
940 bytes[0]=b;
941 byteIndex=1;
942 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
943 break;
944 } else if(b!=AMPERSAND) {
945 /* write directly encoded character */
946 *target++=b;
947 if(offsets!=NULL) {
948 *offsets++=sourceIndex++;
949 }
950 } else /* AMPERSAND */ {
951 /* switch to Unicode mode */
952 nextSourceIndex=++sourceIndex;
953 inDirectMode=FALSE;
954 byteIndex=0;
955 bits=0;
956 base64Counter=-1;
957 goto unicodeMode;
958 }
959 --length;
960 }
961 if(source<sourceLimit && target>=targetLimit) {
962 /* target is full */
963 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
964 }
965 } else {
966 unicodeMode:
967 /*
968 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
969 * The base64 sequence ends with any character that is not in the base64 alphabet.
970 * A terminating minus sign is consumed.
971 * US-ASCII must not be base64-ed.
972 *
973 * In Unicode Mode, the sourceIndex has the index to the start of the current
974 * base64 bytes, while nextSourceIndex is precisely parallel to source,
975 * keeping the index to the following byte.
976 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
977 */
978 while(source<sourceLimit) {
979 if(target<targetLimit) {
980 bytes[byteIndex++]=b=*source++;
981 ++nextSourceIndex;
/* b>0x7e is rejected first, so FROM_BASE64_IMAP() indexes fromBase64[] only with b<=0x7e */
982 if(b>0x7e) {
983 /* illegal - test other illegal US-ASCII values by base64Value==-3 */
984 inDirectMode=TRUE;
985 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
986 break;
987 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
988 /* collect base64 bytes into UChars */
/*
 * A UChar is completed in the cases 2, 5 and 7; each one is rejected if it
 * lies in 0x20..0x7e (see "US-ASCII must not be base64-ed" above).
 */
989 switch(base64Counter) {
990 case -1: /* -1 is immediately after the & */
991 case 0:
992 bits=base64Value;
993 base64Counter=1;
994 break;
995 case 1:
996 case 3:
997 case 4:
998 case 6:
999 bits=(uint16_t)((bits<<6)|base64Value);
1000 ++base64Counter;
1001 break;
1002 case 2:
1003 c=(UChar)((bits<<4)|(base64Value>>2));
1004 if(isLegalIMAP(c)) {
1005 /* illegal */
1006 inDirectMode=TRUE;
1007 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1008 goto endloop;
1009 }
1010 *target++=c;
1011 if(offsets!=NULL) {
1012 *offsets++=sourceIndex;
1013 sourceIndex=nextSourceIndex-1;
1014 }
1015 bytes[0]=b; /* keep this byte in case an error occurs */
1016 byteIndex=1;
1017 bits=(uint16_t)(base64Value&3);
1018 base64Counter=3;
1019 break;
1020 case 5:
1021 c=(UChar)((bits<<2)|(base64Value>>4));
1022 if(isLegalIMAP(c)) {
1023 /* illegal */
1024 inDirectMode=TRUE;
1025 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1026 goto endloop;
1027 }
1028 *target++=c;
1029 if(offsets!=NULL) {
1030 *offsets++=sourceIndex;
1031 sourceIndex=nextSourceIndex-1;
1032 }
1033 bytes[0]=b; /* keep this byte in case an error occurs */
1034 byteIndex=1;
1035 bits=(uint16_t)(base64Value&15);
1036 base64Counter=6;
1037 break;
1038 case 7:
1039 c=(UChar)((bits<<6)|base64Value);
1040 if(isLegalIMAP(c)) {
1041 /* illegal */
1042 inDirectMode=TRUE;
1043 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1044 goto endloop;
1045 }
1046 *target++=c;
1047 if(offsets!=NULL) {
1048 *offsets++=sourceIndex;
1049 sourceIndex=nextSourceIndex;
1050 }
1051 byteIndex=0;
1052 bits=0;
1053 base64Counter=0;
1054 break;
1055 default:
1056 /* will never occur */
1057 break;
1058 }
1059 } else if(base64Value==-2) {
1060 /* minus sign terminates the base64 sequence */
1061 inDirectMode=TRUE;
1062 if(base64Counter==-1) {
1063 /* &- i.e. a minus immediately following an ampersand */
1064 *target++=AMPERSAND;
1065 if(offsets!=NULL) {
1066 *offsets++=sourceIndex-1;
1067 }
1068 } else {
1069 /* absorb the minus and leave the Unicode Mode */
1070 if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1071 /* bits are illegally left over, a UChar is incomplete */
1072 /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1073 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1074 break;
1075 }
1076 }
1077 sourceIndex=nextSourceIndex;
1078 goto directMode;
1079 } else {
1080 if(base64Counter==-1) {
1081 /* illegal: & immediately followed by something other than base64 or minus sign */
1082 /* include the ampersand in the reported sequence */
1083 --sourceIndex;
1084 bytes[0]=AMPERSAND;
1085 bytes[1]=b;
1086 byteIndex=2;
1087 }
1088 /* base64Value==-1 for characters that are illegal only in Unicode mode */
1089 /* base64Value==-3 for illegal characters */
1090 /* illegal */
1091 inDirectMode=TRUE;
1092 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1093 break;
1094 }
1095 } else {
1096 /* target is full */
1097 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1098 break;
1099 }
1100 }
1101 }
1102 endloop:
1103
1104 /*
1105 * the end of the input stream and detection of truncated input
1106 * are handled by the framework, but here we must check if we are in Unicode
1107 * mode and byteIndex==0 because we must end in direct mode
1108 *
1109 * conditions:
1110 * successful
1111 * in Unicode mode and byteIndex==0
1112 * end of input and no truncated input
1113 */
1114 if( U_SUCCESS(*pErrorCode) &&
1115 !inDirectMode && byteIndex==0 &&
1116 pArgs->flush && source>=sourceLimit
1117 ) {
1118 if(base64Counter==-1) {
1119 /* & at the very end of the input */
1120 /* make the ampersand the reported sequence */
1121 bytes[0]=AMPERSAND;
1122 byteIndex=1;
1123 }
1124 /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1125
1126 inDirectMode=TRUE; /* avoid looping */
1127 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1128 }
1129
1130 /* set the converter state back into UConverter */
/* base64Counter may be -1; it is reduced to uint8_t so that it occupies only bits 23..16 */
1131 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1132 cnv->toULength=byteIndex;
1133
1134 /* write back the updated pointers */
1135 pArgs->source=(const char *)source;
1136 pArgs->target=target;
1137 pArgs->offsets=offsets;
1138 return;
1139 }
1140
1141 static void
1142 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1143 UErrorCode *pErrorCode) {
1144 UConverter *cnv;
1145 const UChar *source, *sourceLimit;
1146 uint8_t *target, *targetLimit;
1147 int32_t *offsets;
1148
1149 int32_t length, targetCapacity, sourceIndex;
1150 UChar c;
1151 uint8_t b;
1152
1153 /* UTF-7 state */
1154 uint8_t bits;
1155 int8_t base64Counter;
1156 UBool inDirectMode;
1157
1158 /* set up the local pointers */
1159 cnv=pArgs->converter;
1160
1161 /* set up the local pointers */
1162 source=pArgs->source;
1163 sourceLimit=pArgs->sourceLimit;
1164 target=(uint8_t *)pArgs->target;
1165 targetLimit=(uint8_t *)pArgs->targetLimit;
1166 offsets=pArgs->offsets;
1167
1168 /* get the state machine state */
1169 {
1170 uint32_t status=cnv->fromUnicodeStatus;
1171 inDirectMode=(UBool)((status>>24)&1);
1172 base64Counter=(int8_t)(status>>16);
1173 bits=(uint8_t)status;
1174 }
1175
1176 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1177 sourceIndex=0;
1178
1179 if(inDirectMode) {
1180 directMode:
1181 length=(int32_t)(sourceLimit-source);
1182 targetCapacity=(int32_t)(targetLimit-target);
1183 if(length>targetCapacity) {
1184 length=targetCapacity;
1185 }
1186 while(length>0) {
1187 c=*source++;
1188 /* encode 0x20..0x7e except '&' directly */
1189 if(inSetDIMAP(c)) {
1190 /* encode directly */
1191 *target++=(uint8_t)c;
1192 if(offsets!=NULL) {
1193 *offsets++=sourceIndex++;
1194 }
1195 } else if(c==AMPERSAND) {
1196 /* output &- for & */
1197 *target++=AMPERSAND;
1198 if(target<targetLimit) {
1199 *target++=MINUS;
1200 if(offsets!=NULL) {
1201 *offsets++=sourceIndex;
1202 *offsets++=sourceIndex++;
1203 }
1204 /* realign length and targetCapacity */
1205 goto directMode;
1206 } else {
1207 if(offsets!=NULL) {
1208 *offsets++=sourceIndex++;
1209 }
1210 cnv->charErrorBuffer[0]=MINUS;
1211 cnv->charErrorBufferLength=1;
1212 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1213 break;
1214 }
1215 } else {
1216 /* un-read this character and switch to Unicode Mode */
1217 --source;
1218 *target++=AMPERSAND;
1219 if(offsets!=NULL) {
1220 *offsets++=sourceIndex;
1221 }
1222 inDirectMode=FALSE;
1223 base64Counter=0;
1224 goto unicodeMode;
1225 }
1226 --length;
1227 }
1228 if(source<sourceLimit && target>=targetLimit) {
1229 /* target is full */
1230 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1231 }
1232 } else {
1233 unicodeMode:
1234 while(source<sourceLimit) {
1235 if(target<targetLimit) {
1236 c=*source++;
1237 if(isLegalIMAP(c)) {
1238 /* encode directly */
1239 inDirectMode=TRUE;
1240
1241 /* trick: back out this character to make this easier */
1242 --source;
1243
1244 /* terminate the base64 sequence */
1245 if(base64Counter!=0) {
1246 /* write remaining bits for the previous character */
1247 *target++=TO_BASE64_IMAP(bits);
1248 if(offsets!=NULL) {
1249 *offsets++=sourceIndex-1;
1250 }
1251 }
1252 /* need to terminate with a minus */
1253 if(target<targetLimit) {
1254 *target++=MINUS;
1255 if(offsets!=NULL) {
1256 *offsets++=sourceIndex-1;
1257 }
1258 } else {
1259 cnv->charErrorBuffer[0]=MINUS;
1260 cnv->charErrorBufferLength=1;
1261 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1262 break;
1263 }
1264 goto directMode;
1265 } else {
1266 /*
1267 * base64 this character:
1268 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1269 * and the bits of this character, each implicitly in UTF-16BE.
1270 *
1271 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1272 * character to the next. The actual 2 or 4 bits are shifted to the left edge
1273 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1274 */
1275 switch(base64Counter) {
1276 case 0:
1277 b=(uint8_t)(c>>10);
1278 *target++=TO_BASE64_IMAP(b);
1279 if(target<targetLimit) {
1280 b=(uint8_t)((c>>4)&0x3f);
1281 *target++=TO_BASE64_IMAP(b);
1282 if(offsets!=NULL) {
1283 *offsets++=sourceIndex;
1284 *offsets++=sourceIndex++;
1285 }
1286 } else {
1287 if(offsets!=NULL) {
1288 *offsets++=sourceIndex++;
1289 }
1290 b=(uint8_t)((c>>4)&0x3f);
1291 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1292 cnv->charErrorBufferLength=1;
1293 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1294 }
1295 bits=(uint8_t)((c&15)<<2);
1296 base64Counter=1;
1297 break;
1298 case 1:
1299 b=(uint8_t)(bits|(c>>14));
1300 *target++=TO_BASE64_IMAP(b);
1301 if(target<targetLimit) {
1302 b=(uint8_t)((c>>8)&0x3f);
1303 *target++=TO_BASE64_IMAP(b);
1304 if(target<targetLimit) {
1305 b=(uint8_t)((c>>2)&0x3f);
1306 *target++=TO_BASE64_IMAP(b);
1307 if(offsets!=NULL) {
1308 *offsets++=sourceIndex;
1309 *offsets++=sourceIndex;
1310 *offsets++=sourceIndex++;
1311 }
1312 } else {
1313 if(offsets!=NULL) {
1314 *offsets++=sourceIndex;
1315 *offsets++=sourceIndex++;
1316 }
1317 b=(uint8_t)((c>>2)&0x3f);
1318 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1319 cnv->charErrorBufferLength=1;
1320 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1321 }
1322 } else {
1323 if(offsets!=NULL) {
1324 *offsets++=sourceIndex++;
1325 }
1326 b=(uint8_t)((c>>8)&0x3f);
1327 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1328 b=(uint8_t)((c>>2)&0x3f);
1329 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1330 cnv->charErrorBufferLength=2;
1331 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1332 }
1333 bits=(uint8_t)((c&3)<<4);
1334 base64Counter=2;
1335 break;
1336 case 2:
1337 b=(uint8_t)(bits|(c>>12));
1338 *target++=TO_BASE64_IMAP(b);
1339 if(target<targetLimit) {
1340 b=(uint8_t)((c>>6)&0x3f);
1341 *target++=TO_BASE64_IMAP(b);
1342 if(target<targetLimit) {
1343 b=(uint8_t)(c&0x3f);
1344 *target++=TO_BASE64_IMAP(b);
1345 if(offsets!=NULL) {
1346 *offsets++=sourceIndex;
1347 *offsets++=sourceIndex;
1348 *offsets++=sourceIndex++;
1349 }
1350 } else {
1351 if(offsets!=NULL) {
1352 *offsets++=sourceIndex;
1353 *offsets++=sourceIndex++;
1354 }
1355 b=(uint8_t)(c&0x3f);
1356 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1357 cnv->charErrorBufferLength=1;
1358 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1359 }
1360 } else {
1361 if(offsets!=NULL) {
1362 *offsets++=sourceIndex++;
1363 }
1364 b=(uint8_t)((c>>6)&0x3f);
1365 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1366 b=(uint8_t)(c&0x3f);
1367 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1368 cnv->charErrorBufferLength=2;
1369 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1370 }
1371 bits=0;
1372 base64Counter=0;
1373 break;
1374 default:
1375 /* will never occur */
1376 break;
1377 }
1378 }
1379 } else {
1380 /* target is full */
1381 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1382 break;
1383 }
1384 }
1385 }
1386
1387 if(pArgs->flush && source>=sourceLimit) {
1388 /* flush remaining bits to the target */
1389 if(!inDirectMode) {
1390 if(base64Counter!=0) {
1391 if(target<targetLimit) {
1392 *target++=TO_BASE64_IMAP(bits);
1393 if(offsets!=NULL) {
1394 *offsets++=sourceIndex-1;
1395 }
1396 } else {
1397 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1398 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1399 }
1400 }
1401 /* need to terminate with a minus */
1402 if(target<targetLimit) {
1403 *target++=MINUS;
1404 if(offsets!=NULL) {
1405 *offsets++=sourceIndex-1;
1406 }
1407 } else {
1408 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1409 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1410 }
1411 }
1412 /* reset the state for the next conversion */
1413 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1414 } else {
1415 /* set the converter state back into UConverter */
1416 cnv->fromUnicodeStatus=
1417 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
1418 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1419 }
1420
1421 /* write back the updated pointers */
1422 pArgs->source=source;
1423 pArgs->target=(char *)target;
1424 pArgs->offsets=offsets;
1425 return;
1426 }
1427
1428 static const UConverterImpl _IMAPImpl={
1429 UCNV_IMAP_MAILBOX,
1430
1431 NULL,
1432 NULL,
1433
1434 _UTF7Open,
1435 NULL,
1436 _UTF7Reset,
1437
1438 _IMAPToUnicodeWithOffsets,
1439 _IMAPToUnicodeWithOffsets,
1440 _IMAPFromUnicodeWithOffsets,
1441 _IMAPFromUnicodeWithOffsets,
1442 NULL,
1443
1444 NULL,
1445 NULL,
1446 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1447 NULL,
1448 ucnv_getCompleteUnicodeSet
1449 };
1450
1451 static const UConverterStaticData _IMAPStaticData={
1452 sizeof(UConverterStaticData),
1453 "IMAP-mailbox-name",
1454 0, /* TODO CCSID for IMAP-mailbox-name */
1455 UCNV_IBM, UCNV_IMAP_MAILBOX,
1456 1, 4,
1457 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1458 FALSE, FALSE,
1459 0,
1460 0,
1461 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1462 };
1463
1464 const UConverterSharedData _IMAPData={
1465 sizeof(UConverterSharedData), ~((uint32_t)0),
1466 NULL, NULL, &_IMAPStaticData, FALSE, &_IMAPImpl,
1467 0
1468 };
1469
1470 #endif