]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnv_u7.cpp
ICU-62108.0.1.tar.gz
[apple/icu.git] / icuSources / common / ucnv_u7.cpp
CommitLineData
f3c0d7a5
A
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2002-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnv_u7.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jul01
*   created by: Markus W. Scherer
*
*   UTF-7 converter implementation. Used to be in ucnv_utf.c.
*/
18
19#include "unicode/utypes.h"
374ca955 20
b331163b 21#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
374ca955 22
2ca993e8 23#include "cmemory.h"
b75a7d8f 24#include "unicode/ucnv.h"
b75a7d8f
A
25#include "ucnv_bld.h"
26#include "ucnv_cnv.h"
4388f060 27#include "uassert.h"
b75a7d8f
A
28
29/* UTF-7 -------------------------------------------------------------------- */
30
b75a7d8f
A
31/*
32 * UTF-7 is a stateful encoding of Unicode.
33 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
34 * It was intended for use in Internet email systems, using in its bytewise
35 * encoding only a subset of 7-bit US-ASCII.
36 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
37 * occasionally used.
38 *
39 * For converting Unicode to UTF-7, the RFC allows to encode some US-ASCII
40 * characters directly or in base64. Especially, the characters in set O
41 * as defined in the RFC (see below) may be encoded directly but are not
42 * allowed in, e.g., email headers.
43 * By default, the ICU UTF-7 converter encodes set O directly.
44 * By choosing the option "version=1", set O will be escaped instead.
45 * For example:
46 * utf7Converter=ucnv_open("UTF-7,version=1");
47 *
48 * For details about email headers see RFC 2047.
49 */
50
51/*
52 * Tests for US-ASCII characters belonging to character classes
53 * defined in UTF-7.
54 *
55 * Set D (directly encoded characters) consists of the following
56 * characters: the upper and lower case letters A through Z
57 * and a through z, the 10 digits 0-9, and the following nine special
58 * characters (note that "+" and "=" are omitted):
59 * '(),-./:?
60 *
61 * Set O (optional direct characters) consists of the following
62 * characters (note that "\" and "~" are omitted):
63 * !"#$%&*;<=>@[]^_`{|}
64 *
65 * According to the rules in RFC 2152, the byte values for the following
66 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
67 * - all C0 control codes except for CR LF TAB
68 * - BACKSLASH
69 * - TILDE
70 * - DEL
71 * - all codes beyond US-ASCII, i.e. all >127
72 */
/*
 * Character-class tests for US-ASCII byte values, as described in the
 * comment above. Each (uint8_t)((c)-base)<count test is a single-comparison
 * range check: values below base wrap around to large unsigned numbers.
 * The macros evaluate their argument more than once, so do not pass
 * expressions with side effects.
 */

/* TRUE if c is in set D: letters, digits and '(),-./:? */
#define inSetD(c) \
    ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \
     (uint8_t)((c)-48)<10 ||    /* digits */ \
     (uint8_t)((c)-39)<3 ||     /* '() */ \
     (uint8_t)((c)-44)<4 ||     /* ,-./ */ \
     (c)==58 || (c)==63         /* :? */ \
    )

/* TRUE if c is in set O: !"#$%&*;<=>@[]^_`{|} */
#define inSetO(c) \
    ((uint8_t)((c)-33)<6 ||         /* !"#$%& */ \
     (uint8_t)((c)-59)<4 ||         /* ;<=> */ \
     (uint8_t)((c)-93)<4 ||         /* ]^_` */ \
     (uint8_t)((c)-123)<3 ||        /* {|} */ \
     (c)==42 || (c)==64 || (c)==91  /* *@[ */ \
    )

/* white space that is always legal: CR LF TAB, optionally also SPACE */
#define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9)
#define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9)

/* US-ASCII code points of the characters with special meaning in UTF-7 */
#define PLUS 43
#define MINUS 45
#define BACKSLASH 92
#define TILDE 126

/* legal byte values: all US-ASCII graphic characters from space to before tilde, and CR LF TAB */
#define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
99
100/* encode directly sets D and O and CR LF SP TAB */
101static const UBool encodeDirectlyMaximum[128]={
102 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
103 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
105
106 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
107 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
108
109 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
110 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
111
112 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
113 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
114};
115
116/* encode directly set D and CR LF SP TAB but not set O */
117static const UBool encodeDirectlyRestricted[128]={
118 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
119 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
121
122 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
123 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
124
125 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
126 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
127
128 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
129 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
130};
131
/*
 * Maps a 6-bit value (0..63) to the US-ASCII code of its base64 digit.
 * The entries are numeric code points rather than character literals
 * (presumably so the table stays US-ASCII whatever the compiler's
 * execution character set is - confirm before changing).
 */
static const uint8_t
toBase64[64]={
    /* A-Z */
    65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
    78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
    /* a-z */
    97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
    110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
    /* 0-9 */
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
    /* +/ */
    43, 47
};
145
/*
 * Classifies a US-ASCII byte (0..127) for the base64 decoder:
 *   0..63  the value of a base64 digit
 *   -1     legal in UTF-7 but not a base64 digit
 *   -2     the minus sign
 *   -3     illegal in UTF-7
 */
static const int8_t
fromBase64[128]={
    /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,

    /* general punctuation with + and / and a special value (-2) for - */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
    /* digits */
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,

    /* A-Z */
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,

    /* a-z */
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
};
165
/*
 * converter status values:
 *
 * toUnicodeStatus:
 *     24 inDirectMode (boolean)
 * 23..16 base64Counter (-1..7)
 * 15..0  bits (up to 14 bits incoming base64)
 *
 * fromUnicodeStatus:
 * 31..28 version (0: set O direct  1: set O escaped)
 *     24 inDirectMode (boolean)
 * 23..16 base64Counter (0..2)
 *  7..0  bits (6 bits outgoing base64)
 *
 */
181
f3c0d7a5
A
182U_CDECL_BEGIN
183static void U_CALLCONV
b75a7d8f
A
184_UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
185 if(choice<=UCNV_RESET_TO_UNICODE) {
186 /* reset toUnicode */
187 cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
188 cnv->toULength=0;
189 }
190 if(choice!=UCNV_RESET_TO_UNICODE) {
191 /* reset fromUnicode */
192 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
193 }
194}
195
f3c0d7a5 196static void U_CALLCONV
b75a7d8f 197_UTF7Open(UConverter *cnv,
729e4ab9 198 UConverterLoadArgs *pArgs,
b75a7d8f 199 UErrorCode *pErrorCode) {
f3c0d7a5 200 (void)pArgs;
729e4ab9
A
201 if(UCNV_GET_VERSION(cnv)<=1) {
202 /* TODO(markus): Should just use cnv->options rather than copying the version number. */
203 cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
b75a7d8f
A
204 _UTF7Reset(cnv, UCNV_RESET_BOTH);
205 } else {
206 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
207 }
208}
209
f3c0d7a5 210static void U_CALLCONV
b75a7d8f
A
211_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
212 UErrorCode *pErrorCode) {
213 UConverter *cnv;
214 const uint8_t *source, *sourceLimit;
215 UChar *target;
216 const UChar *targetLimit;
217 int32_t *offsets;
218
219 uint8_t *bytes;
220 uint8_t byteIndex;
221
222 int32_t length, targetCapacity;
223
224 /* UTF-7 state */
225 uint16_t bits;
226 int8_t base64Counter;
227 UBool inDirectMode;
228
229 int8_t base64Value;
230
231 int32_t sourceIndex, nextSourceIndex;
232
233 uint8_t b;
234 /* set up the local pointers */
235 cnv=pArgs->converter;
236
237 source=(const uint8_t *)pArgs->source;
238 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
239 target=pArgs->target;
240 targetLimit=pArgs->targetLimit;
241 offsets=pArgs->offsets;
242 /* get the state machine state */
243 {
244 uint32_t status=cnv->toUnicodeStatus;
245 inDirectMode=(UBool)((status>>24)&1);
246 base64Counter=(int8_t)(status>>16);
247 bits=(uint16_t)status;
248 }
249 bytes=cnv->toUBytes;
250 byteIndex=cnv->toULength;
251
252 /* sourceIndex=-1 if the current character began in the previous buffer */
253 sourceIndex=byteIndex==0 ? 0 : -1;
254 nextSourceIndex=0;
255
b75a7d8f
A
256 if(inDirectMode) {
257directMode:
258 /*
259 * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
260 * with their US-ASCII byte values.
261 * Backslash and Tilde and most control characters are not allowed in UTF-7.
262 * A plus sign starts Unicode (or "escape") Mode.
263 *
264 * In Direct Mode, only the sourceIndex is used.
265 */
266 byteIndex=0;
73c04bcf
A
267 length=(int32_t)(sourceLimit-source);
268 targetCapacity=(int32_t)(targetLimit-target);
b75a7d8f
A
269 if(length>targetCapacity) {
270 length=targetCapacity;
271 }
272 while(length>0) {
273 b=*source++;
274 if(!isLegalUTF7(b)) {
275 /* illegal */
276 bytes[0]=b;
277 byteIndex=1;
374ca955
A
278 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
279 break;
b75a7d8f
A
280 } else if(b!=PLUS) {
281 /* write directly encoded character */
282 *target++=b;
283 if(offsets!=NULL) {
284 *offsets++=sourceIndex++;
285 }
286 } else /* PLUS */ {
287 /* switch to Unicode mode */
288 nextSourceIndex=++sourceIndex;
289 inDirectMode=FALSE;
290 byteIndex=0;
291 bits=0;
292 base64Counter=-1;
293 goto unicodeMode;
294 }
295 --length;
296 }
297 if(source<sourceLimit && target>=targetLimit) {
298 /* target is full */
299 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
300 }
301 } else {
302unicodeMode:
303 /*
304 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
305 * The base64 sequence ends with any character that is not in the base64 alphabet.
306 * A terminating minus sign is consumed.
307 *
308 * In Unicode Mode, the sourceIndex has the index to the start of the current
309 * base64 bytes, while nextSourceIndex is precisely parallel to source,
310 * keeping the index to the following byte.
311 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
312 */
313 while(source<sourceLimit) {
314 if(target<targetLimit) {
315 bytes[byteIndex++]=b=*source++;
316 ++nextSourceIndex;
729e4ab9
A
317 base64Value = -3; /* initialize as illegal */
318 if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
319 /* either
320 * base64Value==-1 for any legal character except base64 and minus sign, or
321 * base64Value==-3 for illegal characters:
322 * 1. In either case, leave Unicode mode.
323 * 2.1. If we ended with an incomplete UChar or none after the +, then
324 * generate an error for the preceding erroneous sequence and deal with
325 * the current (possibly illegal) character next time through.
326 * 2.2. Else the current char comes after a complete UChar, which was already
327 * pushed to the output buf, so:
328 * 2.2.1. If the current char is legal, just save it for processing next time.
329 * It may be for example, a plus which we need to deal with in direct mode.
330 * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
331 */
b75a7d8f 332 inDirectMode=TRUE;
729e4ab9
A
333 if(base64Counter==-1) {
334 /* illegal: + immediately followed by something other than base64 or minus sign */
335 /* include the plus sign in the reported sequence, but not the subsequent char */
336 --source;
337 bytes[0]=PLUS;
338 byteIndex=1;
339 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
340 break;
341 } else if(bits!=0) {
342 /* bits are illegally left over, a UChar is incomplete */
343 /* don't include current char (legal or illegal) in error seq */
344 --source;
345 --byteIndex;
346 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
347 break;
348 } else {
349 /* previous UChar was complete */
4388f060 350 if(base64Value==-3) {
729e4ab9
A
351 /* current character is illegal, deal with it here */
352 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
353 break;
354 } else {
355 /* un-read the current character in case it is a plus sign */
356 --source;
357 sourceIndex=nextSourceIndex-1;
358 goto directMode;
359 }
360 }
361 } else if(base64Value>=0) {
b75a7d8f
A
362 /* collect base64 bytes into UChars */
363 switch(base64Counter) {
364 case -1: /* -1 is immediately after the + */
365 case 0:
366 bits=base64Value;
367 base64Counter=1;
368 break;
369 case 1:
370 case 3:
371 case 4:
372 case 6:
373 bits=(uint16_t)((bits<<6)|base64Value);
374 ++base64Counter;
375 break;
376 case 2:
377 *target++=(UChar)((bits<<4)|(base64Value>>2));
378 if(offsets!=NULL) {
379 *offsets++=sourceIndex;
380 sourceIndex=nextSourceIndex-1;
381 }
382 bytes[0]=b; /* keep this byte in case an error occurs */
383 byteIndex=1;
384 bits=(uint16_t)(base64Value&3);
385 base64Counter=3;
386 break;
387 case 5:
388 *target++=(UChar)((bits<<2)|(base64Value>>4));
389 if(offsets!=NULL) {
390 *offsets++=sourceIndex;
391 sourceIndex=nextSourceIndex-1;
392 }
393 bytes[0]=b; /* keep this byte in case an error occurs */
394 byteIndex=1;
395 bits=(uint16_t)(base64Value&15);
396 base64Counter=6;
397 break;
398 case 7:
399 *target++=(UChar)((bits<<6)|base64Value);
400 if(offsets!=NULL) {
401 *offsets++=sourceIndex;
402 sourceIndex=nextSourceIndex;
403 }
404 byteIndex=0;
405 bits=0;
406 base64Counter=0;
407 break;
408 default:
409 /* will never occur */
410 break;
411 }
729e4ab9 412 } else /*base64Value==-2*/ {
b75a7d8f
A
413 /* minus sign terminates the base64 sequence */
414 inDirectMode=TRUE;
415 if(base64Counter==-1) {
416 /* +- i.e. a minus immediately following a plus */
417 *target++=PLUS;
418 if(offsets!=NULL) {
419 *offsets++=sourceIndex-1;
420 }
421 } else {
422 /* absorb the minus and leave the Unicode Mode */
423 if(bits!=0) {
424 /* bits are illegally left over, a UChar is incomplete */
374ca955
A
425 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
426 break;
b75a7d8f
A
427 }
428 }
429 sourceIndex=nextSourceIndex;
430 goto directMode;
b75a7d8f
A
431 }
432 } else {
433 /* target is full */
434 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
435 break;
436 }
437 }
438 }
b75a7d8f 439
374ca955
A
440 if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
441 /*
442 * if we are in Unicode mode, then the byteIndex might not be 0,
443 * but that is ok if bits==0
444 * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
445 * (not true for IMAP-mailbox-name where we must end in direct mode)
446 */
447 byteIndex=0;
b75a7d8f
A
448 }
449
374ca955
A
450 /* set the converter state back into UConverter */
451 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
452 cnv->toULength=byteIndex;
453
b75a7d8f
A
454 /* write back the updated pointers */
455 pArgs->source=(const char *)source;
456 pArgs->target=target;
457 pArgs->offsets=offsets;
458 return;
b75a7d8f
A
459}
460
f3c0d7a5 461static void U_CALLCONV
b75a7d8f
A
462_UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
463 UErrorCode *pErrorCode) {
464 UConverter *cnv;
465 const UChar *source, *sourceLimit;
466 uint8_t *target, *targetLimit;
467 int32_t *offsets;
468
469 int32_t length, targetCapacity, sourceIndex;
470 UChar c;
471
472 /* UTF-7 state */
473 const UBool *encodeDirectly;
474 uint8_t bits;
475 int8_t base64Counter;
476 UBool inDirectMode;
477
478 /* set up the local pointers */
479 cnv=pArgs->converter;
480
481 /* set up the local pointers */
482 source=pArgs->source;
483 sourceLimit=pArgs->sourceLimit;
484 target=(uint8_t *)pArgs->target;
485 targetLimit=(uint8_t *)pArgs->targetLimit;
486 offsets=pArgs->offsets;
487
488 /* get the state machine state */
489 {
490 uint32_t status=cnv->fromUnicodeStatus;
491 encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
492 inDirectMode=(UBool)((status>>24)&1);
493 base64Counter=(int8_t)(status>>16);
494 bits=(uint8_t)status;
2ca993e8 495 U_ASSERT(bits<=UPRV_LENGTHOF(toBase64));
b75a7d8f
A
496 }
497
498 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
499 sourceIndex=0;
500
501 if(inDirectMode) {
502directMode:
73c04bcf
A
503 length=(int32_t)(sourceLimit-source);
504 targetCapacity=(int32_t)(targetLimit-target);
b75a7d8f
A
505 if(length>targetCapacity) {
506 length=targetCapacity;
507 }
508 while(length>0) {
509 c=*source++;
510 /* currently always encode CR LF SP TAB directly */
511 if(c<=127 && encodeDirectly[c]) {
512 /* encode directly */
513 *target++=(uint8_t)c;
514 if(offsets!=NULL) {
515 *offsets++=sourceIndex++;
516 }
517 } else if(c==PLUS) {
518 /* output +- for + */
519 *target++=PLUS;
520 if(target<targetLimit) {
521 *target++=MINUS;
522 if(offsets!=NULL) {
523 *offsets++=sourceIndex;
524 *offsets++=sourceIndex++;
525 }
526 /* realign length and targetCapacity */
527 goto directMode;
528 } else {
529 if(offsets!=NULL) {
530 *offsets++=sourceIndex++;
531 }
532 cnv->charErrorBuffer[0]=MINUS;
533 cnv->charErrorBufferLength=1;
534 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
535 break;
536 }
537 } else {
538 /* un-read this character and switch to Unicode Mode */
539 --source;
540 *target++=PLUS;
541 if(offsets!=NULL) {
542 *offsets++=sourceIndex;
543 }
544 inDirectMode=FALSE;
545 base64Counter=0;
546 goto unicodeMode;
547 }
548 --length;
549 }
550 if(source<sourceLimit && target>=targetLimit) {
551 /* target is full */
552 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
553 }
554 } else {
555unicodeMode:
556 while(source<sourceLimit) {
557 if(target<targetLimit) {
558 c=*source++;
559 if(c<=127 && encodeDirectly[c]) {
560 /* encode directly */
561 inDirectMode=TRUE;
562
563 /* trick: back out this character to make this easier */
564 --source;
565
566 /* terminate the base64 sequence */
567 if(base64Counter!=0) {
568 /* write remaining bits for the previous character */
569 *target++=toBase64[bits];
570 if(offsets!=NULL) {
571 *offsets++=sourceIndex-1;
572 }
573 }
574 if(fromBase64[c]!=-1) {
575 /* need to terminate with a minus */
576 if(target<targetLimit) {
577 *target++=MINUS;
578 if(offsets!=NULL) {
579 *offsets++=sourceIndex-1;
580 }
581 } else {
582 cnv->charErrorBuffer[0]=MINUS;
583 cnv->charErrorBufferLength=1;
584 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
585 break;
586 }
587 }
588 goto directMode;
589 } else {
590 /*
591 * base64 this character:
592 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
593 * and the bits of this character, each implicitly in UTF-16BE.
594 *
595 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
596 * character to the next. The actual 2 or 4 bits are shifted to the left edge
597 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
598 */
599 switch(base64Counter) {
600 case 0:
601 *target++=toBase64[c>>10];
602 if(target<targetLimit) {
603 *target++=toBase64[(c>>4)&0x3f];
604 if(offsets!=NULL) {
605 *offsets++=sourceIndex;
606 *offsets++=sourceIndex++;
607 }
608 } else {
609 if(offsets!=NULL) {
610 *offsets++=sourceIndex++;
611 }
612 cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
613 cnv->charErrorBufferLength=1;
614 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
615 }
616 bits=(uint8_t)((c&15)<<2);
617 base64Counter=1;
618 break;
619 case 1:
620 *target++=toBase64[bits|(c>>14)];
621 if(target<targetLimit) {
622 *target++=toBase64[(c>>8)&0x3f];
623 if(target<targetLimit) {
624 *target++=toBase64[(c>>2)&0x3f];
625 if(offsets!=NULL) {
626 *offsets++=sourceIndex;
627 *offsets++=sourceIndex;
628 *offsets++=sourceIndex++;
629 }
630 } else {
631 if(offsets!=NULL) {
632 *offsets++=sourceIndex;
633 *offsets++=sourceIndex++;
634 }
635 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
636 cnv->charErrorBufferLength=1;
637 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
638 }
639 } else {
640 if(offsets!=NULL) {
641 *offsets++=sourceIndex++;
642 }
643 cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
644 cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
645 cnv->charErrorBufferLength=2;
646 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
647 }
648 bits=(uint8_t)((c&3)<<4);
649 base64Counter=2;
650 break;
651 case 2:
652 *target++=toBase64[bits|(c>>12)];
653 if(target<targetLimit) {
654 *target++=toBase64[(c>>6)&0x3f];
655 if(target<targetLimit) {
656 *target++=toBase64[c&0x3f];
657 if(offsets!=NULL) {
658 *offsets++=sourceIndex;
659 *offsets++=sourceIndex;
660 *offsets++=sourceIndex++;
661 }
662 } else {
663 if(offsets!=NULL) {
664 *offsets++=sourceIndex;
665 *offsets++=sourceIndex++;
666 }
667 cnv->charErrorBuffer[0]=toBase64[c&0x3f];
668 cnv->charErrorBufferLength=1;
669 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
670 }
671 } else {
672 if(offsets!=NULL) {
673 *offsets++=sourceIndex++;
674 }
675 cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
676 cnv->charErrorBuffer[1]=toBase64[c&0x3f];
677 cnv->charErrorBufferLength=2;
678 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
679 }
680 bits=0;
681 base64Counter=0;
682 break;
683 default:
684 /* will never occur */
685 break;
686 }
687 }
688 } else {
689 /* target is full */
690 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
691 break;
692 }
693 }
694 }
695
696 if(pArgs->flush && source>=sourceLimit) {
697 /* flush remaining bits to the target */
4388f060
A
698 if(!inDirectMode) {
699 if (base64Counter!=0) {
700 if(target<targetLimit) {
701 *target++=toBase64[bits];
702 if(offsets!=NULL) {
703 *offsets++=sourceIndex-1;
704 }
705 } else {
706 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
707 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
708 }
709 }
710 /* Add final MINUS to terminate unicodeMode */
b75a7d8f 711 if(target<targetLimit) {
4388f060 712 *target++=MINUS;
b75a7d8f
A
713 if(offsets!=NULL) {
714 *offsets++=sourceIndex-1;
715 }
716 } else {
4388f060 717 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
b75a7d8f
A
718 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
719 }
720 }
721 /* reset the state for the next conversion */
722 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
723 } else {
724 /* set the converter state back into UConverter */
725 cnv->fromUnicodeStatus=
726 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
727 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
728 }
729
730 /* write back the updated pointers */
731 pArgs->source=source;
732 pArgs->target=(char *)target;
733 pArgs->offsets=offsets;
734 return;
735}
736
f3c0d7a5 737static const char * U_CALLCONV
b75a7d8f
A
738_UTF7GetName(const UConverter *cnv) {
739 switch(cnv->fromUnicodeStatus>>28) {
740 case 1:
741 return "UTF-7,version=1";
742 default:
743 return "UTF-7";
744 }
745}
f3c0d7a5 746U_CDECL_END
b75a7d8f
A
747
748static const UConverterImpl _UTF7Impl={
749 UCNV_UTF7,
750
751 NULL,
752 NULL,
753
754 _UTF7Open,
755 NULL,
756 _UTF7Reset,
757
758 _UTF7ToUnicodeWithOffsets,
759 _UTF7ToUnicodeWithOffsets,
760 _UTF7FromUnicodeWithOffsets,
761 _UTF7FromUnicodeWithOffsets,
374ca955 762 NULL,
b75a7d8f
A
763
764 NULL,
765 _UTF7GetName,
766 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
767 NULL,
f3c0d7a5
A
768 ucnv_getCompleteUnicodeSet,
769
770 NULL,
771 NULL
b75a7d8f
A
772};
773
774static const UConverterStaticData _UTF7StaticData={
775 sizeof(UConverterStaticData),
776 "UTF-7",
777 0, /* TODO CCSID for UTF-7 */
778 UCNV_IBM, UCNV_UTF7,
779 1, 4,
780 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
781 FALSE, FALSE,
782 0,
783 0,
784 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
785};
786
2ca993e8
A
787const UConverterSharedData _UTF7Data=
788 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
b75a7d8f
A
789
790/* IMAP mailbox name encoding ----------------------------------------------- */
791
792/*
793 * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
794 * http://www.ietf.org/rfc/rfc2060.txt
795 *
796 * 5.1.3. Mailbox International Naming Convention
797 *
798 * By convention, international mailbox names are specified using a
799 * modified version of the UTF-7 encoding described in [UTF-7]. The
800 * purpose of these modifications is to correct the following problems
801 * with UTF-7:
802 *
803 * 1) UTF-7 uses the "+" character for shifting; this conflicts with
804 * the common use of "+" in mailbox names, in particular USENET
805 * newsgroup names.
806 *
807 * 2) UTF-7's encoding is BASE64 which uses the "/" character; this
808 * conflicts with the use of "/" as a popular hierarchy delimiter.
809 *
810 * 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
811 * the use of "\" as a popular hierarchy delimiter.
812 *
813 * 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
814 * the use of "~" in some servers as a home directory indicator.
815 *
816 * 5) UTF-7 permits multiple alternate forms to represent the same
817 * string; in particular, printable US-ASCII chararacters can be
818 * represented in encoded form.
819 *
820 * In modified UTF-7, printable US-ASCII characters except for "&"
821 * represent themselves; that is, characters with octet values 0x20-0x25
822 * and 0x27-0x7e. The character "&" (0x26) is represented by the two-
823 * octet sequence "&-".
824 *
825 * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
826 * Unicode 16-bit octets) are represented in modified BASE64, with a
827 * further modification from [UTF-7] that "," is used instead of "/".
828 * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
829 * character which can represent itself.
830 *
831 * "&" is used to shift to modified BASE64 and "-" to shift back to US-
832 * ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that
833 * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
834 * ").
835 *
836 * For example, here is a mailbox name which mixes English, Japanese,
837 * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
838 */
839
840/*
841 * Tests for US-ASCII characters belonging to character classes
842 * defined in UTF-7.
843 *
844 * Set D (directly encoded characters) consists of the following
845 * characters: the upper and lower case letters A through Z
846 * and a through z, the 10 digits 0-9, and the following nine special
847 * characters (note that "+" and "=" are omitted):
848 * '(),-./:?
849 *
850 * Set O (optional direct characters) consists of the following
851 * characters (note that "\" and "~" are omitted):
852 * !"#$%&*;<=>@[]^_`{|}
853 *
854 * According to the rules in RFC 2152, the byte values for the following
855 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
856 * - all C0 control codes except for CR LF TAB
857 * - BACKSLASH
858 * - TILDE
859 * - DEL
860 * - all codes beyond US-ASCII, i.e. all >127
861 */
862
/*
 * Character constants and class tests for the IMAP mailbox name encoding.
 * The macros evaluate their argument more than once, so do not pass
 * expressions with side effects.
 */

/* uses '&' not '+' to start a base64 sequence */
#define AMPERSAND 0x26
#define COMMA 0x2c
#define SLASH 0x2f

/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/* direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

/* like toBase64[] and fromBase64[], except that value 63 is ',' instead of '/' */
#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[(n)] : COMMA)
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[(c)])
876
877/*
878 * converter status values:
879 *
880 * toUnicodeStatus:
881 * 24 inDirectMode (boolean)
882 * 23..16 base64Counter (-1..7)
883 * 15..0 bits (up to 14 bits incoming base64)
884 *
885 * fromUnicodeStatus:
886 * 24 inDirectMode (boolean)
887 * 23..16 base64Counter (0..2)
888 * 7..0 bits (6 bits outgoing base64)
889 *
890 * ignore bits 31..25
891 */
892
f3c0d7a5
A
893U_CDECL_BEGIN
894static void U_CALLCONV
b75a7d8f
A
895_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
896 UErrorCode *pErrorCode) {
897 UConverter *cnv;
898 const uint8_t *source, *sourceLimit;
899 UChar *target;
900 const UChar *targetLimit;
901 int32_t *offsets;
902
903 uint8_t *bytes;
904 uint8_t byteIndex;
905
906 int32_t length, targetCapacity;
907
908 /* UTF-7 state */
909 uint16_t bits;
910 int8_t base64Counter;
911 UBool inDirectMode;
912
913 int8_t base64Value;
914
915 int32_t sourceIndex, nextSourceIndex;
916
917 UChar c;
918 uint8_t b;
919
920 /* set up the local pointers */
921 cnv=pArgs->converter;
922
923 source=(const uint8_t *)pArgs->source;
924 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
925 target=pArgs->target;
926 targetLimit=pArgs->targetLimit;
927 offsets=pArgs->offsets;
928 /* get the state machine state */
929 {
930 uint32_t status=cnv->toUnicodeStatus;
931 inDirectMode=(UBool)((status>>24)&1);
932 base64Counter=(int8_t)(status>>16);
933 bits=(uint16_t)status;
934 }
935 bytes=cnv->toUBytes;
936 byteIndex=cnv->toULength;
937
938 /* sourceIndex=-1 if the current character began in the previous buffer */
939 sourceIndex=byteIndex==0 ? 0 : -1;
940 nextSourceIndex=0;
941
b75a7d8f
A
942 if(inDirectMode) {
943directMode:
944 /*
945 * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
946 * with their US-ASCII byte values.
947 * An ampersand starts Unicode (or "escape") Mode.
948 *
949 * In Direct Mode, only the sourceIndex is used.
950 */
951 byteIndex=0;
73c04bcf
A
952 length=(int32_t)(sourceLimit-source);
953 targetCapacity=(int32_t)(targetLimit-target);
b75a7d8f
A
954 if(length>targetCapacity) {
955 length=targetCapacity;
956 }
957 while(length>0) {
958 b=*source++;
959 if(!isLegalIMAP(b)) {
960 /* illegal */
961 bytes[0]=b;
962 byteIndex=1;
374ca955
A
963 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
964 break;
b75a7d8f
A
965 } else if(b!=AMPERSAND) {
966 /* write directly encoded character */
967 *target++=b;
968 if(offsets!=NULL) {
969 *offsets++=sourceIndex++;
970 }
971 } else /* AMPERSAND */ {
972 /* switch to Unicode mode */
973 nextSourceIndex=++sourceIndex;
974 inDirectMode=FALSE;
975 byteIndex=0;
976 bits=0;
977 base64Counter=-1;
978 goto unicodeMode;
979 }
980 --length;
981 }
982 if(source<sourceLimit && target>=targetLimit) {
983 /* target is full */
984 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
985 }
986 } else {
987unicodeMode:
988 /*
989 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
990 * The base64 sequence ends with any character that is not in the base64 alphabet.
991 * A terminating minus sign is consumed.
992 * US-ASCII must not be base64-ed.
993 *
994 * In Unicode Mode, the sourceIndex has the index to the start of the current
995 * base64 bytes, while nextSourceIndex is precisely parallel to source,
996 * keeping the index to the following byte.
997 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
998 */
999 while(source<sourceLimit) {
1000 if(target<targetLimit) {
1001 bytes[byteIndex++]=b=*source++;
1002 ++nextSourceIndex;
1003 if(b>0x7e) {
1004 /* illegal - test other illegal US-ASCII values by base64Value==-3 */
1005 inDirectMode=TRUE;
374ca955
A
1006 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1007 break;
b75a7d8f
A
1008 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
1009 /* collect base64 bytes into UChars */
1010 switch(base64Counter) {
1011 case -1: /* -1 is immediately after the & */
1012 case 0:
1013 bits=base64Value;
1014 base64Counter=1;
1015 break;
1016 case 1:
1017 case 3:
1018 case 4:
1019 case 6:
1020 bits=(uint16_t)((bits<<6)|base64Value);
1021 ++base64Counter;
1022 break;
1023 case 2:
1024 c=(UChar)((bits<<4)|(base64Value>>2));
1025 if(isLegalIMAP(c)) {
1026 /* illegal */
1027 inDirectMode=TRUE;
374ca955
A
1028 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1029 goto endloop;
b75a7d8f
A
1030 }
1031 *target++=c;
1032 if(offsets!=NULL) {
1033 *offsets++=sourceIndex;
1034 sourceIndex=nextSourceIndex-1;
1035 }
1036 bytes[0]=b; /* keep this byte in case an error occurs */
1037 byteIndex=1;
1038 bits=(uint16_t)(base64Value&3);
1039 base64Counter=3;
1040 break;
1041 case 5:
1042 c=(UChar)((bits<<2)|(base64Value>>4));
1043 if(isLegalIMAP(c)) {
1044 /* illegal */
1045 inDirectMode=TRUE;
374ca955
A
1046 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1047 goto endloop;
b75a7d8f
A
1048 }
1049 *target++=c;
1050 if(offsets!=NULL) {
1051 *offsets++=sourceIndex;
1052 sourceIndex=nextSourceIndex-1;
1053 }
1054 bytes[0]=b; /* keep this byte in case an error occurs */
1055 byteIndex=1;
1056 bits=(uint16_t)(base64Value&15);
1057 base64Counter=6;
1058 break;
1059 case 7:
1060 c=(UChar)((bits<<6)|base64Value);
1061 if(isLegalIMAP(c)) {
1062 /* illegal */
1063 inDirectMode=TRUE;
374ca955
A
1064 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1065 goto endloop;
b75a7d8f
A
1066 }
1067 *target++=c;
1068 if(offsets!=NULL) {
1069 *offsets++=sourceIndex;
1070 sourceIndex=nextSourceIndex;
1071 }
1072 byteIndex=0;
1073 bits=0;
1074 base64Counter=0;
1075 break;
1076 default:
1077 /* will never occur */
1078 break;
1079 }
1080 } else if(base64Value==-2) {
1081 /* minus sign terminates the base64 sequence */
1082 inDirectMode=TRUE;
1083 if(base64Counter==-1) {
1084 /* &- i.e. a minus immediately following an ampersand */
1085 *target++=AMPERSAND;
1086 if(offsets!=NULL) {
1087 *offsets++=sourceIndex-1;
1088 }
1089 } else {
1090 /* absorb the minus and leave the Unicode Mode */
1091 if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1092 /* bits are illegally left over, a UChar is incomplete */
1093 /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
374ca955
A
1094 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1095 break;
b75a7d8f
A
1096 }
1097 }
1098 sourceIndex=nextSourceIndex;
1099 goto directMode;
1100 } else {
1101 if(base64Counter==-1) {
1102 /* illegal: & immediately followed by something other than base64 or minus sign */
1103 /* include the ampersand in the reported sequence */
1104 --sourceIndex;
1105 bytes[0]=AMPERSAND;
1106 bytes[1]=b;
1107 byteIndex=2;
1108 }
1109 /* base64Value==-1 for characters that are illegal only in Unicode mode */
1110 /* base64Value==-3 for illegal characters */
1111 /* illegal */
1112 inDirectMode=TRUE;
374ca955
A
1113 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1114 break;
b75a7d8f
A
1115 }
1116 } else {
1117 /* target is full */
1118 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1119 break;
1120 }
1121 }
1122 }
1123endloop:
1124
374ca955
A
1125 /*
1126 * the end of the input stream and detection of truncated input
1127 * are handled by the framework, but here we must check if we are in Unicode
1128 * mode and byteIndex==0 because we must end in direct mode
1129 *
1130 * conditions:
1131 * successful
1132 * in Unicode mode and byteIndex==0
1133 * end of input and no truncated input
1134 */
1135 if( U_SUCCESS(*pErrorCode) &&
1136 !inDirectMode && byteIndex==0 &&
1137 pArgs->flush && source>=sourceLimit
1138 ) {
1139 if(base64Counter==-1) {
1140 /* & at the very end of the input */
1141 /* make the ampersand the reported sequence */
1142 bytes[0]=AMPERSAND;
1143 byteIndex=1;
b75a7d8f 1144 }
374ca955
A
1145 /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1146
1147 inDirectMode=TRUE; /* avoid looping */
1148 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
b75a7d8f
A
1149 }
1150
374ca955
A
1151 /* set the converter state back into UConverter */
1152 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1153 cnv->toULength=byteIndex;
1154
b75a7d8f
A
1155 /* write back the updated pointers */
1156 pArgs->source=(const char *)source;
1157 pArgs->target=target;
1158 pArgs->offsets=offsets;
1159 return;
b75a7d8f
A
1160}
1161
/*
 * _IMAPFromUnicodeWithOffsets: encodes the UTF-16 code units in
 * pArgs->source..pArgs->sourceLimit as bytes of the IMAP variant of UTF-7
 * and writes them to pArgs->target..pArgs->targetLimit.
 *
 * pArgs      - in/out: source, target and offsets are advanced and written
 *              back at the end; converter supplies and receives the state;
 *              flush asks for a pending base64 run to be closed once all
 *              source has been consumed.
 * pErrorCode - set to U_BUFFER_OVERFLOW_ERROR when the target fills up; the
 *              not-yet-written bytes of the current sequence are stored in
 *              cnv->charErrorBuffer.
 *
 * The state is packed in cnv->fromUnicodeStatus: bit 24 is inDirectMode,
 * bits 23..16 are base64Counter, bits 7..0 are the leftover bits; the top
 * four bits are carried over unchanged.
 * If offsets is not NULL, one source index is written per output byte.
 */
f3c0d7a5 1162static void U_CALLCONV
b75a7d8f
A
1163_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1164 UErrorCode *pErrorCode) {
1165 UConverter *cnv;
1166 const UChar *source, *sourceLimit;
1167 uint8_t *target, *targetLimit;
1168 int32_t *offsets;
1169
1170 int32_t length, targetCapacity, sourceIndex;
1171 UChar c;
1172 uint8_t b;
1173
1174 /* UTF-7 state: leftover bits, position in the base64 cycle (0, 1 or 2), and the current mode */
1175 uint8_t bits;
1176 int8_t base64Counter;
1177 UBool inDirectMode;
1178
1179 /* get the converter that holds the persistent state and the overflow buffer */
1180 cnv=pArgs->converter;
1181
1182 /* set up the local source/target/offsets pointers */
1183 source=pArgs->source;
1184 sourceLimit=pArgs->sourceLimit;
1185 target=(uint8_t *)pArgs->target;
1186 targetLimit=(uint8_t *)pArgs->targetLimit;
1187 offsets=pArgs->offsets;
1188
1189 /* get the state machine state: bit 24=inDirectMode, bits 23..16=base64Counter, bits 7..0=bits */
1190 {
1191 uint32_t status=cnv->fromUnicodeStatus;
1192 inDirectMode=(UBool)((status>>24)&1);
1193 base64Counter=(int8_t)(status>>16);
1194 bits=(uint8_t)status;
1195 }
1196
1197 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1198 sourceIndex=0;
1199
1200 if(inDirectMode) {
1201directMode:
     /*
      * Direct mode: each consumed UChar produces at least one byte, so the
      * loop runs for at most min(remaining source, remaining target) steps.
      */
73c04bcf
A
1202 length=(int32_t)(sourceLimit-source);
1203 targetCapacity=(int32_t)(targetLimit-target);
b75a7d8f
A
1204 if(length>targetCapacity) {
1205 length=targetCapacity;
1206 }
1207 while(length>0) {
1208 c=*source++;
1209 /* encode 0x20..0x7e except '&' directly */
1210 if(inSetDIMAP(c)) {
1211 /* encode directly */
1212 *target++=(uint8_t)c;
1213 if(offsets!=NULL) {
1214 *offsets++=sourceIndex++;
1215 }
1216 } else if(c==AMPERSAND) {
1217 /* output &- for & */
1218 *target++=AMPERSAND;
1219 if(target<targetLimit) {
1220 *target++=MINUS;
1221 if(offsets!=NULL) {
     /* both output bytes map to the same source UChar */
1222 *offsets++=sourceIndex;
1223 *offsets++=sourceIndex++;
1224 }
1225 /* realign length and targetCapacity */
     /* (two bytes were written for one UChar, so the precomputed length is stale) */
1226 goto directMode;
1227 } else {
     /* the '&' fit but the '-' did not: store the '-' in the converter's overflow buffer */
1228 if(offsets!=NULL) {
1229 *offsets++=sourceIndex++;
1230 }
1231 cnv->charErrorBuffer[0]=MINUS;
1232 cnv->charErrorBufferLength=1;
1233 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1234 break;
1235 }
1236 } else {
1237 /* un-read this character and switch to Unicode Mode */
     /* the '&' written here opens the base64 run; the character itself is encoded in unicodeMode */
1238 --source;
1239 *target++=AMPERSAND;
1240 if(offsets!=NULL) {
1241 *offsets++=sourceIndex;
1242 }
1243 inDirectMode=FALSE;
1244 base64Counter=0;
1245 goto unicodeMode;
1246 }
1247 --length;
1248 }
1249 if(source<sourceLimit && target>=targetLimit) {
1250 /* target is full */
1251 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1252 }
1253 } else {
1254unicodeMode:
     /* Unicode mode: UChars are base64-encoded until one that isLegalIMAP() accepts shows up */
1255 while(source<sourceLimit) {
1256 if(target<targetLimit) {
1257 c=*source++;
1258 if(isLegalIMAP(c)) {
1259 /* encode directly */
1260 inDirectMode=TRUE;
1261
1262 /* trick: back out this character to make this easier */
     /* (it is re-read and written by the direct-mode loop) */
1263 --source;
1264
1265 /* terminate the base64 sequence */
1266 if(base64Counter!=0) {
1267 /* write remaining bits for the previous character */
1268 *target++=TO_BASE64_IMAP(bits);
1269 if(offsets!=NULL) {
     /* sourceIndex was already advanced past the previous character, hence -1 */
1270 *offsets++=sourceIndex-1;
1271 }
1272 }
1273 /* need to terminate with a minus */
1274 if(target<targetLimit) {
1275 *target++=MINUS;
1276 if(offsets!=NULL) {
1277 *offsets++=sourceIndex-1;
1278 }
1279 } else {
     /* no room for the terminating '-': store it in the overflow buffer and stop */
1280 cnv->charErrorBuffer[0]=MINUS;
1281 cnv->charErrorBufferLength=1;
1282 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1283 break;
1284 }
1285 goto directMode;
1286 } else {
1287 /*
1288 * base64 this character:
1289 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1290 * and the bits of this character, each implicitly in UTF-16BE.
1291 *
1292 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1293 * character to the next. The actual 2 or 4 bits are shifted to the left edge
1294 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
      *
      * In every case below, when the target fills up part-way, the bytes that
      * did not fit go into cnv->charErrorBuffer and the state is still advanced;
      * the next loop iteration then stops at the "target is full" branch.
1295 */
1296 switch(base64Counter) {
1297 case 0:
     /* no leftover bits: emit c bits 15..10 and 9..4, keep the low 4 bits */
1298 b=(uint8_t)(c>>10);
1299 *target++=TO_BASE64_IMAP(b);
1300 if(target<targetLimit) {
1301 b=(uint8_t)((c>>4)&0x3f);
1302 *target++=TO_BASE64_IMAP(b);
1303 if(offsets!=NULL) {
1304 *offsets++=sourceIndex;
1305 *offsets++=sourceIndex++;
1306 }
1307 } else {
1308 if(offsets!=NULL) {
1309 *offsets++=sourceIndex++;
1310 }
1311 b=(uint8_t)((c>>4)&0x3f);
1312 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1313 cnv->charErrorBufferLength=1;
1314 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1315 }
1316 bits=(uint8_t)((c&15)<<2);
1317 base64Counter=1;
1318 break;
1319 case 1:
     /* 4 leftover bits: emit them with c bits 15..14, then bits 13..8 and 7..2, keep the low 2 bits */
1320 b=(uint8_t)(bits|(c>>14));
1321 *target++=TO_BASE64_IMAP(b);
1322 if(target<targetLimit) {
1323 b=(uint8_t)((c>>8)&0x3f);
1324 *target++=TO_BASE64_IMAP(b);
1325 if(target<targetLimit) {
1326 b=(uint8_t)((c>>2)&0x3f);
1327 *target++=TO_BASE64_IMAP(b);
1328 if(offsets!=NULL) {
1329 *offsets++=sourceIndex;
1330 *offsets++=sourceIndex;
1331 *offsets++=sourceIndex++;
1332 }
1333 } else {
     /* two of three bytes fit; the third goes to the overflow buffer */
1334 if(offsets!=NULL) {
1335 *offsets++=sourceIndex;
1336 *offsets++=sourceIndex++;
1337 }
1338 b=(uint8_t)((c>>2)&0x3f);
1339 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1340 cnv->charErrorBufferLength=1;
1341 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1342 }
1343 } else {
     /* only the first byte fit; the other two go to the overflow buffer */
1344 if(offsets!=NULL) {
1345 *offsets++=sourceIndex++;
1346 }
1347 b=(uint8_t)((c>>8)&0x3f);
1348 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1349 b=(uint8_t)((c>>2)&0x3f);
1350 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1351 cnv->charErrorBufferLength=2;
1352 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1353 }
1354 bits=(uint8_t)((c&3)<<4);
1355 base64Counter=2;
1356 break;
1357 case 2:
     /* 2 leftover bits: emit them with c bits 15..12, then bits 11..6 and 5..0; nothing is left over */
1358 b=(uint8_t)(bits|(c>>12));
1359 *target++=TO_BASE64_IMAP(b);
1360 if(target<targetLimit) {
1361 b=(uint8_t)((c>>6)&0x3f);
1362 *target++=TO_BASE64_IMAP(b);
1363 if(target<targetLimit) {
1364 b=(uint8_t)(c&0x3f);
1365 *target++=TO_BASE64_IMAP(b);
1366 if(offsets!=NULL) {
1367 *offsets++=sourceIndex;
1368 *offsets++=sourceIndex;
1369 *offsets++=sourceIndex++;
1370 }
1371 } else {
     /* two of three bytes fit; the third goes to the overflow buffer */
1372 if(offsets!=NULL) {
1373 *offsets++=sourceIndex;
1374 *offsets++=sourceIndex++;
1375 }
1376 b=(uint8_t)(c&0x3f);
1377 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1378 cnv->charErrorBufferLength=1;
1379 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1380 }
1381 } else {
     /* only the first byte fit; the other two go to the overflow buffer */
1382 if(offsets!=NULL) {
1383 *offsets++=sourceIndex++;
1384 }
1385 b=(uint8_t)((c>>6)&0x3f);
1386 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1387 b=(uint8_t)(c&0x3f);
1388 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1389 cnv->charErrorBufferLength=2;
1390 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1391 }
1392 bits=0;
1393 base64Counter=0;
1394 break;
1395 default:
1396 /* will never occur */
1397 break;
1398 }
1399 }
1400 } else {
1401 /* target is full */
1402 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1403 break;
1404 }
1405 }
1406 }
1407
     /* end of input with flush requested: close any open base64 run and return to the initial state */
1408 if(pArgs->flush && source>=sourceLimit) {
1409 /* flush remaining bits to the target */
1410 if(!inDirectMode) {
1411 if(base64Counter!=0) {
1412 if(target<targetLimit) {
1413 *target++=TO_BASE64_IMAP(bits);
1414 if(offsets!=NULL) {
1415 *offsets++=sourceIndex-1;
1416 }
1417 } else {
     /* append at the current charErrorBufferLength rather than overwriting index 0 */
1418 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1419 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1420 }
1421 }
1422 /* need to terminate with a minus */
1423 if(target<targetLimit) {
1424 *target++=MINUS;
1425 if(offsets!=NULL) {
1426 *offsets++=sourceIndex-1;
1427 }
1428 } else {
1429 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1430 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1431 }
1432 }
1433 /* reset the state for the next conversion */
1434 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1435 } else {
1436 /* set the converter state back into UConverter */
     /* same packing as read at the top: mode in bit 24, counter from bit 16, leftover bits in the low byte */
1437 cnv->fromUnicodeStatus=
1438 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
1439 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1440 }
1441
1442 /* write back the updated pointers */
1443 pArgs->source=source;
1444 pArgs->target=(char *)target;
1445 pArgs->offsets=offsets;
1446 return;
1447}
f3c0d7a5 1448U_CDECL_END
b75a7d8f
A
1449
/*
 * Implementation table for the IMAP mailbox name converter.
 * It reuses _UTF7Open and _UTF7Reset and lists each of the two IMAP
 * conversion functions twice in a row.
 * NOTE(review): entries are positional; the meaning of each slot comes from
 * the UConverterImpl declaration, which is not visible here - confirm there.
 */
1450static const UConverterImpl _IMAPImpl={
1451 UCNV_IMAP_MAILBOX,
1452
1453 NULL,
1454 NULL,
1455
1456 _UTF7Open,
1457 NULL,
1458 _UTF7Reset,
1459
     /* presumably the plain and the with-offsets slots for each direction share one function - confirm */
1460 _IMAPToUnicodeWithOffsets,
1461 _IMAPToUnicodeWithOffsets,
1462 _IMAPFromUnicodeWithOffsets,
1463 _IMAPFromUnicodeWithOffsets,
374ca955 1464 NULL,
b75a7d8f
A
1465
1466 NULL,
1467 NULL,
1468 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1469 NULL,
f3c0d7a5
A
1470 ucnv_getCompleteUnicodeSet,
1471 NULL,
1472 NULL
b75a7d8f
A
1473};
1474
/*
 * Static descriptor for the converter named "IMAP-mailbox-name".
 * NOTE(review): fields are positional; their meanings come from the
 * UConverterStaticData declaration, which is not visible here - confirm there.
 */
1475static const UConverterStaticData _IMAPStaticData={
1476 sizeof(UConverterStaticData),
1477 "IMAP-mailbox-name",
374ca955 1478 0, /* TODO CCSID for IMAP-mailbox-name */
b75a7d8f
A
1479 UCNV_IBM, UCNV_IMAP_MAILBOX,
     /* presumably the minimum and maximum number of bytes per character - confirm */
1480 1, 4,
1481 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1482 FALSE, FALSE,
1483 0,
1484 0,
1485 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1486};
1487
2ca993e8
A
/*
 * Shared data object for the IMAP mailbox name converter, built by
 * UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER from _IMAPStaticData and _IMAPImpl.
 */
1488const UConverterSharedData _IMAPData=
1489 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl);
374ca955
A
1490
1491#endif