2 *******************************************************************************
4 * Copyright (C) 2002-2003, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: punycode.c
10 * tab size: 8 (not used)
13 * created on: 2002jan31
14 * created by: Markus W. Scherer
18 /* This ICU code derived from: */
20 punycode.c 0.4.0 (2001-Nov-17-Sat)
21 http://www.cs.berkeley.edu/~amc/idn/
23 http://www.nicemice.net/amc/
25 Disclaimer and license
27 Regarding this entire document or any portion of it (including
28 the pseudocode and C code), the author makes no guarantees and
29 is not responsible for any damage resulting from its use. The
30 author grants irrevocable permission to anyone to use, modify,
31 and distribute it in any way that does not diminish the rights
32 of anyone else to use, modify, and distribute it, provided that
33 redistributed derivative works do not contain misleading author or
34 version information. Derivative works need not be licensed under
39 * - ICU data types and coding conventions
40 * - ICU string buffer handling with implicit source lengths
41 * and destination preflighting
45 #include "unicode/utypes.h"
53 #include "unicode/ustring.h"
56 /* Punycode ----------------------------------------------------------------- */
58 /* Punycode parameters for Bootstring */
64 #define INITIAL_BIAS 72
65 #define INITIAL_N 0x80
67 /* "Basic" Unicode/ASCII code points */
69 #define DELIMITER _HYPHEN
77 #define _CAPITAL_A 0X41
78 #define _CAPITAL_Z 0X5a
80 #define IS_BASIC(c) ((c)<0x80)
81 #define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
84 * digitToBasic() returns the basic code point whose value
85 * (when used for representing integers) is d, which must be in the
86 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
87 * nonzero, in which case the uppercase form is used.
90 digitToBasic(int32_t digit
, UBool uppercase
) {
91 /* 0..25 map to ASCII a..z or A..Z */
92 /* 26..35 map to ASCII 0..9 */
95 return (char)(_CAPITAL_A
+digit
);
97 return (char)(_SMALL_A
+digit
);
100 return (char)((_ZERO_
-26)+digit
);
105 * basicToDigit[] contains the numeric value of a basic code
106 * point (for use in representing integers) in the range 0 to
107 * BASE-1, or -1 if b is does not represent a value.
111 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
112 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
114 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
115 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
117 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
118 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
120 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
121 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
123 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
124 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
126 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
127 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
129 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
130 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
132 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
133 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
137 asciiCaseMap(char b
, UBool uppercase
) {
139 if(_SMALL_A
<=b
&& b
<=_SMALL_Z
) {
140 b
-=(_SMALL_A
-_CAPITAL_A
);
143 if(_CAPITAL_A
<=b
&& b
<=_CAPITAL_Z
) {
144 b
+=(_SMALL_A
-_CAPITAL_A
);
150 /* Punycode-specific Bootstring code ---------------------------------------- */
153 * The following code omits the {parts} of the pseudo-algorithm in the spec
154 * that are not used with the Punycode parameter set.
157 /* Bias adaptation function. */
159 adaptBias(int32_t delta
, int32_t length
, UBool firstTime
) {
169 for(count
=0; delta
>((BASE
-TMIN
)*TMAX
)/2; count
+=BASE
) {
173 return count
+(((BASE
-TMIN
+1)*delta
)/(delta
+SKEW
));
176 #define MAX_CP_COUNT 200
179 u_strToPunycode(const UChar
*src
, int32_t srcLength
,
180 UChar
*dest
, int32_t destCapacity
,
181 const UBool
*caseFlags
,
182 UErrorCode
*pErrorCode
) {
184 int32_t cpBuffer
[MAX_CP_COUNT
];
185 int32_t n
, delta
, handledCPCount
, basicLength
, destLength
, bias
, j
, m
, q
, k
, t
, srcCPCount
;
188 /* argument checking */
189 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
193 if(src
==NULL
|| srcLength
<-1 || (dest
==NULL
&& destCapacity
!=0)) {
194 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
199 * Handle the basic code points and
200 * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
202 srcCPCount
=destLength
=0;
204 /* NUL-terminated input */
205 for(j
=0; /* no condition */; ++j
) {
209 if(srcCPCount
==MAX_CP_COUNT
) {
210 /* too many input code points */
211 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
215 cpBuffer
[srcCPCount
++]=0;
216 if(destLength
<destCapacity
) {
219 asciiCaseMap((char)c
, caseFlags
[j
]) :
224 n
=(caseFlags
!=NULL
&& caseFlags
[j
])<<31L;
225 if(UTF_IS_SINGLE(c
)) {
227 } else if(UTF_IS_LEAD(c
) && UTF_IS_TRAIL(c2
=src
[j
+1])) {
229 n
|=(int32_t)UTF16_GET_PAIR_VALUE(c
, c2
);
231 /* error: unmatched surrogate */
232 *pErrorCode
=U_INVALID_CHAR_FOUND
;
235 cpBuffer
[srcCPCount
++]=n
;
239 /* length-specified input */
240 for(j
=0; j
<srcLength
; ++j
) {
241 if(srcCPCount
==MAX_CP_COUNT
) {
242 /* too many input code points */
243 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
248 if(destLength
<destCapacity
) {
249 cpBuffer
[srcCPCount
++]=0;
252 asciiCaseMap((char)c
, caseFlags
[j
]) :
257 n
=(caseFlags
!=NULL
&& caseFlags
[j
])<<31L;
258 if(UTF_IS_SINGLE(c
)) {
260 } else if(UTF_IS_LEAD(c
) && (j
+1)<srcLength
&& UTF_IS_TRAIL(c2
=src
[j
+1])) {
262 n
|=(int32_t)UTF16_GET_PAIR_VALUE(c
, c2
);
264 /* error: unmatched surrogate */
265 *pErrorCode
=U_INVALID_CHAR_FOUND
;
268 cpBuffer
[srcCPCount
++]=n
;
273 /* Finish the basic string - if it is not empty - with a delimiter. */
274 basicLength
=destLength
;
276 if(destLength
<destCapacity
) {
277 dest
[destLength
]=DELIMITER
;
283 * handledCPCount is the number of code points that have been handled
284 * basicLength is the number of basic code points
285 * destLength is the number of chars that have been output
288 /* Initialize the state: */
293 /* Main encoding loop: */
294 for(handledCPCount
=basicLength
; handledCPCount
<srcCPCount
; /* no op */) {
296 * All non-basic code points < n have been handled already.
297 * Find the next larger one:
299 for(m
=0x7fffffff, j
=0; j
<srcCPCount
; ++j
) {
300 q
=cpBuffer
[j
]&0x7fffffff; /* remove case flag from the sign bit */
307 * Increase delta enough to advance the decoder's
308 * <n,i> state to <m,0>, but guard against overflow:
310 if(m
-n
>(0x7fffffff-MAX_CP_COUNT
-delta
)/(handledCPCount
+1)) {
311 *pErrorCode
=U_INTERNAL_PROGRAM_ERROR
;
314 delta
+=(m
-n
)*(handledCPCount
+1);
317 /* Encode a sequence of same code points n */
318 for(j
=0; j
<srcCPCount
; ++j
) {
319 q
=cpBuffer
[j
]&0x7fffffff; /* remove case flag from the sign bit */
323 /* Represent delta as a generalized variable-length integer: */
324 for(q
=delta
, k
=BASE
; /* no condition */; k
+=BASE
) {
326 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
339 } else if(k
>=(bias
+TMAX
)) {
347 if(destLength
<destCapacity
) {
348 dest
[destLength
++]=digitToBasic(t
+(q
-t
)%(BASE
-t
), 0);
353 if(destLength
<destCapacity
) {
354 dest
[destLength
++]=digitToBasic(q
, (UBool
)(cpBuffer
[j
]<0));
356 bias
=adaptBias(delta
, handledCPCount
+1, (UBool
)(handledCPCount
==basicLength
));
366 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
370 u_strFromPunycode(const UChar
*src
, int32_t srcLength
,
371 UChar
*dest
, int32_t destCapacity
,
373 UErrorCode
*pErrorCode
) {
374 int32_t n
, destLength
, i
, bias
, basicLength
, j
, in
, oldi
, w
, k
, digit
, t
,
375 destCPCount
, firstSupplementaryIndex
, cpLength
;
378 /* argument checking */
379 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
383 if(src
==NULL
|| srcLength
<-1 || (dest
==NULL
&& destCapacity
!=0)) {
384 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
389 srcLength
=u_strlen(src
);
393 * Handle the basic code points:
394 * Let basicLength be the number of input code points
395 * before the last delimiter, or 0 if there is none,
396 * then copy the first basicLength code points to the output.
398 * The two following loops iterate backward.
400 for(j
=srcLength
; j
>0;) {
401 if(src
[--j
]==DELIMITER
) {
405 destLength
=basicLength
=destCPCount
=j
;
410 *pErrorCode
=U_INVALID_CHAR_FOUND
;
417 if(caseFlags
!=NULL
) {
418 caseFlags
[j
]=IS_BASIC_UPPERCASE(b
);
423 /* Initialize the state: */
427 firstSupplementaryIndex
=1000000000;
430 * Main decoding loop:
431 * Start just after the last delimiter if any
432 * basic code points were copied; start at the beginning otherwise.
434 for(in
=basicLength
>0 ? basicLength
+1 : 0; in
<srcLength
; /* no op */) {
436 * in is the index of the next character to be consumed, and
437 * destCPCount is the number of code points in the output array.
439 * Decode a generalized variable-length integer into delta,
440 * which gets added to i. The overflow checking is easier
441 * if we increase i as we go, then subtract off its starting
442 * value at the end to obtain delta.
444 for(oldi
=i
, w
=1, k
=BASE
; /* no condition */; k
+=BASE
) {
446 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
450 digit
=basicToDigit
[(uint8_t)src
[in
++]];
452 *pErrorCode
=U_INVALID_CHAR_FOUND
;
455 if(digit
>(0x7fffffff-i
)/w
) {
456 /* integer overflow */
457 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
462 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
473 } else if(k
>=(bias
+TMAX
)) {
480 if(w
>0x7fffffff/(BASE
-t
)) {
481 /* integer overflow */
482 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
489 * Modification from sample code:
490 * Increments destCPCount here,
491 * where needed instead of in for() loop tail.
494 bias
=adaptBias(i
-oldi
, destCPCount
, (UBool
)(oldi
==0));
497 * i was supposed to wrap around from (incremented) destCPCount to 0,
498 * incrementing n each time, so we'll fix that now:
500 if(i
/destCPCount
>(0x7fffffff-n
)) {
501 /* integer overflow */
502 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
508 /* not needed for Punycode: */
509 /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
511 if(n
>0x10ffff || UTF_IS_SURROGATE(n
)) {
512 /* Unicode code point overflow */
513 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
517 /* Insert n at position i of the output: */
518 cpLength
=UTF_CHAR_LENGTH(n
);
519 if((destLength
+cpLength
)<destCapacity
) {
520 int32_t codeUnitIndex
;
523 * Handle indexes when supplementary code points are present.
525 * In almost all cases, there will be only BMP code points before i
526 * and even in the entire string.
527 * This is handled with the same efficiency as with UTF-32.
529 * Only the rare cases with supplementary code points are handled
530 * more slowly - but not too bad since this is an insertion anyway.
532 if(i
<=firstSupplementaryIndex
) {
535 firstSupplementaryIndex
=codeUnitIndex
;
537 ++firstSupplementaryIndex
;
540 codeUnitIndex
=firstSupplementaryIndex
;
541 UTF_FWD_N(dest
, codeUnitIndex
, destLength
, i
-codeUnitIndex
);
544 /* use the UChar index codeUnitIndex instead of the code point index i */
545 if(codeUnitIndex
<destLength
) {
546 uprv_memmove(dest
+codeUnitIndex
+cpLength
,
548 (destLength
-codeUnitIndex
)*U_SIZEOF_UCHAR
);
549 if(caseFlags
!=NULL
) {
550 uprv_memmove(caseFlags
+codeUnitIndex
+cpLength
,
551 caseFlags
+codeUnitIndex
,
552 destLength
-codeUnitIndex
);
556 /* BMP, insert one code unit */
557 dest
[codeUnitIndex
]=(UChar
)n
;
559 /* supplementary character, insert two code units */
560 dest
[codeUnitIndex
]=UTF16_LEAD(n
);
561 dest
[codeUnitIndex
+1]=UTF16_TRAIL(n
);
563 if(caseFlags
!=NULL
) {
564 /* Case of last character determines uppercase flag: */
565 caseFlags
[codeUnitIndex
]=IS_BASIC_UPPERCASE(src
[in
-1]);
567 caseFlags
[codeUnitIndex
+1]=FALSE
;
571 destLength
+=cpLength
;
575 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
578 /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
580 #endif /* #if !UCONFIG_NO_IDNA */