2 *******************************************************************************
4 * Copyright (C) 2002-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: punycode.cpp
10 * tab size: 8 (not used)
13 * created on: 2002jan31
14 * created by: Markus W. Scherer
18 /* This ICU code derived from: */
20 punycode.c 0.4.0 (2001-Nov-17-Sat)
21 http://www.cs.berkeley.edu/~amc/idn/
23 http://www.nicemice.net/amc/
25 Disclaimer and license
27 Regarding this entire document or any portion of it (including
28 the pseudocode and C code), the author makes no guarantees and
29 is not responsible for any damage resulting from its use. The
30 author grants irrevocable permission to anyone to use, modify,
31 and distribute it in any way that does not diminish the rights
32 of anyone else to use, modify, and distribute it, provided that
33 redistributed derivative works do not contain misleading author or
34 version information. Derivative works need not be licensed under
39 * - ICU data types and coding conventions
40 * - ICU string buffer handling with implicit source lengths
41 * and destination preflighting
45 #include "unicode/utypes.h"
49 #include "unicode/ustring.h"
50 #include "unicode/utf.h"
51 #include "unicode/utf16.h"
59 /* Punycode ----------------------------------------------------------------- */
61 /* Punycode parameters for Bootstring */
67 #define INITIAL_BIAS 72
68 #define INITIAL_N 0x80
70 /* "Basic" Unicode/ASCII code points */
72 #define DELIMITER _HYPHEN
80 #define _CAPITAL_A 0X41
81 #define _CAPITAL_Z 0X5a
83 #define IS_BASIC(c) ((c)<0x80)
84 #define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
87 * digitToBasic() returns the basic code point whose value
88 * (when used for representing integers) is d, which must be in the
89 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
90 * nonzero, in which case the uppercase form is used.
93 digitToBasic(int32_t digit
, UBool uppercase
) {
94 /* 0..25 map to ASCII a..z or A..Z */
95 /* 26..35 map to ASCII 0..9 */
98 return (char)(_CAPITAL_A
+digit
);
100 return (char)(_SMALL_A
+digit
);
103 return (char)((_ZERO_
-26)+digit
);
108 * basicToDigit[] contains the numeric value of a basic code
109 * point (for use in representing integers) in the range 0 to
110 * BASE-1, or -1 if b is does not represent a value.
114 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
115 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
117 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
118 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
120 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
121 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
123 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
124 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
126 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
127 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
129 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
130 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
132 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
133 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
135 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
136 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
140 asciiCaseMap(char b
, UBool uppercase
) {
142 if(_SMALL_A
<=b
&& b
<=_SMALL_Z
) {
143 b
-=(_SMALL_A
-_CAPITAL_A
);
146 if(_CAPITAL_A
<=b
&& b
<=_CAPITAL_Z
) {
147 b
+=(_SMALL_A
-_CAPITAL_A
);
153 /* Punycode-specific Bootstring code ---------------------------------------- */
156 * The following code omits the {parts} of the pseudo-algorithm in the spec
157 * that are not used with the Punycode parameter set.
160 /* Bias adaptation function. */
162 adaptBias(int32_t delta
, int32_t length
, UBool firstTime
) {
172 for(count
=0; delta
>((BASE
-TMIN
)*TMAX
)/2; count
+=BASE
) {
176 return count
+(((BASE
-TMIN
+1)*delta
)/(delta
+SKEW
));
179 #define MAX_CP_COUNT 200
182 u_strToPunycode(const UChar
*src
, int32_t srcLength
,
183 UChar
*dest
, int32_t destCapacity
,
184 const UBool
*caseFlags
,
185 UErrorCode
*pErrorCode
) {
187 int32_t cpBuffer
[MAX_CP_COUNT
];
188 int32_t n
, delta
, handledCPCount
, basicLength
, destLength
, bias
, j
, m
, q
, k
, t
, srcCPCount
;
191 /* argument checking */
192 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
196 if(src
==NULL
|| srcLength
<-1 || (dest
==NULL
&& destCapacity
!=0)) {
197 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
202 * Handle the basic code points and
203 * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
205 srcCPCount
=destLength
=0;
207 /* NUL-terminated input */
208 for(j
=0; /* no condition */; ++j
) {
212 if(srcCPCount
==MAX_CP_COUNT
) {
213 /* too many input code points */
214 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
218 cpBuffer
[srcCPCount
++]=0;
219 if(destLength
<destCapacity
) {
222 asciiCaseMap((char)c
, caseFlags
[j
]) :
227 n
=(caseFlags
!=NULL
&& caseFlags
[j
])<<31L;
228 if(U16_IS_SINGLE(c
)) {
230 } else if(U16_IS_LEAD(c
) && U16_IS_TRAIL(c2
=src
[j
+1])) {
232 n
|=(int32_t)U16_GET_SUPPLEMENTARY(c
, c2
);
234 /* error: unmatched surrogate */
235 *pErrorCode
=U_INVALID_CHAR_FOUND
;
238 cpBuffer
[srcCPCount
++]=n
;
242 /* length-specified input */
243 for(j
=0; j
<srcLength
; ++j
) {
244 if(srcCPCount
==MAX_CP_COUNT
) {
245 /* too many input code points */
246 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
251 cpBuffer
[srcCPCount
++]=0;
252 if(destLength
<destCapacity
) {
255 asciiCaseMap((char)c
, caseFlags
[j
]) :
260 n
=(caseFlags
!=NULL
&& caseFlags
[j
])<<31L;
261 if(U16_IS_SINGLE(c
)) {
263 } else if(U16_IS_LEAD(c
) && (j
+1)<srcLength
&& U16_IS_TRAIL(c2
=src
[j
+1])) {
265 n
|=(int32_t)U16_GET_SUPPLEMENTARY(c
, c2
);
267 /* error: unmatched surrogate */
268 *pErrorCode
=U_INVALID_CHAR_FOUND
;
271 cpBuffer
[srcCPCount
++]=n
;
276 /* Finish the basic string - if it is not empty - with a delimiter. */
277 basicLength
=destLength
;
279 if(destLength
<destCapacity
) {
280 dest
[destLength
]=DELIMITER
;
286 * handledCPCount is the number of code points that have been handled
287 * basicLength is the number of basic code points
288 * destLength is the number of chars that have been output
291 /* Initialize the state: */
296 /* Main encoding loop: */
297 for(handledCPCount
=basicLength
; handledCPCount
<srcCPCount
; /* no op */) {
299 * All non-basic code points < n have been handled already.
300 * Find the next larger one:
302 for(m
=0x7fffffff, j
=0; j
<srcCPCount
; ++j
) {
303 q
=cpBuffer
[j
]&0x7fffffff; /* remove case flag from the sign bit */
310 * Increase delta enough to advance the decoder's
311 * <n,i> state to <m,0>, but guard against overflow:
313 if(m
-n
>(0x7fffffff-MAX_CP_COUNT
-delta
)/(handledCPCount
+1)) {
314 *pErrorCode
=U_INTERNAL_PROGRAM_ERROR
;
317 delta
+=(m
-n
)*(handledCPCount
+1);
320 /* Encode a sequence of same code points n */
321 for(j
=0; j
<srcCPCount
; ++j
) {
322 q
=cpBuffer
[j
]&0x7fffffff; /* remove case flag from the sign bit */
326 /* Represent delta as a generalized variable-length integer: */
327 for(q
=delta
, k
=BASE
; /* no condition */; k
+=BASE
) {
329 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
342 } else if(k
>=(bias
+TMAX
)) {
350 if(destLength
<destCapacity
) {
351 dest
[destLength
]=digitToBasic(t
+(q
-t
)%(BASE
-t
), 0);
357 if(destLength
<destCapacity
) {
358 dest
[destLength
]=digitToBasic(q
, (UBool
)(cpBuffer
[j
]<0));
361 bias
=adaptBias(delta
, handledCPCount
+1, (UBool
)(handledCPCount
==basicLength
));
371 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
375 u_strFromPunycode(const UChar
*src
, int32_t srcLength
,
376 UChar
*dest
, int32_t destCapacity
,
378 UErrorCode
*pErrorCode
) {
379 int32_t n
, destLength
, i
, bias
, basicLength
, j
, in
, oldi
, w
, k
, digit
, t
,
380 destCPCount
, firstSupplementaryIndex
, cpLength
;
383 /* argument checking */
384 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
388 if(src
==NULL
|| srcLength
<-1 || (dest
==NULL
&& destCapacity
!=0)) {
389 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
394 srcLength
=u_strlen(src
);
398 * Handle the basic code points:
399 * Let basicLength be the number of input code points
400 * before the last delimiter, or 0 if there is none,
401 * then copy the first basicLength code points to the output.
403 * The two following loops iterate backward.
405 for(j
=srcLength
; j
>0;) {
406 if(src
[--j
]==DELIMITER
) {
410 destLength
=basicLength
=destCPCount
=j
;
411 U_ASSERT(destLength
>=0);
416 *pErrorCode
=U_INVALID_CHAR_FOUND
;
423 if(caseFlags
!=NULL
) {
424 caseFlags
[j
]=IS_BASIC_UPPERCASE(b
);
429 /* Initialize the state: */
433 firstSupplementaryIndex
=1000000000;
436 * Main decoding loop:
437 * Start just after the last delimiter if any
438 * basic code points were copied; start at the beginning otherwise.
440 for(in
=basicLength
>0 ? basicLength
+1 : 0; in
<srcLength
; /* no op */) {
442 * in is the index of the next character to be consumed, and
443 * destCPCount is the number of code points in the output array.
445 * Decode a generalized variable-length integer into delta,
446 * which gets added to i. The overflow checking is easier
447 * if we increase i as we go, then subtract off its starting
448 * value at the end to obtain delta.
450 for(oldi
=i
, w
=1, k
=BASE
; /* no condition */; k
+=BASE
) {
452 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
456 digit
=basicToDigit
[(uint8_t)src
[in
++]];
458 *pErrorCode
=U_INVALID_CHAR_FOUND
;
461 if(digit
>(0x7fffffff-i
)/w
) {
462 /* integer overflow */
463 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
468 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
479 } else if(k
>=(bias
+TMAX
)) {
486 if(w
>0x7fffffff/(BASE
-t
)) {
487 /* integer overflow */
488 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
495 * Modification from sample code:
496 * Increments destCPCount here,
497 * where needed instead of in for() loop tail.
500 bias
=adaptBias(i
-oldi
, destCPCount
, (UBool
)(oldi
==0));
503 * i was supposed to wrap around from (incremented) destCPCount to 0,
504 * incrementing n each time, so we'll fix that now:
506 if(i
/destCPCount
>(0x7fffffff-n
)) {
507 /* integer overflow */
508 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
514 /* not needed for Punycode: */
515 /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
517 if(n
>0x10ffff || U_IS_SURROGATE(n
)) {
518 /* Unicode code point overflow */
519 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
523 /* Insert n at position i of the output: */
524 cpLength
=U16_LENGTH(n
);
525 if(dest
!=NULL
&& ((destLength
+cpLength
)<=destCapacity
)) {
526 int32_t codeUnitIndex
;
529 * Handle indexes when supplementary code points are present.
531 * In almost all cases, there will be only BMP code points before i
532 * and even in the entire string.
533 * This is handled with the same efficiency as with UTF-32.
535 * Only the rare cases with supplementary code points are handled
536 * more slowly - but not too bad since this is an insertion anyway.
538 if(i
<=firstSupplementaryIndex
) {
541 firstSupplementaryIndex
=codeUnitIndex
;
543 ++firstSupplementaryIndex
;
546 codeUnitIndex
=firstSupplementaryIndex
;
547 U16_FWD_N(dest
, codeUnitIndex
, destLength
, i
-codeUnitIndex
);
550 /* use the UChar index codeUnitIndex instead of the code point index i */
551 if(codeUnitIndex
<destLength
) {
552 uprv_memmove(dest
+codeUnitIndex
+cpLength
,
554 (destLength
-codeUnitIndex
)*U_SIZEOF_UCHAR
);
555 if(caseFlags
!=NULL
) {
556 uprv_memmove(caseFlags
+codeUnitIndex
+cpLength
,
557 caseFlags
+codeUnitIndex
,
558 destLength
-codeUnitIndex
);
562 /* BMP, insert one code unit */
563 dest
[codeUnitIndex
]=(UChar
)n
;
565 /* supplementary character, insert two code units */
566 dest
[codeUnitIndex
]=U16_LEAD(n
);
567 dest
[codeUnitIndex
+1]=U16_TRAIL(n
);
569 if(caseFlags
!=NULL
) {
570 /* Case of last character determines uppercase flag: */
571 caseFlags
[codeUnitIndex
]=IS_BASIC_UPPERCASE(src
[in
-1]);
573 caseFlags
[codeUnitIndex
+1]=FALSE
;
577 destLength
+=cpLength
;
578 U_ASSERT(destLength
>=0);
582 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
585 /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
587 #endif /* #if !UCONFIG_NO_IDNA */