1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2002-2011, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: punycode.cpp
12 * tab size: 8 (not used)
15 * created on: 2002jan31
16 * created by: Markus W. Scherer
20 /* This ICU code derived from: */
22 punycode.c 0.4.0 (2001-Nov-17-Sat)
23 http://www.cs.berkeley.edu/~amc/idn/
25 http://www.nicemice.net/amc/
27 Disclaimer and license
29 Regarding this entire document or any portion of it (including
30 the pseudocode and C code), the author makes no guarantees and
31 is not responsible for any damage resulting from its use. The
32 author grants irrevocable permission to anyone to use, modify,
33 and distribute it in any way that does not diminish the rights
34 of anyone else to use, modify, and distribute it, provided that
35 redistributed derivative works do not contain misleading author or
36 version information. Derivative works need not be licensed under
41 * - ICU data types and coding conventions
42 * - ICU string buffer handling with implicit source lengths
43 * and destination preflighting
47 #include "unicode/utypes.h"
51 #include "unicode/ustring.h"
52 #include "unicode/utf.h"
53 #include "unicode/utf16.h"
61 /* Punycode ----------------------------------------------------------------- */
63 /* Punycode parameters for Bootstring */
69 #define INITIAL_BIAS 72
70 #define INITIAL_N 0x80
72 /* "Basic" Unicode/ASCII code points */
74 #define DELIMITER _HYPHEN
82 #define _CAPITAL_A 0X41
83 #define _CAPITAL_Z 0X5a
85 #define IS_BASIC(c) ((c)<0x80)
86 #define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
89 * digitToBasic() returns the basic code point whose value
90 * (when used for representing integers) is d, which must be in the
91 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
92 * nonzero, in which case the uppercase form is used.
95 digitToBasic(int32_t digit
, UBool uppercase
) {
96 /* 0..25 map to ASCII a..z or A..Z */
97 /* 26..35 map to ASCII 0..9 */
100 return (char)(_CAPITAL_A
+digit
);
102 return (char)(_SMALL_A
+digit
);
105 return (char)((_ZERO_
-26)+digit
);
110 * basicToDigit[] contains the numeric value of a basic code
111 * point (for use in representing integers) in the range 0 to
112 * BASE-1, or -1 if b is does not represent a value.
116 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
117 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
119 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
120 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
122 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
123 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
125 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
126 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
128 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
129 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
131 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
132 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
134 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
135 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
137 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
138 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
142 asciiCaseMap(char b
, UBool uppercase
) {
144 if(_SMALL_A
<=b
&& b
<=_SMALL_Z
) {
145 b
-=(_SMALL_A
-_CAPITAL_A
);
148 if(_CAPITAL_A
<=b
&& b
<=_CAPITAL_Z
) {
149 b
+=(_SMALL_A
-_CAPITAL_A
);
155 /* Punycode-specific Bootstring code ---------------------------------------- */
158 * The following code omits the {parts} of the pseudo-algorithm in the spec
159 * that are not used with the Punycode parameter set.
162 /* Bias adaptation function. */
164 adaptBias(int32_t delta
, int32_t length
, UBool firstTime
) {
174 for(count
=0; delta
>((BASE
-TMIN
)*TMAX
)/2; count
+=BASE
) {
178 return count
+(((BASE
-TMIN
+1)*delta
)/(delta
+SKEW
));
181 #define MAX_CP_COUNT 200
184 u_strToPunycode(const UChar
*src
, int32_t srcLength
,
185 UChar
*dest
, int32_t destCapacity
,
186 const UBool
*caseFlags
,
187 UErrorCode
*pErrorCode
) {
189 int32_t cpBuffer
[MAX_CP_COUNT
];
190 int32_t n
, delta
, handledCPCount
, basicLength
, destLength
, bias
, j
, m
, q
, k
, t
, srcCPCount
;
193 /* argument checking */
194 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
198 if(src
==NULL
|| srcLength
<-1 || (dest
==NULL
&& destCapacity
!=0)) {
199 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
204 * Handle the basic code points and
205 * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
207 srcCPCount
=destLength
=0;
209 /* NUL-terminated input */
210 for(j
=0; /* no condition */; ++j
) {
214 if(srcCPCount
==MAX_CP_COUNT
) {
215 /* too many input code points */
216 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
220 cpBuffer
[srcCPCount
++]=0;
221 if(destLength
<destCapacity
) {
224 asciiCaseMap((char)c
, caseFlags
[j
]) :
229 n
=(caseFlags
!=NULL
&& caseFlags
[j
])<<31L;
230 if(U16_IS_SINGLE(c
)) {
232 } else if(U16_IS_LEAD(c
) && U16_IS_TRAIL(c2
=src
[j
+1])) {
234 n
|=(int32_t)U16_GET_SUPPLEMENTARY(c
, c2
);
236 /* error: unmatched surrogate */
237 *pErrorCode
=U_INVALID_CHAR_FOUND
;
240 cpBuffer
[srcCPCount
++]=n
;
244 /* length-specified input */
245 for(j
=0; j
<srcLength
; ++j
) {
246 if(srcCPCount
==MAX_CP_COUNT
) {
247 /* too many input code points */
248 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
253 cpBuffer
[srcCPCount
++]=0;
254 if(destLength
<destCapacity
) {
257 asciiCaseMap((char)c
, caseFlags
[j
]) :
262 n
=(caseFlags
!=NULL
&& caseFlags
[j
])<<31L;
263 if(U16_IS_SINGLE(c
)) {
265 } else if(U16_IS_LEAD(c
) && (j
+1)<srcLength
&& U16_IS_TRAIL(c2
=src
[j
+1])) {
267 n
|=(int32_t)U16_GET_SUPPLEMENTARY(c
, c2
);
269 /* error: unmatched surrogate */
270 *pErrorCode
=U_INVALID_CHAR_FOUND
;
273 cpBuffer
[srcCPCount
++]=n
;
278 /* Finish the basic string - if it is not empty - with a delimiter. */
279 basicLength
=destLength
;
281 if(destLength
<destCapacity
) {
282 dest
[destLength
]=DELIMITER
;
288 * handledCPCount is the number of code points that have been handled
289 * basicLength is the number of basic code points
290 * destLength is the number of chars that have been output
293 /* Initialize the state: */
298 /* Main encoding loop: */
299 for(handledCPCount
=basicLength
; handledCPCount
<srcCPCount
; /* no op */) {
301 * All non-basic code points < n have been handled already.
302 * Find the next larger one:
304 for(m
=0x7fffffff, j
=0; j
<srcCPCount
; ++j
) {
305 q
=cpBuffer
[j
]&0x7fffffff; /* remove case flag from the sign bit */
312 * Increase delta enough to advance the decoder's
313 * <n,i> state to <m,0>, but guard against overflow:
315 if(m
-n
>(0x7fffffff-MAX_CP_COUNT
-delta
)/(handledCPCount
+1)) {
316 *pErrorCode
=U_INTERNAL_PROGRAM_ERROR
;
319 delta
+=(m
-n
)*(handledCPCount
+1);
322 /* Encode a sequence of same code points n */
323 for(j
=0; j
<srcCPCount
; ++j
) {
324 q
=cpBuffer
[j
]&0x7fffffff; /* remove case flag from the sign bit */
328 /* Represent delta as a generalized variable-length integer: */
329 for(q
=delta
, k
=BASE
; /* no condition */; k
+=BASE
) {
331 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
344 } else if(k
>=(bias
+TMAX
)) {
352 if(destLength
<destCapacity
) {
353 dest
[destLength
]=digitToBasic(t
+(q
-t
)%(BASE
-t
), 0);
359 if(destLength
<destCapacity
) {
360 dest
[destLength
]=digitToBasic(q
, (UBool
)(cpBuffer
[j
]<0));
363 bias
=adaptBias(delta
, handledCPCount
+1, (UBool
)(handledCPCount
==basicLength
));
373 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
377 u_strFromPunycode(const UChar
*src
, int32_t srcLength
,
378 UChar
*dest
, int32_t destCapacity
,
380 UErrorCode
*pErrorCode
) {
381 int32_t n
, destLength
, i
, bias
, basicLength
, j
, in
, oldi
, w
, k
, digit
, t
,
382 destCPCount
, firstSupplementaryIndex
, cpLength
;
385 /* argument checking */
386 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
390 if(src
==NULL
|| srcLength
<-1 || (dest
==NULL
&& destCapacity
!=0)) {
391 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
396 srcLength
=u_strlen(src
);
400 * Handle the basic code points:
401 * Let basicLength be the number of input code points
402 * before the last delimiter, or 0 if there is none,
403 * then copy the first basicLength code points to the output.
405 * The two following loops iterate backward.
407 for(j
=srcLength
; j
>0;) {
408 if(src
[--j
]==DELIMITER
) {
412 destLength
=basicLength
=destCPCount
=j
;
413 U_ASSERT(destLength
>=0);
418 *pErrorCode
=U_INVALID_CHAR_FOUND
;
425 if(caseFlags
!=NULL
) {
426 caseFlags
[j
]=IS_BASIC_UPPERCASE(b
);
431 /* Initialize the state: */
435 firstSupplementaryIndex
=1000000000;
438 * Main decoding loop:
439 * Start just after the last delimiter if any
440 * basic code points were copied; start at the beginning otherwise.
442 for(in
=basicLength
>0 ? basicLength
+1 : 0; in
<srcLength
; /* no op */) {
444 * in is the index of the next character to be consumed, and
445 * destCPCount is the number of code points in the output array.
447 * Decode a generalized variable-length integer into delta,
448 * which gets added to i. The overflow checking is easier
449 * if we increase i as we go, then subtract off its starting
450 * value at the end to obtain delta.
452 for(oldi
=i
, w
=1, k
=BASE
; /* no condition */; k
+=BASE
) {
454 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
458 digit
=basicToDigit
[(uint8_t)src
[in
++]];
460 *pErrorCode
=U_INVALID_CHAR_FOUND
;
463 if(digit
>(0x7fffffff-i
)/w
) {
464 /* integer overflow */
465 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
470 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
481 } else if(k
>=(bias
+TMAX
)) {
488 if(w
>0x7fffffff/(BASE
-t
)) {
489 /* integer overflow */
490 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
497 * Modification from sample code:
498 * Increments destCPCount here,
499 * where needed instead of in for() loop tail.
502 bias
=adaptBias(i
-oldi
, destCPCount
, (UBool
)(oldi
==0));
505 * i was supposed to wrap around from (incremented) destCPCount to 0,
506 * incrementing n each time, so we'll fix that now:
508 if(i
/destCPCount
>(0x7fffffff-n
)) {
509 /* integer overflow */
510 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
516 /* not needed for Punycode: */
517 /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
519 if(n
>0x10ffff || U_IS_SURROGATE(n
)) {
520 /* Unicode code point overflow */
521 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
525 /* Insert n at position i of the output: */
526 cpLength
=U16_LENGTH(n
);
527 if(dest
!=NULL
&& ((destLength
+cpLength
)<=destCapacity
)) {
528 int32_t codeUnitIndex
;
531 * Handle indexes when supplementary code points are present.
533 * In almost all cases, there will be only BMP code points before i
534 * and even in the entire string.
535 * This is handled with the same efficiency as with UTF-32.
537 * Only the rare cases with supplementary code points are handled
538 * more slowly - but not too bad since this is an insertion anyway.
540 if(i
<=firstSupplementaryIndex
) {
543 firstSupplementaryIndex
=codeUnitIndex
;
545 ++firstSupplementaryIndex
;
548 codeUnitIndex
=firstSupplementaryIndex
;
549 U16_FWD_N(dest
, codeUnitIndex
, destLength
, i
-codeUnitIndex
);
552 /* use the UChar index codeUnitIndex instead of the code point index i */
553 if(codeUnitIndex
<destLength
) {
554 uprv_memmove(dest
+codeUnitIndex
+cpLength
,
556 (destLength
-codeUnitIndex
)*U_SIZEOF_UCHAR
);
557 if(caseFlags
!=NULL
) {
558 uprv_memmove(caseFlags
+codeUnitIndex
+cpLength
,
559 caseFlags
+codeUnitIndex
,
560 destLength
-codeUnitIndex
);
564 /* BMP, insert one code unit */
565 dest
[codeUnitIndex
]=(UChar
)n
;
567 /* supplementary character, insert two code units */
568 dest
[codeUnitIndex
]=U16_LEAD(n
);
569 dest
[codeUnitIndex
+1]=U16_TRAIL(n
);
571 if(caseFlags
!=NULL
) {
572 /* Case of last character determines uppercase flag: */
573 caseFlags
[codeUnitIndex
]=IS_BASIC_UPPERCASE(src
[in
-1]);
575 caseFlags
[codeUnitIndex
+1]=FALSE
;
579 destLength
+=cpLength
;
580 U_ASSERT(destLength
>=0);
584 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
587 /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
589 #endif /* #if !UCONFIG_NO_IDNA */