[apple/icu.git] / icuSources / common / punycode.cpp

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2002-2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  punycode.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jan31
*   created by: Markus W. Scherer
*/


/* This ICU code derived from: */
/*
punycode.c 0.4.0 (2001-Nov-17-Sat)
http://www.cs.berkeley.edu/~amc/idn/
Adam M. Costello
http://www.nicemice.net/amc/

Disclaimer and license

    Regarding this entire document or any portion of it (including
    the pseudocode and C code), the author makes no guarantees and
    is not responsible for any damage resulting from its use.  The
    author grants irrevocable permission to anyone to use, modify,
    and distribute it in any way that does not diminish the rights
    of anyone else to use, modify, and distribute it, provided that
    redistributed derivative works do not contain misleading author or
    version information.  Derivative works need not be licensed under
    similar terms.
*/
/*
 * ICU modifications:
 * - ICU data types and coding conventions
 * - ICU string buffer handling with implicit source lengths
 *   and destination preflighting
 * - UTF-16 handling
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_IDNA

#include "unicode/ustring.h"
#include "unicode/utf.h"
#include "unicode/utf16.h"
#include "ustr_imp.h"
#include "cstring.h"
#include "cmemory.h"
#include "punycode.h"
#include "uassert.h"


/* Punycode ----------------------------------------------------------------- */

/* Punycode parameters for Bootstring */
#define BASE            36
#define TMIN            1
#define TMAX            26
#define SKEW            38
#define DAMP            700
#define INITIAL_BIAS    72
#define INITIAL_N       0x80

/* "Basic" Unicode/ASCII code points */
#define _HYPHEN         0X2d
#define DELIMITER       _HYPHEN

#define _ZERO_          0X30
#define _NINE           0x39

#define _SMALL_A        0X61
#define _SMALL_Z        0X7a

#define _CAPITAL_A      0X41
#define _CAPITAL_Z      0X5a

#define IS_BASIC(c) ((c)<0x80)
#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)

/**
 * digitToBasic() converts a Punycode digit value into the basic (ASCII)
 * code point that represents it.
 *
 * @param digit     the digit value; must be in the range 0 to BASE-1
 * @param uppercase if nonzero, letter digits (0..25) are returned in
 *                  uppercase form, otherwise in lowercase form;
 *                  has no effect on digit values 26 and above
 * @return the basic code point for the digit
 */
static inline char
digitToBasic(int32_t digit, UBool uppercase) {
    if(digit>=26) {
        /* 26..35 map to ASCII 0..9 */
        return (char)((_ZERO_-26)+digit);
    }

    /* 0..25 map to ASCII a..z or A..Z */
    return (char)((uppercase ? _CAPITAL_A : _SMALL_A)+digit);
}

/**
 * basicToDigit[] maps a value 0..255 to the numeric value of the basic
 * code point with that value (for use in representing integers),
 * in the range 0 to BASE-1, or to -1 if it does not represent a digit.
 *
 * ASCII letters map to 0..25 in either case, ASCII digits 0..9 map to 26..35;
 * every other entry, including all of 0x80..0xff, is -1.
 */
static const int8_t
basicToDigit[256]={
    /* 0x00..0x1f: no digits */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    /* 0x20..0x3f: ASCII 0..9 at 0x30..0x39 map to 26..35 */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,

    /* 0x40..0x5f: ASCII A..Z at 0x41..0x5a map to 0..25 */
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

    /* 0x60..0x7f: ASCII a..z at 0x61..0x7a map to 0..25 */
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

    /* 0x80..0xff: no digits */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};

/**
 * asciiCaseMap() forces the case of an ASCII letter.
 *
 * @param b         the character to map
 * @param uppercase if nonzero, a..z are mapped to A..Z;
 *                  otherwise A..Z are mapped to a..z
 * @return the case-mapped letter, or b unchanged if it is not a letter
 *         of the opposite case
 */
static inline char
asciiCaseMap(char b, UBool uppercase) {
    if(uppercase && _SMALL_A<=b && b<=_SMALL_Z) {
        /* lowercase letter, uppercase requested */
        return (char)(b-(_SMALL_A-_CAPITAL_A));
    }
    if(!uppercase && _CAPITAL_A<=b && b<=_CAPITAL_Z) {
        /* uppercase letter, lowercase requested */
        return (char)(b+(_SMALL_A-_CAPITAL_A));
    }
    return b;
}

/* Punycode-specific Bootstring code ---------------------------------------- */

/*
 * The following code omits the {parts} of the pseudo-algorithm in the spec
 * that are not used with the Punycode parameter set.
 */

/**
 * Bias adaptation function.
 *
 * @param delta     the delta that was just encoded or decoded
 * @param length    the number of code points handled so far, including the
 *                  current one; used as a divisor, so it must not be 0
 * @param firstTime nonzero for the first delta of a string, which is
 *                  scaled down by DAMP instead of by 2
 * @return the new bias
 */
static int32_t
adaptBias(int32_t delta, int32_t length, UBool firstTime) {
    int32_t steps=0;

    delta/=(firstTime ? DAMP : 2);
    delta+=delta/length;

    /* Reduce delta until it is small enough, counting BASE per reduction. */
    while(delta>((BASE-TMIN)*TMAX)/2) {
        delta/=(BASE-TMIN);
        steps+=BASE;
    }

    return steps+(((BASE-TMIN+1)*delta)/(delta+SKEW));
}

#define MAX_CP_COUNT    200

/**
 * u_strToPunycode() encodes a UTF-16 string as Punycode.
 *
 * Basic (ASCII) code units are copied to the output first, followed by a
 * delimiter if there were any, followed by the generalized variable-length
 * integers that encode the non-basic code points.
 * Output is only written while it fits into destCapacity; the output
 * length keeps being counted beyond that (preflighting).
 *
 * @param src          the input string; must not be NULL
 * @param srcLength    number of UChars in src, or -1 if src is NUL-terminated
 * @param dest         the output buffer; may be NULL only if destCapacity==0
 * @param destCapacity the capacity of dest in UChars
 * @param caseFlags    NULL, or an array indexed like src: a nonzero entry
 *                     uppercases a basic letter (zero lowercases it) and,
 *                     for a non-basic code point, selects the uppercase form
 *                     of the last digit of its delta. For a surrogate pair
 *                     the entry at the lead surrogate's index is used.
 * @param pErrorCode   in/out error code; the function returns immediately
 *                     if it is NULL or already indicates a failure.
 *                     Set here to
 *                     U_ILLEGAL_ARGUMENT_ERROR for bad arguments,
 *                     U_INDEX_OUTOFBOUNDS_ERROR for more than MAX_CP_COUNT
 *                     input code points,
 *                     U_INVALID_CHAR_FOUND for an unpaired surrogate,
 *                     U_INTERNAL_PROGRAM_ERROR if delta would overflow.
 * @return 0 on any of the errors above; otherwise the result of
 *         u_terminateUChars() for the computed output length
 */
U_CFUNC int32_t
u_strToPunycode(const UChar *src, int32_t srcLength,
                UChar *dest, int32_t destCapacity,
                const UBool *caseFlags,
                UErrorCode *pErrorCode) {

    int32_t cpBuffer[MAX_CP_COUNT];
    int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
    UChar c, c2;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /*
     * Handle the basic code points and
     * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit).
     * Basic code points are recorded as 0 in cpBuffer.
     *
     * One loop serves both input forms: NUL-terminated input (srcLength==-1)
     * stops at the first NUL, length-specified input stops at srcLength.
     */
    srcCPCount=destLength=0;
    for(j=0; srcLength==-1 ? src[j]!=0 : j<srcLength; ++j) {
        if(srcCPCount==MAX_CP_COUNT) {
            /* too many input code points */
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
        c=src[j];
        if(IS_BASIC(c)) {
            cpBuffer[srcCPCount++]=0;
            if(destLength<destCapacity) {
                dest[destLength]=
                    caseFlags!=NULL ?
                        asciiCaseMap((char)c, caseFlags[j]) :
                        (char)c;
            }
            ++destLength;
        } else {
            /*
             * Put the case flag into the sign bit.
             * Written as a constant rather than as 1<<31, which would
             * shift a signed int into its sign bit (undefined behavior).
             */
            n=(caseFlags!=NULL && caseFlags[j]) ? (int32_t)(-0x7fffffff-1) : 0;
            if(U16_IS_SINGLE(c)) {
                n|=c;
            } else if(U16_IS_LEAD(c) &&
                      (srcLength==-1 || (j+1)<srcLength) &&
                      U16_IS_TRAIL(c2=src[j+1])) {
                /*
                 * For NUL-terminated input, src[j+1] is at worst the
                 * terminator, which is not a trail surrogate.
                 */
                ++j;
                n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
            } else {
                /* error: unmatched surrogate */
                *pErrorCode=U_INVALID_CHAR_FOUND;
                return 0;
            }
            cpBuffer[srcCPCount++]=n;
        }
    }

    /* Finish the basic string - if it is not empty - with a delimiter. */
    basicLength=destLength;
    if(basicLength>0) {
        if(destLength<destCapacity) {
            dest[destLength]=DELIMITER;
        }
        ++destLength;
    }

    /*
     * handledCPCount is the number of code points that have been handled
     * basicLength is the number of basic code points
     * destLength is the number of chars that have been output
     */

    /* Initialize the state: */
    n=INITIAL_N;
    delta=0;
    bias=INITIAL_BIAS;

    /* Main encoding loop: */
    for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
        /*
         * All non-basic code points < n have been handled already.
         * Find the next larger one:
         */
        for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(n<=q && q<m) {
                m=q;
            }
        }

        /*
         * Increase delta enough to advance the decoder's
         * <n,i> state to <m,0>, but guard against overflow:
         */
        if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
            return 0;
        }
        delta+=(m-n)*(handledCPCount+1);
        n=m;

        /* Encode a sequence of same code points n */
        for(j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(q<n) {
                ++delta;
            } else if(q==n) {
                /* Represent delta as a generalized variable-length integer: */
                for(q=delta, k=BASE; /* no condition */; k+=BASE) {
                    /* threshold t is k-bias limited to the range TMIN..TMAX */
                    t=k-bias;
                    if(t<TMIN) {
                        t=TMIN;
                    } else if(k>=(bias+TMAX)) {
                        t=TMAX;
                    }

                    if(q<t) {
                        break;
                    }

                    if(destLength<destCapacity) {
                        dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
                    }
                    ++destLength;
                    q=(q-t)/(BASE-t);
                }

                /* The last digit carries the case flag of the code point. */
                if(destLength<destCapacity) {
                    dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
                }
                ++destLength;
                bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
                delta=0;
                ++handledCPCount;
            }
        }

        ++delta;
        ++n;
    }

    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}

/**
 * u_strFromPunycode() decodes a Punycode string into UTF-16.
 *
 * The code units before the last delimiter are copied to the output as
 * basic code points; the code units after it are decoded as generalized
 * variable-length integers, each of which inserts one code point into
 * the output.
 * Output is only written while it fits into destCapacity; the output
 * length keeps being counted beyond that (preflighting).
 *
 * @param src          the Punycode input; must not be NULL
 * @param srcLength    number of UChars in src, or -1 if src is NUL-terminated
 * @param dest         the output buffer; may be NULL only if destCapacity==0
 * @param destCapacity the capacity of dest in UChars
 * @param caseFlags    NULL, or an output array that is written at the same
 *                     indexes as dest: set to whether the basic code point,
 *                     or the last digit that encoded a non-basic code point,
 *                     was an uppercase letter; the entry for a trail
 *                     surrogate is set to FALSE
 * @param pErrorCode   in/out error code; the function returns immediately
 *                     if it is NULL or already indicates a failure.
 *                     Set here to
 *                     U_ILLEGAL_ARGUMENT_ERROR for bad arguments,
 *                     U_INVALID_CHAR_FOUND for a non-basic code unit or a
 *                     code unit that is not a digit where one is required,
 *                     U_ILLEGAL_CHAR_FOUND for truncated input, integer
 *                     overflow, or a result that is not a valid
 *                     non-surrogate code point.
 * @return 0 on any of the errors above; otherwise the result of
 *         u_terminateUChars() for the computed output length
 */
U_CFUNC int32_t
u_strFromPunycode(const UChar *src, int32_t srcLength,
                  UChar *dest, int32_t destCapacity,
                  UBool *caseFlags,
                  UErrorCode *pErrorCode) {
    int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
            destCPCount, firstSupplementaryIndex, cpLength;
    UChar b;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(srcLength==-1) {
        srcLength=u_strlen(src);
    }

    /*
     * Handle the basic code points:
     * Let basicLength be the number of input code points
     * before the last delimiter, or 0 if there is none,
     * then copy the first basicLength code points to the output.
     *
     * The two following loops iterate backward.
     */
    for(j=srcLength; j>0;) {
        if(src[--j]==DELIMITER) {
            break;
        }
    }
    destLength=basicLength=destCPCount=j;
    U_ASSERT(destLength>=0);

    while(j>0) {
        b=src[--j];
        if(!IS_BASIC(b)) {
            *pErrorCode=U_INVALID_CHAR_FOUND;
            return 0;
        }

        if(j<destCapacity) {
            dest[j]=(UChar)b;

            if(caseFlags!=NULL) {
                caseFlags[j]=IS_BASIC_UPPERCASE(b);
            }
        }
    }

    /* Initialize the state: */
    n=INITIAL_N;
    i=0;
    bias=INITIAL_BIAS;
    /* larger than any output index: no supplementary code point inserted yet */
    firstSupplementaryIndex=1000000000;

    /*
     * Main decoding loop:
     * Start just after the last delimiter if any
     * basic code points were copied; start at the beginning otherwise.
     */
    for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
        /*
         * in is the index of the next character to be consumed, and
         * destCPCount is the number of code points in the output array.
         *
         * Decode a generalized variable-length integer into delta,
         * which gets added to i.  The overflow checking is easier
         * if we increase i as we go, then subtract off its starting
         * value at the end to obtain delta.
         */
        for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
            if(in>=srcLength) {
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }

            /*
             * Reject non-basic code units before the table lookup.
             * Truncating the 16-bit code unit to 8 bits for the lookup
             * would let e.g. U+0161 pass as the digit for U+0061 'a'.
             */
            b=src[in++];
            if(!IS_BASIC(b)) {
                *pErrorCode=U_INVALID_CHAR_FOUND;
                return 0;
            }
            digit=basicToDigit[b];
            if(digit<0) {
                *pErrorCode=U_INVALID_CHAR_FOUND;
                return 0;
            }
            if(digit>(0x7fffffff-i)/w) {
                /* integer overflow */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }

            i+=digit*w;

            /* threshold t is k-bias limited to the range TMIN..TMAX */
            t=k-bias;
            if(t<TMIN) {
                t=TMIN;
            } else if(k>=(bias+TMAX)) {
                t=TMAX;
            }
            if(digit<t) {
                /* a digit below the threshold ends this integer */
                break;
            }

            if(w>0x7fffffff/(BASE-t)) {
                /* integer overflow */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }
            w*=BASE-t;
        }

        /*
         * Modification from sample code:
         * Increments destCPCount here,
         * where needed instead of in for() loop tail.
         */
        ++destCPCount;
        bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));

        /*
         * i was supposed to wrap around from (incremented) destCPCount to 0,
         * incrementing n each time, so we'll fix that now:
         */
        if(i/destCPCount>(0x7fffffff-n)) {
            /* integer overflow */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            return 0;
        }

        n+=i/destCPCount;
        i%=destCPCount;
        /* not needed for Punycode: */
        /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */

        if(n>0x10ffff || U_IS_SURROGATE(n)) {
            /* Unicode code point overflow */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            return 0;
        }

        /* Insert n at position i of the output: */
        cpLength=U16_LENGTH(n);
        if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
            int32_t codeUnitIndex;

            /*
             * Handle indexes when supplementary code points are present.
             *
             * In almost all cases, there will be only BMP code points before i
             * and even in the entire string.
             * This is handled with the same efficiency as with UTF-32.
             *
             * Only the rare cases with supplementary code points are handled
             * more slowly - but not too bad since this is an insertion anyway.
             */
            if(i<=firstSupplementaryIndex) {
                /* only BMP code points before i: indexes are the same */
                codeUnitIndex=i;
                if(cpLength>1) {
                    firstSupplementaryIndex=codeUnitIndex;
                } else {
                    ++firstSupplementaryIndex;
                }
            } else {
                /* count code points forward from the first supplementary one */
                codeUnitIndex=firstSupplementaryIndex;
                U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
            }

            /* use the UChar index codeUnitIndex instead of the code point index i */
            if(codeUnitIndex<destLength) {
                /* make room by moving the tail of the output and of the flags */
                uprv_memmove(dest+codeUnitIndex+cpLength,
                             dest+codeUnitIndex,
                             (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
                if(caseFlags!=NULL) {
                    uprv_memmove(caseFlags+codeUnitIndex+cpLength,
                                 caseFlags+codeUnitIndex,
                                 destLength-codeUnitIndex);
                }
            }
            if(cpLength==1) {
                /* BMP, insert one code unit */
                dest[codeUnitIndex]=(UChar)n;
            } else {
                /* supplementary character, insert two code units */
                dest[codeUnitIndex]=U16_LEAD(n);
                dest[codeUnitIndex+1]=U16_TRAIL(n);
            }
            if(caseFlags!=NULL) {
                /* Case of last character determines uppercase flag: */
                caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
                if(cpLength==2) {
                    caseFlags[codeUnitIndex+1]=FALSE;
                }
            }
        }
        destLength+=cpLength;
        U_ASSERT(destLength>=0);
        ++i;
    }

    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}

/* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */

#endif /* #if !UCONFIG_NO_IDNA */
Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f A	3	/*
	4	*******************************************************************************
	5	*
4388f060	6	* Copyright (C) 2002-2011, International Business Machines
b75a7d8f A	7	* Corporation and others. All Rights Reserved.
	8	*
	9	*******************************************************************************
4388f060	10	* file name: punycode.cpp
f3c0d7a5	11	* encoding: UTF-8
b75a7d8f A	12	* tab size: 8 (not used)
	13	* indentation:4
	14	*
	15	* created on: 2002jan31
	16	* created by: Markus W. Scherer
	17	*/
	18
	19
	20	/* This ICU code derived from: */
	21	/*
	22	punycode.c 0.4.0 (2001-Nov-17-Sat)
	23	http://www.cs.berkeley.edu/~amc/idn/
	24	Adam M. Costello
	25	http://www.nicemice.net/amc/
	26
	27	Disclaimer and license
	28
	29	Regarding this entire document or any portion of it (including
	30	the pseudocode and C code), the author makes no guarantees and
	31	is not responsible for any damage resulting from its use. The
	32	author grants irrevocable permission to anyone to use, modify,
	33	and distribute it in any way that does not diminish the rights
	34	of anyone else to use, modify, and distribute it, provided that
	35	redistributed derivative works do not contain misleading author or
	36	version information. Derivative works need not be licensed under
	37	similar terms.
	38	*/
	39	/*
	40	* ICU modifications:
	41	* - ICU data types and coding conventions
	42	* - ICU string buffer handling with implicit source lengths
	43	* and destination preflighting
	44	* - UTF-16 handling
	45	*/
	46
	47	#include "unicode/utypes.h"
	48
	49	#if !UCONFIG_NO_IDNA
	50
4388f060 A	51	#include "unicode/ustring.h"
	52	#include "unicode/utf.h"
	53	#include "unicode/utf16.h"
b75a7d8f A	54	#include "ustr_imp.h"
	55	#include "cstring.h"
	56	#include "cmemory.h"
	57	#include "punycode.h"
4388f060	58	#include "uassert.h"
b75a7d8f A	59
	60
	61	/* Punycode ----------------------------------------------------------------- */
	62
	63	/* Punycode parameters for Bootstring */
	64	#define BASE 36
	65	#define TMIN 1
	66	#define TMAX 26
	67	#define SKEW 38
	68	#define DAMP 700
	69	#define INITIAL_BIAS 72
	70	#define INITIAL_N 0x80
	71
	72	/* "Basic" Unicode/ASCII code points */
	73	#define _HYPHEN 0X2d
	74	#define DELIMITER _HYPHEN
	75
	76	#define _ZERO_ 0X30
	77	#define _NINE 0x39
	78
	79	#define _SMALL_A 0X61
	80	#define _SMALL_Z 0X7a
	81
	82	#define _CAPITAL_A 0X41
	83	#define _CAPITAL_Z 0X5a
	84
	85	#define IS_BASIC(c) ((c)<0x80)
	86	#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
	87
	88	/**
	89	* digitToBasic() returns the basic code point whose value
	90	* (when used for representing integers) is d, which must be in the
	91	* range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
	92	* nonzero, in which case the uppercase form is used.
	93	*/
4388f060	94	static inline char
b75a7d8f A	95	digitToBasic(int32_t digit, UBool uppercase) {
	96	/* 0..25 map to ASCII a..z or A..Z */
	97	/* 26..35 map to ASCII 0..9 */
	98	if(digit<26) {
	99	if(uppercase) {
	100	return (char)(_CAPITAL_A+digit);
	101	} else {
	102	return (char)(_SMALL_A+digit);
	103	}
	104	} else {
	105	return (char)((_ZERO_-26)+digit);
	106	}
	107	}
	108
	109	/**
	110	* basicToDigit[] contains the numeric value of a basic code
	111	* point (for use in representing integers) in the range 0 to
	112	* BASE-1, or -1 if b is does not represent a value.
	113	*/
	114	static const int8_t
	115	basicToDigit[256]={
	116	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	117	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	118
	119	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	120	26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
	121
	122	-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
	123	15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
	124
	125	-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
	126	15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
	127
	128	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	129	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	130
	131	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	132	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	133
	134	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	135	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	136
	137	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	138	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
	139	};
	140
4388f060	141	static inline char
b75a7d8f A	142	asciiCaseMap(char b, UBool uppercase) {
	143	if(uppercase) {
	144	if(_SMALL_A<=b && b<=_SMALL_Z) {
	145	b-=(_SMALL_A-_CAPITAL_A);
	146	}
	147	} else {
	148	if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
	149	b+=(_SMALL_A-_CAPITAL_A);
	150	}
	151	}
	152	return b;
	153	}
	154
	155	/* Punycode-specific Bootstring code ---------------------------------------- */
	156
	157	/*
	158	* The following code omits the {parts} of the pseudo-algorithm in the spec
	159	* that are not used with the Punycode parameter set.
	160	*/
	161
	162	/* Bias adaptation function. */
	163	static int32_t
	164	adaptBias(int32_t delta, int32_t length, UBool firstTime) {
	165	int32_t count;
	166
	167	if(firstTime) {
	168	delta/=DAMP;
	169	} else {
	170	delta/=2;
	171	}
	172
	173	delta+=delta/length;
	174	for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
	175	delta/=(BASE-TMIN);
	176	}
	177
	178	return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
	179	}
	180
	181	#define MAX_CP_COUNT 200
	182
	183	U_CFUNC int32_t
	184	u_strToPunycode(const UChar *src, int32_t srcLength,
	185	UChar *dest, int32_t destCapacity,
	186	const UBool *caseFlags,
	187	UErrorCode *pErrorCode) {
	188
	189	int32_t cpBuffer[MAX_CP_COUNT];
	190	int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
	191	UChar c, c2;
	192
	193	/* argument checking */
	194	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
	195	return 0;
	196	}
	197
	198	if(src==NULL \|\| srcLength<-1 \|\| (dest==NULL && destCapacity!=0)) {
	199	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
	200	return 0;
	201	}
	202
	203	/*
	204	* Handle the basic code points and
	205	* convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
206	*/
207	srcCPCount=destLength=0;
208	if(srcLength==-1) {
209	/* NUL-terminated input */
210	for(j=0; /* no condition */; ++j) {
211	if((c=src[j])==0) {
212	break;
213	}
214	if(srcCPCount==MAX_CP_COUNT) {
215	/* too many input code points */
216	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
217	return 0;
218	}
219	if(IS_BASIC(c)) {
220	cpBuffer[srcCPCount++]=0;
221	if(destLength<destCapacity) {
222	dest[destLength]=
223	caseFlags!=NULL ?
224	asciiCaseMap((char)c, caseFlags[j]) :
225	(char)c;
226	}
227	++destLength;
228	} else {
229	n=(caseFlags!=NULL && caseFlags[j])<<31L;
4388f060	230	if(U16_IS_SINGLE(c)) {
b75a7d8f	231	n\|=c;
4388f060	232	} else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
b75a7d8f	233	++j;
4388f060	234	n\|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
b75a7d8f A	235	} else {
	236	/* error: unmatched surrogate */
	237	*pErrorCode=U_INVALID_CHAR_FOUND;
	238	return 0;
	239	}
	240	cpBuffer[srcCPCount++]=n;
	241	}
	242	}
	243	} else {
	244	/* length-specified input */
	245	for(j=0; j<srcLength; ++j) {
	246	if(srcCPCount==MAX_CP_COUNT) {
	247	/* too many input code points */
	248	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
	249	return 0;
	250	}
	251	c=src[j];
	252	if(IS_BASIC(c)) {
729e4ab9	253	cpBuffer[srcCPCount++]=0;
b75a7d8f	254	if(destLength<destCapacity) {
b75a7d8f A	255	dest[destLength]=
	256	caseFlags!=NULL ?
	257	asciiCaseMap((char)c, caseFlags[j]) :
	258	(char)c;
	259	}
	260	++destLength;
	261	} else {
	262	n=(caseFlags!=NULL && caseFlags[j])<<31L;
4388f060	263	if(U16_IS_SINGLE(c)) {
b75a7d8f	264	n\|=c;
4388f060	265	} else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
b75a7d8f	266	++j;
4388f060	267	n\|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
b75a7d8f A	268	} else {
	269	/* error: unmatched surrogate */
	270	*pErrorCode=U_INVALID_CHAR_FOUND;
	271	return 0;
	272	}
	273	cpBuffer[srcCPCount++]=n;
	274	}
	275	}
	276	}
	277
	278	/* Finish the basic string - if it is not empty - with a delimiter. */
	279	basicLength=destLength;
	280	if(basicLength>0) {
	281	if(destLength<destCapacity) {
	282	dest[destLength]=DELIMITER;
	283	}
	284	++destLength;
	285	}
	286
	287	/*
	288	* handledCPCount is the number of code points that have been handled
	289	* basicLength is the number of basic code points
	290	* destLength is the number of chars that have been output
	291	*/
	292
	293	/* Initialize the state: */
	294	n=INITIAL_N;
	295	delta=0;
	296	bias=INITIAL_BIAS;
	297
	298	/* Main encoding loop: */
	299	for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
	300	/*
	301	* All non-basic code points < n have been handled already.
	302	* Find the next larger one:
	303	*/
	304	for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
	305	q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
	306	if(n<=q && q<m) {
	307	m=q;
	308	}
	309	}
	310
	311	/*
	312	* Increase delta enough to advance the decoder's
	313	* <n,i> state to <m,0>, but guard against overflow:
	314	*/
	315	if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
	316	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
	317	return 0;
	318	}
	319	delta+=(m-n)*(handledCPCount+1);
	320	n=m;
	321
	322	/* Encode a sequence of same code points n */
	323	for(j=0; j<srcCPCount; ++j) {
	324	q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
	325	if(q<n) {
	326	++delta;
	327	} else if(q==n) {
	328	/* Represent delta as a generalized variable-length integer: */
	329	for(q=delta, k=BASE; /* no condition */; k+=BASE) {
	330
729e4ab9	331	/** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
b75a7d8f A	332
	333	t=k-bias;
	334	if(t<TMIN) {
	335	t=TMIN;
	336	} else if(t>TMAX) {
	337	t=TMAX;
	338	}
	339	*/
729e4ab9	340
b75a7d8f A	341	t=k-bias;
	342	if(t<TMIN) {
	343	t=TMIN;
	344	} else if(k>=(bias+TMAX)) {
	345	t=TMAX;
	346	}
	347
	348	if(q<t) {
	349	break;
	350	}
	351
	352	if(destLength<destCapacity) {
729e4ab9	353	dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
b75a7d8f	354	}
729e4ab9	355	++destLength;
b75a7d8f A	356	q=(q-t)/(BASE-t);
	357	}
	358
	359	if(destLength<destCapacity) {
729e4ab9	360	dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
b75a7d8f	361	}
729e4ab9	362	++destLength;
b75a7d8f A	363	bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
	364	delta=0;
	365	++handledCPCount;
	366	}
	367	}
	368
	369	++delta;
	370	++n;
	371	}
	372
	373	return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
	374	}
	375
	376	U_CFUNC int32_t
	377	u_strFromPunycode(const UChar *src, int32_t srcLength,
	378	UChar *dest, int32_t destCapacity,
	379	UBool *caseFlags,
	380	UErrorCode *pErrorCode) {
	381	int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
	382	destCPCount, firstSupplementaryIndex, cpLength;
	383	UChar b;
	384
	385	/* argument checking */
	386	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
	387	return 0;
	388	}
	389
	390	if(src==NULL \|\| srcLength<-1 \|\| (dest==NULL && destCapacity!=0)) {
	391	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
	392	return 0;
	393	}
	394
	395	if(srcLength==-1) {
	396	srcLength=u_strlen(src);
	397	}
	398
	399	/*
	400	* Handle the basic code points:
	401	* Let basicLength be the number of input code points
	402	* before the last delimiter, or 0 if there is none,
	403	* then copy the first basicLength code points to the output.
	404	*
	405	* The two following loops iterate backward.
	406	*/
	407	for(j=srcLength; j>0;) {
	408	if(src[--j]==DELIMITER) {
	409	break;
	410	}
	411	}
	412	destLength=basicLength=destCPCount=j;
4388f060	413	U_ASSERT(destLength>=0);
b75a7d8f A	414
	415	while(j>0) {
	416	b=src[--j];
	417	if(!IS_BASIC(b)) {
	418	*pErrorCode=U_INVALID_CHAR_FOUND;
	419	return 0;
	420	}
	421
	422	if(j<destCapacity) {
	423	dest[j]=(UChar)b;
	424
	425	if(caseFlags!=NULL) {
	426	caseFlags[j]=IS_BASIC_UPPERCASE(b);
	427	}
	428	}
	429	}
	430
	431	/* Initialize the state: */
	432	n=INITIAL_N;
	433	i=0;
	434	bias=INITIAL_BIAS;
	435	firstSupplementaryIndex=1000000000;
	436
	437	/*
	438	* Main decoding loop:
	439	* Start just after the last delimiter if any
	440	* basic code points were copied; start at the beginning otherwise.
	441	*/
	442	for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
	443	/*
	444	* in is the index of the next character to be consumed, and
	445	* destCPCount is the number of code points in the output array.
	446	*
	447	* Decode a generalized variable-length integer into delta,
	448	* which gets added to i. The overflow checking is easier
	449	* if we increase i as we go, then subtract off its starting
	450	* value at the end to obtain delta.
	451	*/
	452	for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
	453	if(in>=srcLength) {
	454	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
	455	return 0;
	456	}
	457
	458	digit=basicToDigit[(uint8_t)src[in++]];
	459	if(digit<0) {
	460	*pErrorCode=U_INVALID_CHAR_FOUND;
	461	return 0;
	462	}
	463	if(digit>(0x7fffffff-i)/w) {
	464	/* integer overflow */
	465	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
	466	return 0;
	467	}
	468
	469	i+=digit*w;
	470	/** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
	471	t=k-bias;
	472	if(t<TMIN) {
	473	t=TMIN;
	474	} else if(t>TMAX) {
	475	t=TMAX;
	476	}
	477	*/
478	t=k-bias;
479	if(t<TMIN) {
480	t=TMIN;
481	} else if(k>=(bias+TMAX)) {
482	t=TMAX;
483	}
484	if(digit<t) {
485	break;
486	}
487
488	if(w>0x7fffffff/(BASE-t)) {
489	/* integer overflow */
490	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
491	return 0;
492	}
493	w*=BASE-t;
494	}
495
496	/*
497	* Modification from sample code:
498	* Increments destCPCount here,
499	* where needed instead of in for() loop tail.
500	*/
501	++destCPCount;
502	bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
503
504	/*
505	* i was supposed to wrap around from (incremented) destCPCount to 0,
506	* incrementing n each time, so we'll fix that now:
507	*/
508	if(i/destCPCount>(0x7fffffff-n)) {
509	/* integer overflow */
510	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
511	return 0;
512	}
513
514	n+=i/destCPCount;
515	i%=destCPCount;
516	/* not needed for Punycode: */
517	/* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
518
4388f060	519	if(n>0x10ffff \|\| U_IS_SURROGATE(n)) {
b75a7d8f A	520	/* Unicode code point overflow */
	521	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
	522	return 0;
	523	}
	524
	525	/* Insert n at position i of the output: */
4388f060 A	526	cpLength=U16_LENGTH(n);
4388f060 A	527	if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
b75a7d8f A	528	int32_t codeUnitIndex;
	529
	530	/*
	531	* Handle indexes when supplementary code points are present.
	532	*
	533	* In almost all cases, there will be only BMP code points before i
	534	* and even in the entire string.
	535	* This is handled with the same efficiency as with UTF-32.
	536	*
	537	* Only the rare cases with supplementary code points are handled
	538	* more slowly - but not too bad since this is an insertion anyway.
	539	*/
	540	if(i<=firstSupplementaryIndex) {
	541	codeUnitIndex=i;
	542	if(cpLength>1) {
	543	firstSupplementaryIndex=codeUnitIndex;
	544	} else {
	545	++firstSupplementaryIndex;
	546	}
	547	} else {
	548	codeUnitIndex=firstSupplementaryIndex;
4388f060	549	U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
b75a7d8f A	550	}
	551
	552	/* use the UChar index codeUnitIndex instead of the code point index i */
	553	if(codeUnitIndex<destLength) {
	554	uprv_memmove(dest+codeUnitIndex+cpLength,
	555	dest+codeUnitIndex,
	556	(destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
	557	if(caseFlags!=NULL) {
	558	uprv_memmove(caseFlags+codeUnitIndex+cpLength,
	559	caseFlags+codeUnitIndex,
	560	destLength-codeUnitIndex);
	561	}
	562	}
	563	if(cpLength==1) {
	564	/* BMP, insert one code unit */
	565	dest[codeUnitIndex]=(UChar)n;
	566	} else {
	567	/* supplementary character, insert two code units */
4388f060 A	568	dest[codeUnitIndex]=U16_LEAD(n);
4388f060 A	569	dest[codeUnitIndex+1]=U16_TRAIL(n);
b75a7d8f A	570	}
	571	if(caseFlags!=NULL) {
	572	/* Case of last character determines uppercase flag: */
	573	caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
	574	if(cpLength==2) {
	575	caseFlags[codeUnitIndex+1]=FALSE;
	576	}
	577	}
	578	}
	579	destLength+=cpLength;
4388f060	580	U_ASSERT(destLength>=0);
b75a7d8f A	581	++i;
	582	}
	583
	584	return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
	585	}
	586
	587	/* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
	588
	589	#endif /* #if !UCONFIG_NO_IDNA */