[apple/icu.git] / icuSources / common / punycode.cpp

/*
*******************************************************************************
*
*   Copyright (C) 2002-2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  punycode.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jan31
*   created by: Markus W. Scherer
*/


/* This ICU code derived from: */
/*
punycode.c 0.4.0 (2001-Nov-17-Sat)
http://www.cs.berkeley.edu/~amc/idn/
Adam M. Costello
http://www.nicemice.net/amc/

Disclaimer and license

    Regarding this entire document or any portion of it (including
    the pseudocode and C code), the author makes no guarantees and
    is not responsible for any damage resulting from its use.  The
    author grants irrevocable permission to anyone to use, modify,
    and distribute it in any way that does not diminish the rights
    of anyone else to use, modify, and distribute it, provided that
    redistributed derivative works do not contain misleading author or
    version information.  Derivative works need not be licensed under
    similar terms.
*/
/*
 * ICU modifications:
 * - ICU data types and coding conventions
 * - ICU string buffer handling with implicit source lengths
 *   and destination preflighting
 * - UTF-16 handling
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_IDNA

#include "unicode/ustring.h"
#include "unicode/utf.h"
#include "unicode/utf16.h"
#include "ustr_imp.h"
#include "cstring.h"
#include "cmemory.h"
#include "punycode.h"
#include "uassert.h"


/* Punycode ----------------------------------------------------------------- */

/* Punycode parameters for Bootstring */
#define BASE            36
#define TMIN            1
#define TMAX            26
#define SKEW            38
#define DAMP            700
#define INITIAL_BIAS    72
#define INITIAL_N       0x80

/* "Basic" Unicode/ASCII code points */
#define _HYPHEN         0X2d
#define DELIMITER       _HYPHEN

#define _ZERO_          0X30
#define _NINE           0x39

#define _SMALL_A        0X61
#define _SMALL_Z        0X7a

#define _CAPITAL_A      0X41
#define _CAPITAL_Z      0X5a

#define IS_BASIC(c) ((c)<0x80)
#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)

/**
 * digitToBasic() returns the basic code point whose value
 * (when used for representing integers) is d, which must be in the
 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
 * nonzero, in which case the uppercase form is used.
 */
static inline char
digitToBasic(int32_t digit, UBool uppercase) {
    /*  0..25 map to ASCII a..z or A..Z */
    /* 26..35 map to ASCII 0..9         */
    if(digit<26) {
        if(uppercase) {
            return (char)(_CAPITAL_A+digit);
        } else {
            return (char)(_SMALL_A+digit);
        }
    } else {
        return (char)((_ZERO_-26)+digit);
    }
}

/**
 * basicToDigit[] contains the numeric value of a basic code
 * point (for use in representing integers) in the range 0 to
 * BASE-1, or -1 if b is does not represent a value.
 */
static const int8_t
basicToDigit[256]={
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,

    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};

static inline char
asciiCaseMap(char b, UBool uppercase) {
    if(uppercase) {
        if(_SMALL_A<=b && b<=_SMALL_Z) {
            b-=(_SMALL_A-_CAPITAL_A);
        }
    } else {
        if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
            b+=(_SMALL_A-_CAPITAL_A);
        }
    }
    return b;
}

/* Punycode-specific Bootstring code ---------------------------------------- */

/*
 * The following code omits the {parts} of the pseudo-algorithm in the spec
 * that are not used with the Punycode parameter set.
 */

/* Bias adaptation function. */
static int32_t
adaptBias(int32_t delta, int32_t length, UBool firstTime) {
    int32_t count;

    if(firstTime) {
        delta/=DAMP;
    } else {
        delta/=2;
    }

    delta+=delta/length;
    for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
        delta/=(BASE-TMIN);
    }

    return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
}

#define MAX_CP_COUNT    200

U_CFUNC int32_t
u_strToPunycode(const UChar *src, int32_t srcLength,
                UChar *dest, int32_t destCapacity,
                const UBool *caseFlags,
                UErrorCode *pErrorCode) {

    int32_t cpBuffer[MAX_CP_COUNT];
    int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
    UChar c, c2;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /*
     * Handle the basic code points and
     * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
     */
    srcCPCount=destLength=0;
    if(srcLength==-1) {
        /* NUL-terminated input */
        for(j=0; /* no condition */; ++j) {
            if((c=src[j])==0) {
                break;
            }
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            if(IS_BASIC(c)) {
                cpBuffer[srcCPCount++]=0;
                if(destLength<destCapacity) {
                    dest[destLength]=
                        caseFlags!=NULL ?
                            asciiCaseMap((char)c, caseFlags[j]) :
                            (char)c;
                }
                ++destLength;
            } else {
                n=(caseFlags!=NULL && caseFlags[j])<<31L;
                if(U16_IS_SINGLE(c)) {
                    n|=c;
                } else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
                    ++j;
                    n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    *pErrorCode=U_INVALID_CHAR_FOUND;
                    return 0;
                }
                cpBuffer[srcCPCount++]=n;
            }
        }
    } else {
        /* length-specified input */
        for(j=0; j<srcLength; ++j) {
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            c=src[j];
            if(IS_BASIC(c)) {
                cpBuffer[srcCPCount++]=0;
                if(destLength<destCapacity) {
                    dest[destLength]=
                        caseFlags!=NULL ?
                            asciiCaseMap((char)c, caseFlags[j]) :
                            (char)c;
                }
                ++destLength;
            } else {
                n=(caseFlags!=NULL && caseFlags[j])<<31L;
                if(U16_IS_SINGLE(c)) {
                    n|=c;
                } else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
                    ++j;
                    n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    *pErrorCode=U_INVALID_CHAR_FOUND;
                    return 0;
                }
                cpBuffer[srcCPCount++]=n;
            }
        }
    }

    /* Finish the basic string - if it is not empty - with a delimiter. */
    basicLength=destLength;
    if(basicLength>0) {
        if(destLength<destCapacity) {
            dest[destLength]=DELIMITER;
        }
        ++destLength;
    }

    /*
     * handledCPCount is the number of code points that have been handled
     * basicLength is the number of basic code points
     * destLength is the number of chars that have been output
     */

    /* Initialize the state: */
    n=INITIAL_N;
    delta=0;
    bias=INITIAL_BIAS;

    /* Main encoding loop: */
    for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
        /*
         * All non-basic code points < n have been handled already.
         * Find the next larger one:
         */
        for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(n<=q && q<m) {
                m=q;
            }
        }

        /*
         * Increase delta enough to advance the decoder's
         * <n,i> state to <m,0>, but guard against overflow:
         */
        if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
            return 0;
        }
        delta+=(m-n)*(handledCPCount+1);
        n=m;

        /* Encode a sequence of same code points n */
        for(j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(q<n) {
                ++delta;
            } else if(q==n) {
                /* Represent delta as a generalized variable-length integer: */
                for(q=delta, k=BASE; /* no condition */; k+=BASE) {

                    /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt

                    t=k-bias;
                    if(t<TMIN) {
                        t=TMIN;
                    } else if(t>TMAX) {
                        t=TMAX;
                    }
                    */

                    t=k-bias;
                    if(t<TMIN) {
                        t=TMIN;
                    } else if(k>=(bias+TMAX)) {
                        t=TMAX;
                    }

                    if(q<t) {
                        break;
                    }

                    if(destLength<destCapacity) {
                        dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
                    }
                    ++destLength;
                    q=(q-t)/(BASE-t);
                }

                if(destLength<destCapacity) {
                    dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
                }
                ++destLength;
                bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
                delta=0;
                ++handledCPCount;
            }
        }

        ++delta;
        ++n;
    }

    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}

U_CFUNC int32_t
u_strFromPunycode(const UChar *src, int32_t srcLength,
                  UChar *dest, int32_t destCapacity,
                  UBool *caseFlags,
                  UErrorCode *pErrorCode) {
    int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
            destCPCount, firstSupplementaryIndex, cpLength;
    UChar b;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(srcLength==-1) {
        srcLength=u_strlen(src);
    }

    /*
     * Handle the basic code points:
     * Let basicLength be the number of input code points
     * before the last delimiter, or 0 if there is none,
     * then copy the first basicLength code points to the output.
     *
     * The two following loops iterate backward.
     */
    for(j=srcLength; j>0;) {
        if(src[--j]==DELIMITER) {
            break;
        }
    }
    destLength=basicLength=destCPCount=j;
    U_ASSERT(destLength>=0);

    while(j>0) {
        b=src[--j];
        if(!IS_BASIC(b)) {
            *pErrorCode=U_INVALID_CHAR_FOUND;
            return 0;
        }

        if(j<destCapacity) {
            dest[j]=(UChar)b;

            if(caseFlags!=NULL) {
                caseFlags[j]=IS_BASIC_UPPERCASE(b);
            }
        }
    }

    /* Initialize the state: */
    n=INITIAL_N;
    i=0;
    bias=INITIAL_BIAS;
    firstSupplementaryIndex=1000000000;

    /*
     * Main decoding loop:
     * Start just after the last delimiter if any
     * basic code points were copied; start at the beginning otherwise.
     */
    for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
        /*
         * in is the index of the next character to be consumed, and
         * destCPCount is the number of code points in the output array.
         *
         * Decode a generalized variable-length integer into delta,
         * which gets added to i.  The overflow checking is easier
         * if we increase i as we go, then subtract off its starting
         * value at the end to obtain delta.
         */
        for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
            if(in>=srcLength) {
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }

            digit=basicToDigit[(uint8_t)src[in++]];
            if(digit<0) {
                *pErrorCode=U_INVALID_CHAR_FOUND;
                return 0;
            }
            if(digit>(0x7fffffff-i)/w) {
                /* integer overflow */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }

            i+=digit*w;
            /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt  
            t=k-bias;
            if(t<TMIN) {
                t=TMIN;
            } else if(t>TMAX) {
                t=TMAX;
            }
            */
            t=k-bias;
            if(t<TMIN) {
                t=TMIN;
            } else if(k>=(bias+TMAX)) {
                t=TMAX;
            }
            if(digit<t) {
                break;
            }

            if(w>0x7fffffff/(BASE-t)) {
                /* integer overflow */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }
            w*=BASE-t;
        }

        /*
         * Modification from sample code:
         * Increments destCPCount here,
         * where needed instead of in for() loop tail.
         */
        ++destCPCount;
        bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));

        /*
         * i was supposed to wrap around from (incremented) destCPCount to 0,
         * incrementing n each time, so we'll fix that now:
         */
        if(i/destCPCount>(0x7fffffff-n)) {
            /* integer overflow */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            return 0;
        }

        n+=i/destCPCount;
        i%=destCPCount;
        /* not needed for Punycode: */
        /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */

        if(n>0x10ffff || U_IS_SURROGATE(n)) {
            /* Unicode code point overflow */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            return 0;
        }

        /* Insert n at position i of the output: */
        cpLength=U16_LENGTH(n);
        if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
            int32_t codeUnitIndex;

            /*
             * Handle indexes when supplementary code points are present.
             *
             * In almost all cases, there will be only BMP code points before i
             * and even in the entire string.
             * This is handled with the same efficiency as with UTF-32.
             *
             * Only the rare cases with supplementary code points are handled
             * more slowly - but not too bad since this is an insertion anyway.
             */
            if(i<=firstSupplementaryIndex) {
                codeUnitIndex=i;
                if(cpLength>1) {
                    firstSupplementaryIndex=codeUnitIndex;
                } else {
                    ++firstSupplementaryIndex;
                }
            } else {
                codeUnitIndex=firstSupplementaryIndex;
                U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
            }

            /* use the UChar index codeUnitIndex instead of the code point index i */
            if(codeUnitIndex<destLength) {
                uprv_memmove(dest+codeUnitIndex+cpLength,
                             dest+codeUnitIndex,
                             (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
                if(caseFlags!=NULL) {
                    uprv_memmove(caseFlags+codeUnitIndex+cpLength,
                                 caseFlags+codeUnitIndex,
                                 destLength-codeUnitIndex);
                }
            }
            if(cpLength==1) {
                /* BMP, insert one code unit */
                dest[codeUnitIndex]=(UChar)n;
            } else {
                /* supplementary character, insert two code units */
                dest[codeUnitIndex]=U16_LEAD(n);
                dest[codeUnitIndex+1]=U16_TRAIL(n);
            }
            if(caseFlags!=NULL) {
                /* Case of last character determines uppercase flag: */
                caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
                if(cpLength==2) {
                    caseFlags[codeUnitIndex+1]=FALSE;
                }
            }
        }
        destLength+=cpLength;
        U_ASSERT(destLength>=0);
        ++i;
    }

    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}

/* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */

#endif /* #if !UCONFIG_NO_IDNA */
Commit	Line	Data
b75a7d8f A	1	/*
	2	*******************************************************************************
	3	*
4388f060	4	* Copyright (C) 2002-2011, International Business Machines
b75a7d8f A	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
4388f060	8	* file name: punycode.cpp
b75a7d8f A	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 2002jan31
	14	* created by: Markus W. Scherer
	15	*/
	16
	17
	18	/* This ICU code derived from: */
	19	/*
	20	punycode.c 0.4.0 (2001-Nov-17-Sat)
	21	http://www.cs.berkeley.edu/~amc/idn/
	22	Adam M. Costello
	23	http://www.nicemice.net/amc/
	24
	25	Disclaimer and license
	26
	27	Regarding this entire document or any portion of it (including
	28	the pseudocode and C code), the author makes no guarantees and
	29	is not responsible for any damage resulting from its use. The
	30	author grants irrevocable permission to anyone to use, modify,
	31	and distribute it in any way that does not diminish the rights
	32	of anyone else to use, modify, and distribute it, provided that
	33	redistributed derivative works do not contain misleading author or
	34	version information. Derivative works need not be licensed under
	35	similar terms.
	36	*/
	37	/*
	38	* ICU modifications:
	39	* - ICU data types and coding conventions
	40	* - ICU string buffer handling with implicit source lengths
	41	* and destination preflighting
	42	* - UTF-16 handling
	43	*/
	44
	45	#include "unicode/utypes.h"
	46
	47	#if !UCONFIG_NO_IDNA
	48
4388f060 A	49	#include "unicode/ustring.h"
	50	#include "unicode/utf.h"
	51	#include "unicode/utf16.h"
b75a7d8f A	52	#include "ustr_imp.h"
	53	#include "cstring.h"
	54	#include "cmemory.h"
	55	#include "punycode.h"
4388f060	56	#include "uassert.h"
b75a7d8f A	57
	58
	59	/* Punycode ----------------------------------------------------------------- */
	60
	61	/* Punycode parameters for Bootstring */
	62	#define BASE 36
	63	#define TMIN 1
	64	#define TMAX 26
	65	#define SKEW 38
	66	#define DAMP 700
	67	#define INITIAL_BIAS 72
	68	#define INITIAL_N 0x80
	69
	70	/* "Basic" Unicode/ASCII code points */
	71	#define _HYPHEN 0X2d
	72	#define DELIMITER _HYPHEN
	73
	74	#define _ZERO_ 0X30
	75	#define _NINE 0x39
	76
	77	#define _SMALL_A 0X61
	78	#define _SMALL_Z 0X7a
	79
	80	#define _CAPITAL_A 0X41
	81	#define _CAPITAL_Z 0X5a
	82
	83	#define IS_BASIC(c) ((c)<0x80)
	84	#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
	85
	86	/**
	87	* digitToBasic() returns the basic code point whose value
	88	* (when used for representing integers) is d, which must be in the
	89	* range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
	90	* nonzero, in which case the uppercase form is used.
	91	*/
4388f060	92	static inline char
b75a7d8f A	93	digitToBasic(int32_t digit, UBool uppercase) {
	94	/* 0..25 map to ASCII a..z or A..Z */
	95	/* 26..35 map to ASCII 0..9 */
	96	if(digit<26) {
	97	if(uppercase) {
	98	return (char)(_CAPITAL_A+digit);
	99	} else {
	100	return (char)(_SMALL_A+digit);
	101	}
	102	} else {
	103	return (char)((_ZERO_-26)+digit);
	104	}
	105	}
	106
	107	/**
	108	* basicToDigit[] contains the numeric value of a basic code
	109	* point (for use in representing integers) in the range 0 to
	110	* BASE-1, or -1 if b is does not represent a value.
	111	*/
	112	static const int8_t
	113	basicToDigit[256]={
	114	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	115	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	116
	117	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	118	26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
	119
	120	-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
	121	15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
	122
	123	-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
	124	15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
	125
	126	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	127	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	128
	129	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	130	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	131
	132	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	133	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	134
	135	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	136	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
	137	};
	138
4388f060	139	static inline char
b75a7d8f A	140	asciiCaseMap(char b, UBool uppercase) {
	141	if(uppercase) {
	142	if(_SMALL_A<=b && b<=_SMALL_Z) {
	143	b-=(_SMALL_A-_CAPITAL_A);
	144	}
	145	} else {
	146	if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
	147	b+=(_SMALL_A-_CAPITAL_A);
	148	}
	149	}
	150	return b;
	151	}
	152
	153	/* Punycode-specific Bootstring code ---------------------------------------- */
	154
	155	/*
	156	* The following code omits the {parts} of the pseudo-algorithm in the spec
	157	* that are not used with the Punycode parameter set.
	158	*/
	159
	160	/* Bias adaptation function. */
	161	static int32_t
	162	adaptBias(int32_t delta, int32_t length, UBool firstTime) {
	163	int32_t count;
	164
	165	if(firstTime) {
	166	delta/=DAMP;
	167	} else {
	168	delta/=2;
	169	}
	170
	171	delta+=delta/length;
	172	for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
	173	delta/=(BASE-TMIN);
	174	}
	175
	176	return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
	177	}
	178
	179	#define MAX_CP_COUNT 200
	180
	181	U_CFUNC int32_t
	182	u_strToPunycode(const UChar *src, int32_t srcLength,
	183	UChar *dest, int32_t destCapacity,
	184	const UBool *caseFlags,
	185	UErrorCode *pErrorCode) {
	186
	187	int32_t cpBuffer[MAX_CP_COUNT];
	188	int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
	189	UChar c, c2;
	190
	191	/* argument checking */
	192	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
	193	return 0;
	194	}
	195
	196	if(src==NULL \|\| srcLength<-1 \|\| (dest==NULL && destCapacity!=0)) {
	197	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
	198	return 0;
	199	}
	200
	201	/*
	202	* Handle the basic code points and
	203	* convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
204	*/
205	srcCPCount=destLength=0;
206	if(srcLength==-1) {
207	/* NUL-terminated input */
208	for(j=0; /* no condition */; ++j) {
209	if((c=src[j])==0) {
210	break;
211	}
212	if(srcCPCount==MAX_CP_COUNT) {
213	/* too many input code points */
214	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
215	return 0;
216	}
217	if(IS_BASIC(c)) {
218	cpBuffer[srcCPCount++]=0;
219	if(destLength<destCapacity) {
220	dest[destLength]=
221	caseFlags!=NULL ?
222	asciiCaseMap((char)c, caseFlags[j]) :
223	(char)c;
224	}
225	++destLength;
226	} else {
227	n=(caseFlags!=NULL && caseFlags[j])<<31L;
4388f060	228	if(U16_IS_SINGLE(c)) {
b75a7d8f	229	n\|=c;
4388f060	230	} else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
b75a7d8f	231	++j;
4388f060	232	n\|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
b75a7d8f A	233	} else {
	234	/* error: unmatched surrogate */
	235	*pErrorCode=U_INVALID_CHAR_FOUND;
	236	return 0;
	237	}
	238	cpBuffer[srcCPCount++]=n;
	239	}
	240	}
	241	} else {
	242	/* length-specified input */
	243	for(j=0; j<srcLength; ++j) {
	244	if(srcCPCount==MAX_CP_COUNT) {
	245	/* too many input code points */
	246	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
	247	return 0;
	248	}
	249	c=src[j];
	250	if(IS_BASIC(c)) {
729e4ab9	251	cpBuffer[srcCPCount++]=0;
b75a7d8f	252	if(destLength<destCapacity) {
b75a7d8f A	253	dest[destLength]=
	254	caseFlags!=NULL ?
	255	asciiCaseMap((char)c, caseFlags[j]) :
	256	(char)c;
	257	}
	258	++destLength;
	259	} else {
	260	n=(caseFlags!=NULL && caseFlags[j])<<31L;
4388f060	261	if(U16_IS_SINGLE(c)) {
b75a7d8f	262	n\|=c;
4388f060	263	} else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
b75a7d8f	264	++j;
4388f060	265	n\|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
b75a7d8f A	266	} else {
	267	/* error: unmatched surrogate */
	268	*pErrorCode=U_INVALID_CHAR_FOUND;
	269	return 0;
	270	}
	271	cpBuffer[srcCPCount++]=n;
	272	}
	273	}
	274	}
	275
	276	/* Finish the basic string - if it is not empty - with a delimiter. */
	277	basicLength=destLength;
	278	if(basicLength>0) {
	279	if(destLength<destCapacity) {
	280	dest[destLength]=DELIMITER;
	281	}
	282	++destLength;
	283	}
	284
	285	/*
	286	* handledCPCount is the number of code points that have been handled
	287	* basicLength is the number of basic code points
	288	* destLength is the number of chars that have been output
	289	*/
	290
	291	/* Initialize the state: */
	292	n=INITIAL_N;
	293	delta=0;
	294	bias=INITIAL_BIAS;
	295
	296	/* Main encoding loop: */
	297	for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
	298	/*
	299	* All non-basic code points < n have been handled already.
	300	* Find the next larger one:
	301	*/
	302	for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
	303	q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
	304	if(n<=q && q<m) {
	305	m=q;
	306	}
	307	}
	308
	309	/*
	310	* Increase delta enough to advance the decoder's
	311	* <n,i> state to <m,0>, but guard against overflow:
	312	*/
	313	if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
	314	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
	315	return 0;
	316	}
	317	delta+=(m-n)*(handledCPCount+1);
	318	n=m;
	319
	320	/* Encode a sequence of same code points n */
	321	for(j=0; j<srcCPCount; ++j) {
	322	q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
	323	if(q<n) {
	324	++delta;
	325	} else if(q==n) {
	326	/* Represent delta as a generalized variable-length integer: */
	327	for(q=delta, k=BASE; /* no condition */; k+=BASE) {
	328
729e4ab9	329	/** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
b75a7d8f A	330
	331	t=k-bias;
	332	if(t<TMIN) {
	333	t=TMIN;
	334	} else if(t>TMAX) {
	335	t=TMAX;
	336	}
	337	*/
729e4ab9	338
b75a7d8f A	339	t=k-bias;
	340	if(t<TMIN) {
	341	t=TMIN;
	342	} else if(k>=(bias+TMAX)) {
	343	t=TMAX;
	344	}
	345
	346	if(q<t) {
	347	break;
	348	}
	349
	350	if(destLength<destCapacity) {
729e4ab9	351	dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
b75a7d8f	352	}
729e4ab9	353	++destLength;
b75a7d8f A	354	q=(q-t)/(BASE-t);
	355	}
	356
	357	if(destLength<destCapacity) {
729e4ab9	358	dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
b75a7d8f	359	}
729e4ab9	360	++destLength;
b75a7d8f A	361	bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
	362	delta=0;
	363	++handledCPCount;
	364	}
	365	}
	366
	367	++delta;
	368	++n;
	369	}
	370
	371	return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
	372	}
	373
	374	U_CFUNC int32_t
	375	u_strFromPunycode(const UChar *src, int32_t srcLength,
	376	UChar *dest, int32_t destCapacity,
	377	UBool *caseFlags,
	378	UErrorCode *pErrorCode) {
	379	int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
	380	destCPCount, firstSupplementaryIndex, cpLength;
	381	UChar b;
	382
	383	/* argument checking */
	384	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
	385	return 0;
	386	}
	387
	388	if(src==NULL \|\| srcLength<-1 \|\| (dest==NULL && destCapacity!=0)) {
	389	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
	390	return 0;
	391	}
	392
	393	if(srcLength==-1) {
	394	srcLength=u_strlen(src);
	395	}
	396
	397	/*
	398	* Handle the basic code points:
	399	* Let basicLength be the number of input code points
	400	* before the last delimiter, or 0 if there is none,
	401	* then copy the first basicLength code points to the output.
	402	*
	403	* The two following loops iterate backward.
	404	*/
	405	for(j=srcLength; j>0;) {
	406	if(src[--j]==DELIMITER) {
	407	break;
	408	}
	409	}
	410	destLength=basicLength=destCPCount=j;
4388f060	411	U_ASSERT(destLength>=0);
b75a7d8f A	412
	413	while(j>0) {
	414	b=src[--j];
	415	if(!IS_BASIC(b)) {
	416	*pErrorCode=U_INVALID_CHAR_FOUND;
	417	return 0;
	418	}
	419
	420	if(j<destCapacity) {
	421	dest[j]=(UChar)b;
	422
	423	if(caseFlags!=NULL) {
	424	caseFlags[j]=IS_BASIC_UPPERCASE(b);
	425	}
	426	}
	427	}
	428
	429	/* Initialize the state: */
	430	n=INITIAL_N;
	431	i=0;
	432	bias=INITIAL_BIAS;
	433	firstSupplementaryIndex=1000000000;
	434
	435	/*
	436	* Main decoding loop:
	437	* Start just after the last delimiter if any
	438	* basic code points were copied; start at the beginning otherwise.
	439	*/
	440	for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
	441	/*
	442	* in is the index of the next character to be consumed, and
	443	* destCPCount is the number of code points in the output array.
	444	*
	445	* Decode a generalized variable-length integer into delta,
	446	* which gets added to i. The overflow checking is easier
	447	* if we increase i as we go, then subtract off its starting
	448	* value at the end to obtain delta.
	449	*/
	450	for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
	451	if(in>=srcLength) {
	452	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
	453	return 0;
	454	}
	455
	456	digit=basicToDigit[(uint8_t)src[in++]];
	457	if(digit<0) {
	458	*pErrorCode=U_INVALID_CHAR_FOUND;
	459	return 0;
	460	}
	461	if(digit>(0x7fffffff-i)/w) {
	462	/* integer overflow */
	463	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
	464	return 0;
	465	}
	466
	467	i+=digit*w;
	468	/** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
	469	t=k-bias;
	470	if(t<TMIN) {
	471	t=TMIN;
	472	} else if(t>TMAX) {
	473	t=TMAX;
	474	}
	475	*/
476	t=k-bias;
477	if(t<TMIN) {
478	t=TMIN;
479	} else if(k>=(bias+TMAX)) {
480	t=TMAX;
481	}
482	if(digit<t) {
483	break;
484	}
485
486	if(w>0x7fffffff/(BASE-t)) {
487	/* integer overflow */
488	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
489	return 0;
490	}
491	w*=BASE-t;
492	}
493
494	/*
495	* Modification from sample code:
496	* Increments destCPCount here,
497	* where needed instead of in for() loop tail.
498	*/
499	++destCPCount;
500	bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
501
502	/*
503	* i was supposed to wrap around from (incremented) destCPCount to 0,
504	* incrementing n each time, so we'll fix that now:
505	*/
506	if(i/destCPCount>(0x7fffffff-n)) {
507	/* integer overflow */
508	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
509	return 0;
510	}
511
512	n+=i/destCPCount;
513	i%=destCPCount;
514	/* not needed for Punycode: */
515	/* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
516
4388f060	517	if(n>0x10ffff \|\| U_IS_SURROGATE(n)) {
b75a7d8f A	518	/* Unicode code point overflow */
	519	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
	520	return 0;
	521	}
	522
	523	/* Insert n at position i of the output: */
4388f060 A	524	cpLength=U16_LENGTH(n);
4388f060 A	525	if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
b75a7d8f A	526	int32_t codeUnitIndex;
	527
	528	/*
	529	* Handle indexes when supplementary code points are present.
	530	*
	531	* In almost all cases, there will be only BMP code points before i
	532	* and even in the entire string.
	533	* This is handled with the same efficiency as with UTF-32.
	534	*
	535	* Only the rare cases with supplementary code points are handled
	536	* more slowly - but not too bad since this is an insertion anyway.
	537	*/
	538	if(i<=firstSupplementaryIndex) {
	539	codeUnitIndex=i;
	540	if(cpLength>1) {
	541	firstSupplementaryIndex=codeUnitIndex;
	542	} else {
	543	++firstSupplementaryIndex;
	544	}
	545	} else {
	546	codeUnitIndex=firstSupplementaryIndex;
4388f060	547	U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
b75a7d8f A	548	}
	549
	550	/* use the UChar index codeUnitIndex instead of the code point index i */
	551	if(codeUnitIndex<destLength) {
	552	uprv_memmove(dest+codeUnitIndex+cpLength,
	553	dest+codeUnitIndex,
	554	(destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
	555	if(caseFlags!=NULL) {
	556	uprv_memmove(caseFlags+codeUnitIndex+cpLength,
	557	caseFlags+codeUnitIndex,
	558	destLength-codeUnitIndex);
	559	}
	560	}
	561	if(cpLength==1) {
	562	/* BMP, insert one code unit */
	563	dest[codeUnitIndex]=(UChar)n;
	564	} else {
	565	/* supplementary character, insert two code units */
4388f060 A	566	dest[codeUnitIndex]=U16_LEAD(n);
4388f060 A	567	dest[codeUnitIndex+1]=U16_TRAIL(n);
b75a7d8f A	568	}
	569	if(caseFlags!=NULL) {
	570	/* Case of last character determines uppercase flag: */
	571	caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
	572	if(cpLength==2) {
	573	caseFlags[codeUnitIndex+1]=FALSE;
	574	}
	575	}
	576	}
	577	destLength+=cpLength;
4388f060	578	U_ASSERT(destLength>=0);
b75a7d8f A	579	++i;
	580	}
	581
	582	return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
	583	}
	584
	585	/* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
	586
	587	#endif /* #if !UCONFIG_NO_IDNA */