[apple/icu.git] / icuSources / common / punycode.c

/*
*******************************************************************************
*
*   Copyright (C) 2002-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  punycode.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jan31
*   created by: Markus W. Scherer
*/


/* This ICU code derived from: */
/*
punycode.c 0.4.0 (2001-Nov-17-Sat)
http://www.cs.berkeley.edu/~amc/idn/
Adam M. Costello
http://www.nicemice.net/amc/

Disclaimer and license

    Regarding this entire document or any portion of it (including
    the pseudocode and C code), the author makes no guarantees and
    is not responsible for any damage resulting from its use.  The
    author grants irrevocable permission to anyone to use, modify,
    and distribute it in any way that does not diminish the rights
    of anyone else to use, modify, and distribute it, provided that
    redistributed derivative works do not contain misleading author or
    version information.  Derivative works need not be licensed under
    similar terms.
*/
/*
 * ICU modifications:
 * - ICU data types and coding conventions
 * - ICU string buffer handling with implicit source lengths
 *   and destination preflighting
 * - UTF-16 handling
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_IDNA

#include "ustr_imp.h"
#include "cstring.h"
#include "cmemory.h"
#include "punycode.h"
#include "unicode/ustring.h"


/* Punycode ----------------------------------------------------------------- */

/* Punycode parameters for Bootstring */
#define BASE            36
#define TMIN            1
#define TMAX            26
#define SKEW            38
#define DAMP            700
#define INITIAL_BIAS    72
#define INITIAL_N       0x80

/* "Basic" Unicode/ASCII code points */
#define _HYPHEN         0X2d
#define DELIMITER       _HYPHEN

#define _ZERO_          0X30
#define _NINE           0x39

#define _SMALL_A        0X61
#define _SMALL_Z        0X7a

#define _CAPITAL_A      0X41
#define _CAPITAL_Z      0X5a

#define IS_BASIC(c) ((c)<0x80)
#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)

/**
 * digitToBasic() returns the basic code point whose value
 * (when used for representing integers) is d, which must be in the
 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
 * nonzero, in which case the uppercase form is used.
 */
static U_INLINE char
digitToBasic(int32_t digit, UBool uppercase) {
    /*  0..25 map to ASCII a..z or A..Z */
    /* 26..35 map to ASCII 0..9         */
    if(digit<26) {
        if(uppercase) {
            return (char)(_CAPITAL_A+digit);
        } else {
            return (char)(_SMALL_A+digit);
        }
    } else {
        return (char)((_ZERO_-26)+digit);
    }
}

/**
 * basicToDigit[] contains the numeric value of a basic code
 * point (for use in representing integers) in the range 0 to
 * BASE-1, or -1 if b is does not represent a value.
 */
static const int8_t
basicToDigit[256]={
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,

    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};

static U_INLINE char
asciiCaseMap(char b, UBool uppercase) {
    if(uppercase) {
        if(_SMALL_A<=b && b<=_SMALL_Z) {
            b-=(_SMALL_A-_CAPITAL_A);
        }
    } else {
        if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
            b+=(_SMALL_A-_CAPITAL_A);
        }
    }
    return b;
}

/* Punycode-specific Bootstring code ---------------------------------------- */

/*
 * The following code omits the {parts} of the pseudo-algorithm in the spec
 * that are not used with the Punycode parameter set.
 */

/* Bias adaptation function. */
static int32_t
adaptBias(int32_t delta, int32_t length, UBool firstTime) {
    int32_t count;

    if(firstTime) {
        delta/=DAMP;
    } else {
        delta/=2;
    }

    delta+=delta/length;
    for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
        delta/=(BASE-TMIN);
    }

    return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
}

#define MAX_CP_COUNT    200

U_CFUNC int32_t
u_strToPunycode(const UChar *src, int32_t srcLength,
                UChar *dest, int32_t destCapacity,
                const UBool *caseFlags,
                UErrorCode *pErrorCode) {

    int32_t cpBuffer[MAX_CP_COUNT];
    int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
    UChar c, c2;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /*
     * Handle the basic code points and
     * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
     */
    srcCPCount=destLength=0;
    if(srcLength==-1) {
        /* NUL-terminated input */
        for(j=0; /* no condition */; ++j) {
            if((c=src[j])==0) {
                break;
            }
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            if(IS_BASIC(c)) {
                cpBuffer[srcCPCount++]=0;
                if(destLength<destCapacity) {
                    dest[destLength]=
                        caseFlags!=NULL ?
                            asciiCaseMap((char)c, caseFlags[j]) :
                            (char)c;
                }
                ++destLength;
            } else {
                n=(caseFlags!=NULL && caseFlags[j])<<31L;
                if(UTF_IS_SINGLE(c)) {
                    n|=c;
                } else if(UTF_IS_LEAD(c) && UTF_IS_TRAIL(c2=src[j+1])) {
                    ++j;
                    n|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    *pErrorCode=U_INVALID_CHAR_FOUND;
                    return 0;
                }
                cpBuffer[srcCPCount++]=n;
            }
        }
    } else {
        /* length-specified input */
        for(j=0; j<srcLength; ++j) {
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            c=src[j];
            if(IS_BASIC(c)) {
                cpBuffer[srcCPCount++]=0;
                if(destLength<destCapacity) {
                    dest[destLength]=
                        caseFlags!=NULL ?
                            asciiCaseMap((char)c, caseFlags[j]) :
                            (char)c;
                }
                ++destLength;
            } else {
                n=(caseFlags!=NULL && caseFlags[j])<<31L;
                if(UTF_IS_SINGLE(c)) {
                    n|=c;
                } else if(UTF_IS_LEAD(c) && (j+1)<srcLength && UTF_IS_TRAIL(c2=src[j+1])) {
                    ++j;
                    n|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    *pErrorCode=U_INVALID_CHAR_FOUND;
                    return 0;
                }
                cpBuffer[srcCPCount++]=n;
            }
        }
    }

    /* Finish the basic string - if it is not empty - with a delimiter. */
    basicLength=destLength;
    if(basicLength>0) {
        if(destLength<destCapacity) {
            dest[destLength]=DELIMITER;
        }
        ++destLength;
    }

    /*
     * handledCPCount is the number of code points that have been handled
     * basicLength is the number of basic code points
     * destLength is the number of chars that have been output
     */

    /* Initialize the state: */
    n=INITIAL_N;
    delta=0;
    bias=INITIAL_BIAS;

    /* Main encoding loop: */
    for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
        /*
         * All non-basic code points < n have been handled already.
         * Find the next larger one:
         */
        for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(n<=q && q<m) {
                m=q;
            }
        }

        /*
         * Increase delta enough to advance the decoder's
         * <n,i> state to <m,0>, but guard against overflow:
         */
        if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
            return 0;
        }
        delta+=(m-n)*(handledCPCount+1);
        n=m;

        /* Encode a sequence of same code points n */
        for(j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(q<n) {
                ++delta;
            } else if(q==n) {
                /* Represent delta as a generalized variable-length integer: */
                for(q=delta, k=BASE; /* no condition */; k+=BASE) {

                    /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt

                    t=k-bias;
                    if(t<TMIN) {
                        t=TMIN;
                    } else if(t>TMAX) {
                        t=TMAX;
                    }
                    */

                    t=k-bias;
                    if(t<TMIN) {
                        t=TMIN;
                    } else if(k>=(bias+TMAX)) {
                        t=TMAX;
                    }

                    if(q<t) {
                        break;
                    }

                    if(destLength<destCapacity) {
                        dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
                    }
                    ++destLength;
                    q=(q-t)/(BASE-t);
                }

                if(destLength<destCapacity) {
                    dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
                }
                ++destLength;
                bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
                delta=0;
                ++handledCPCount;
            }
        }

        ++delta;
        ++n;
    }

    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}

U_CFUNC int32_t
u_strFromPunycode(const UChar *src, int32_t srcLength,
                  UChar *dest, int32_t destCapacity,
                  UBool *caseFlags,
                  UErrorCode *pErrorCode) {
    int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
            destCPCount, firstSupplementaryIndex, cpLength;
    UChar b;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(srcLength==-1) {
        srcLength=u_strlen(src);
    }

    /*
     * Handle the basic code points:
     * Let basicLength be the number of input code points
     * before the last delimiter, or 0 if there is none,
     * then copy the first basicLength code points to the output.
     *
     * The two following loops iterate backward.
     */
    for(j=srcLength; j>0;) {
        if(src[--j]==DELIMITER) {
            break;
        }
    }
    destLength=basicLength=destCPCount=j;

    while(j>0) {
        b=src[--j];
        if(!IS_BASIC(b)) {
            *pErrorCode=U_INVALID_CHAR_FOUND;
            return 0;
        }

        if(j<destCapacity) {
            dest[j]=(UChar)b;

            if(caseFlags!=NULL) {
                caseFlags[j]=IS_BASIC_UPPERCASE(b);
            }
        }
    }

    /* Initialize the state: */
    n=INITIAL_N;
    i=0;
    bias=INITIAL_BIAS;
    firstSupplementaryIndex=1000000000;

    /*
     * Main decoding loop:
     * Start just after the last delimiter if any
     * basic code points were copied; start at the beginning otherwise.
     */
    for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
        /*
         * in is the index of the next character to be consumed, and
         * destCPCount is the number of code points in the output array.
         *
         * Decode a generalized variable-length integer into delta,
         * which gets added to i.  The overflow checking is easier
         * if we increase i as we go, then subtract off its starting
         * value at the end to obtain delta.
         */
        for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
            if(in>=srcLength) {
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }

            digit=basicToDigit[(uint8_t)src[in++]];
            if(digit<0) {
                *pErrorCode=U_INVALID_CHAR_FOUND;
                return 0;
            }
            if(digit>(0x7fffffff-i)/w) {
                /* integer overflow */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }

            i+=digit*w;
            /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt  
            t=k-bias;
            if(t<TMIN) {
                t=TMIN;
            } else if(t>TMAX) {
                t=TMAX;
            }
            */
            t=k-bias;
            if(t<TMIN) {
                t=TMIN;
            } else if(k>=(bias+TMAX)) {
                t=TMAX;
            }
            if(digit<t) {
                break;
            }

            if(w>0x7fffffff/(BASE-t)) {
                /* integer overflow */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }
            w*=BASE-t;
        }

        /*
         * Modification from sample code:
         * Increments destCPCount here,
         * where needed instead of in for() loop tail.
         */
        ++destCPCount;
        bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));

        /*
         * i was supposed to wrap around from (incremented) destCPCount to 0,
         * incrementing n each time, so we'll fix that now:
         */
        if(i/destCPCount>(0x7fffffff-n)) {
            /* integer overflow */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            return 0;
        }

        n+=i/destCPCount;
        i%=destCPCount;
        /* not needed for Punycode: */
        /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */

        if(n>0x10ffff || UTF_IS_SURROGATE(n)) {
            /* Unicode code point overflow */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            return 0;
        }

        /* Insert n at position i of the output: */
        cpLength=UTF_CHAR_LENGTH(n);
        if((destLength+cpLength)<=destCapacity) {
            int32_t codeUnitIndex;

            /*
             * Handle indexes when supplementary code points are present.
             *
             * In almost all cases, there will be only BMP code points before i
             * and even in the entire string.
             * This is handled with the same efficiency as with UTF-32.
             *
             * Only the rare cases with supplementary code points are handled
             * more slowly - but not too bad since this is an insertion anyway.
             */
            if(i<=firstSupplementaryIndex) {
                codeUnitIndex=i;
                if(cpLength>1) {
                    firstSupplementaryIndex=codeUnitIndex;
                } else {
                    ++firstSupplementaryIndex;
                }
            } else {
                codeUnitIndex=firstSupplementaryIndex;
                UTF_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
            }

            /* use the UChar index codeUnitIndex instead of the code point index i */
            if(codeUnitIndex<destLength) {
                uprv_memmove(dest+codeUnitIndex+cpLength,
                             dest+codeUnitIndex,
                             (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
                if(caseFlags!=NULL) {
                    uprv_memmove(caseFlags+codeUnitIndex+cpLength,
                                 caseFlags+codeUnitIndex,
                                 destLength-codeUnitIndex);
                }
            }
            if(cpLength==1) {
                /* BMP, insert one code unit */
                dest[codeUnitIndex]=(UChar)n;
            } else {
                /* supplementary character, insert two code units */
                dest[codeUnitIndex]=UTF16_LEAD(n);
                dest[codeUnitIndex+1]=UTF16_TRAIL(n);
            }
            if(caseFlags!=NULL) {
                /* Case of last character determines uppercase flag: */
                caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
                if(cpLength==2) {
                    caseFlags[codeUnitIndex+1]=FALSE;
                }
            }
        }
        destLength+=cpLength;
        ++i;
    }

    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}

/* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */

#endif /* #if !UCONFIG_NO_IDNA */
Commit	Line	Data
b75a7d8f A	1	/*
	2	*******************************************************************************
	3	*
729e4ab9	4	* Copyright (C) 2002-2010, International Business Machines
b75a7d8f A	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	* file name: punycode.c
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 2002jan31
	14	* created by: Markus W. Scherer
	15	*/
	16
	17
	18	/* This ICU code derived from: */
	19	/*
	20	punycode.c 0.4.0 (2001-Nov-17-Sat)
	21	http://www.cs.berkeley.edu/~amc/idn/
	22	Adam M. Costello
	23	http://www.nicemice.net/amc/
	24
	25	Disclaimer and license
	26
	27	Regarding this entire document or any portion of it (including
	28	the pseudocode and C code), the author makes no guarantees and
	29	is not responsible for any damage resulting from its use. The
	30	author grants irrevocable permission to anyone to use, modify,
	31	and distribute it in any way that does not diminish the rights
	32	of anyone else to use, modify, and distribute it, provided that
	33	redistributed derivative works do not contain misleading author or
	34	version information. Derivative works need not be licensed under
	35	similar terms.
	36	*/
	37	/*
	38	* ICU modifications:
	39	* - ICU data types and coding conventions
	40	* - ICU string buffer handling with implicit source lengths
	41	* and destination preflighting
	42	* - UTF-16 handling
	43	*/
	44
	45	#include "unicode/utypes.h"
	46
	47	#if !UCONFIG_NO_IDNA
	48
	49	#include "ustr_imp.h"
	50	#include "cstring.h"
	51	#include "cmemory.h"
	52	#include "punycode.h"
	53	#include "unicode/ustring.h"
	54
	55
	56	/* Punycode ----------------------------------------------------------------- */
	57
	58	/* Punycode parameters for Bootstring */
	59	#define BASE 36
	60	#define TMIN 1
	61	#define TMAX 26
	62	#define SKEW 38
	63	#define DAMP 700
	64	#define INITIAL_BIAS 72
	65	#define INITIAL_N 0x80
	66
	67	/* "Basic" Unicode/ASCII code points */
	68	#define _HYPHEN 0X2d
69	#define DELIMITER _HYPHEN
70
71	#define _ZERO_ 0X30
72	#define _NINE 0x39
73
74	#define _SMALL_A 0X61
75	#define _SMALL_Z 0X7a
76
77	#define _CAPITAL_A 0X41
78	#define _CAPITAL_Z 0X5a
79
80	#define IS_BASIC(c) ((c)<0x80)
81	#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
82
83	/**
84	* digitToBasic() returns the basic code point whose value
85	* (when used for representing integers) is d, which must be in the
86	* range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
87	* nonzero, in which case the uppercase form is used.
88	*/
89	static U_INLINE char
90	digitToBasic(int32_t digit, UBool uppercase) {
91	/* 0..25 map to ASCII a..z or A..Z */
92	/* 26..35 map to ASCII 0..9 */
93	if(digit<26) {
94	if(uppercase) {
95	return (char)(_CAPITAL_A+digit);
96	} else {
97	return (char)(_SMALL_A+digit);
98	}
99	} else {
100	return (char)((_ZERO_-26)+digit);
101	}
102	}
103
104	/**
105	* basicToDigit[] contains the numeric value of a basic code
106	* point (for use in representing integers) in the range 0 to
107	* BASE-1, or -1 if b is does not represent a value.
108	*/
109	static const int8_t
110	basicToDigit[256]={
111	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
112	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
113
114	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
115	26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
116
117	-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
118	15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
119
120	-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
121	15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
122
123	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
124	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
125
126	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
127	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
128
129	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
130	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
131
132	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
133	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
134	};
135
136	static U_INLINE char
137	asciiCaseMap(char b, UBool uppercase) {
138	if(uppercase) {
139	if(_SMALL_A<=b && b<=_SMALL_Z) {
140	b-=(_SMALL_A-_CAPITAL_A);
141	}
142	} else {
143	if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
144	b+=(_SMALL_A-_CAPITAL_A);
145	}
146	}
147	return b;
148	}
149
150	/* Punycode-specific Bootstring code ---------------------------------------- */
151
152	/*
153	* The following code omits the {parts} of the pseudo-algorithm in the spec
154	* that are not used with the Punycode parameter set.
155	*/
156
157	/* Bias adaptation function. */
158	static int32_t
159	adaptBias(int32_t delta, int32_t length, UBool firstTime) {
160	int32_t count;
161
162	if(firstTime) {
163	delta/=DAMP;
164	} else {
165	delta/=2;
166	}
167
168	delta+=delta/length;
169	for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
170	delta/=(BASE-TMIN);
171	}
172
173	return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
174	}
175
176	#define MAX_CP_COUNT 200
177
178	U_CFUNC int32_t
179	u_strToPunycode(const UChar *src, int32_t srcLength,
180	UChar *dest, int32_t destCapacity,
181	const UBool *caseFlags,
182	UErrorCode *pErrorCode) {
183
184	int32_t cpBuffer[MAX_CP_COUNT];
185	int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
186	UChar c, c2;
187
188	/* argument checking */
189	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
190	return 0;
191	}
192
193	if(src==NULL \|\| srcLength<-1 \|\| (dest==NULL && destCapacity!=0)) {
194	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
195	return 0;
196	}
197
198	/*
199	* Handle the basic code points and
200	* convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
201	*/
202	srcCPCount=destLength=0;
203	if(srcLength==-1) {
204	/* NUL-terminated input */
205	for(j=0; /* no condition */; ++j) {
206	if((c=src[j])==0) {
207	break;
208	}
209	if(srcCPCount==MAX_CP_COUNT) {
210	/* too many input code points */
211	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
212	return 0;
213	}
214	if(IS_BASIC(c)) {
215	cpBuffer[srcCPCount++]=0;
216	if(destLength<destCapacity) {
217	dest[destLength]=
218	caseFlags!=NULL ?
219	asciiCaseMap((char)c, caseFlags[j]) :
220	(char)c;
221	}
222	++destLength;
223	} else {
224	n=(caseFlags!=NULL && caseFlags[j])<<31L;
225	if(UTF_IS_SINGLE(c)) {
226	n\|=c;
227	} else if(UTF_IS_LEAD(c) && UTF_IS_TRAIL(c2=src[j+1])) {
228	++j;
229	n\|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2);
230	} else {
231	/* error: unmatched surrogate */
232	*pErrorCode=U_INVALID_CHAR_FOUND;
233	return 0;
234	}
235	cpBuffer[srcCPCount++]=n;
236	}
237	}
238	} else {
239	/* length-specified input */
240	for(j=0; j<srcLength; ++j) {
241	if(srcCPCount==MAX_CP_COUNT) {
242	/* too many input code points */
243	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
244	return 0;
245	}
246	c=src[j];
247	if(IS_BASIC(c)) {
729e4ab9	248	cpBuffer[srcCPCount++]=0;
b75a7d8f	249	if(destLength<destCapacity) {
b75a7d8f A	250	dest[destLength]=
	251	caseFlags!=NULL ?
	252	asciiCaseMap((char)c, caseFlags[j]) :
	253	(char)c;
	254	}
	255	++destLength;
	256	} else {
	257	n=(caseFlags!=NULL && caseFlags[j])<<31L;
	258	if(UTF_IS_SINGLE(c)) {
	259	n\|=c;
	260	} else if(UTF_IS_LEAD(c) && (j+1)<srcLength && UTF_IS_TRAIL(c2=src[j+1])) {
	261	++j;
	262	n\|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2);
	263	} else {
	264	/* error: unmatched surrogate */
	265	*pErrorCode=U_INVALID_CHAR_FOUND;
	266	return 0;
	267	}
	268	cpBuffer[srcCPCount++]=n;
	269	}
	270	}
	271	}
	272
	273	/* Finish the basic string - if it is not empty - with a delimiter. */
	274	basicLength=destLength;
	275	if(basicLength>0) {
	276	if(destLength<destCapacity) {
	277	dest[destLength]=DELIMITER;
	278	}
	279	++destLength;
	280	}
	281
	282	/*
	283	* handledCPCount is the number of code points that have been handled
	284	* basicLength is the number of basic code points
	285	* destLength is the number of chars that have been output
	286	*/
	287
	288	/* Initialize the state: */
	289	n=INITIAL_N;
	290	delta=0;
	291	bias=INITIAL_BIAS;
	292
	293	/* Main encoding loop: */
	294	for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
	295	/*
	296	* All non-basic code points < n have been handled already.
	297	* Find the next larger one:
	298	*/
	299	for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
	300	q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
	301	if(n<=q && q<m) {
	302	m=q;
	303	}
	304	}
	305
	306	/*
	307	* Increase delta enough to advance the decoder's
	308	* <n,i> state to <m,0>, but guard against overflow:
	309	*/
	310	if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
	311	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
	312	return 0;
	313	}
314	delta+=(m-n)*(handledCPCount+1);
315	n=m;
316
317	/* Encode a sequence of same code points n */
318	for(j=0; j<srcCPCount; ++j) {
319	q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
320	if(q<n) {
321	++delta;
322	} else if(q==n) {
323	/* Represent delta as a generalized variable-length integer: */
324	for(q=delta, k=BASE; /* no condition */; k+=BASE) {
325
729e4ab9	326	/** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
b75a7d8f A	327
	328	t=k-bias;
	329	if(t<TMIN) {
	330	t=TMIN;
	331	} else if(t>TMAX) {
	332	t=TMAX;
	333	}
	334	*/
729e4ab9	335
b75a7d8f A	336	t=k-bias;
	337	if(t<TMIN) {
	338	t=TMIN;
	339	} else if(k>=(bias+TMAX)) {
	340	t=TMAX;
	341	}
	342
	343	if(q<t) {
	344	break;
	345	}
	346
	347	if(destLength<destCapacity) {
729e4ab9	348	dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
b75a7d8f	349	}
729e4ab9	350	++destLength;
b75a7d8f A	351	q=(q-t)/(BASE-t);
	352	}
	353
	354	if(destLength<destCapacity) {
729e4ab9	355	dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
b75a7d8f	356	}
729e4ab9	357	++destLength;
b75a7d8f A	358	bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
	359	delta=0;
	360	++handledCPCount;
	361	}
	362	}
	363
	364	++delta;
	365	++n;
	366	}
	367
	368	return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
	369	}
	370
	371	U_CFUNC int32_t
	372	u_strFromPunycode(const UChar *src, int32_t srcLength,
	373	UChar *dest, int32_t destCapacity,
	374	UBool *caseFlags,
	375	UErrorCode *pErrorCode) {
	376	int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
	377	destCPCount, firstSupplementaryIndex, cpLength;
	378	UChar b;
	379
	380	/* argument checking */
	381	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
	382	return 0;
	383	}
	384
	385	if(src==NULL \|\| srcLength<-1 \|\| (dest==NULL && destCapacity!=0)) {
	386	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
	387	return 0;
	388	}
	389
	390	if(srcLength==-1) {
	391	srcLength=u_strlen(src);
	392	}
	393
	394	/*
	395	* Handle the basic code points:
	396	* Let basicLength be the number of input code points
	397	* before the last delimiter, or 0 if there is none,
	398	* then copy the first basicLength code points to the output.
	399	*
	400	* The two following loops iterate backward.
	401	*/
	402	for(j=srcLength; j>0;) {
	403	if(src[--j]==DELIMITER) {
	404	break;
	405	}
	406	}
	407	destLength=basicLength=destCPCount=j;
	408
	409	while(j>0) {
	410	b=src[--j];
	411	if(!IS_BASIC(b)) {
	412	*pErrorCode=U_INVALID_CHAR_FOUND;
	413	return 0;
	414	}
	415
	416	if(j<destCapacity) {
	417	dest[j]=(UChar)b;
	418
	419	if(caseFlags!=NULL) {
	420	caseFlags[j]=IS_BASIC_UPPERCASE(b);
	421	}
422	}
423	}
424
425	/* Initialize the state: */
426	n=INITIAL_N;
427	i=0;
428	bias=INITIAL_BIAS;
429	firstSupplementaryIndex=1000000000;
430
431	/*
432	* Main decoding loop:
433	* Start just after the last delimiter if any
434	* basic code points were copied; start at the beginning otherwise.
435	*/
436	for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
437	/*
438	* in is the index of the next character to be consumed, and
439	* destCPCount is the number of code points in the output array.
440	*
441	* Decode a generalized variable-length integer into delta,
442	* which gets added to i. The overflow checking is easier
443	* if we increase i as we go, then subtract off its starting
444	* value at the end to obtain delta.
445	*/
446	for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
447	if(in>=srcLength) {
448	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
449	return 0;
450	}
451
452	digit=basicToDigit[(uint8_t)src[in++]];
453	if(digit<0) {
454	*pErrorCode=U_INVALID_CHAR_FOUND;
455	return 0;
456	}
457	if(digit>(0x7fffffff-i)/w) {
458	/* integer overflow */
459	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
460	return 0;
461	}
462
463	i+=digit*w;
464	/** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
465	t=k-bias;
466	if(t<TMIN) {
467	t=TMIN;
468	} else if(t>TMAX) {
469	t=TMAX;
470	}
471	*/
472	t=k-bias;
473	if(t<TMIN) {
474	t=TMIN;
475	} else if(k>=(bias+TMAX)) {
476	t=TMAX;
477	}
478	if(digit<t) {
479	break;
480	}
481
482	if(w>0x7fffffff/(BASE-t)) {
483	/* integer overflow */
484	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
485	return 0;
486	}
487	w*=BASE-t;
488	}
489
490	/*
491	* Modification from sample code:
492	* Increments destCPCount here,
493	* where needed instead of in for() loop tail.
494	*/
495	++destCPCount;
496	bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
497
498	/*
499	* i was supposed to wrap around from (incremented) destCPCount to 0,
500	* incrementing n each time, so we'll fix that now:
501	*/
502	if(i/destCPCount>(0x7fffffff-n)) {
503	/* integer overflow */
504	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
505	return 0;
506	}
507
508	n+=i/destCPCount;
509	i%=destCPCount;
510	/* not needed for Punycode: */
511	/* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
512
513	if(n>0x10ffff \|\| UTF_IS_SURROGATE(n)) {
514	/* Unicode code point overflow */
515	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
516	return 0;
517	}
518
519	/* Insert n at position i of the output: */
520	cpLength=UTF_CHAR_LENGTH(n);
729e4ab9	521	if((destLength+cpLength)<=destCapacity) {
b75a7d8f A	522	int32_t codeUnitIndex;
	523
	524	/*
	525	* Handle indexes when supplementary code points are present.
	526	*
	527	* In almost all cases, there will be only BMP code points before i
	528	* and even in the entire string.
	529	* This is handled with the same efficiency as with UTF-32.
	530	*
	531	* Only the rare cases with supplementary code points are handled
	532	* more slowly - but not too bad since this is an insertion anyway.
	533	*/
	534	if(i<=firstSupplementaryIndex) {
	535	codeUnitIndex=i;
	536	if(cpLength>1) {
	537	firstSupplementaryIndex=codeUnitIndex;
	538	} else {
	539	++firstSupplementaryIndex;
	540	}
	541	} else {
	542	codeUnitIndex=firstSupplementaryIndex;
	543	UTF_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
	544	}
	545
	546	/* use the UChar index codeUnitIndex instead of the code point index i */
	547	if(codeUnitIndex<destLength) {
	548	uprv_memmove(dest+codeUnitIndex+cpLength,
	549	dest+codeUnitIndex,
	550	(destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
	551	if(caseFlags!=NULL) {
	552	uprv_memmove(caseFlags+codeUnitIndex+cpLength,
	553	caseFlags+codeUnitIndex,
	554	destLength-codeUnitIndex);
	555	}
	556	}
	557	if(cpLength==1) {
	558	/* BMP, insert one code unit */
	559	dest[codeUnitIndex]=(UChar)n;
	560	} else {
	561	/* supplementary character, insert two code units */
	562	dest[codeUnitIndex]=UTF16_LEAD(n);
	563	dest[codeUnitIndex+1]=UTF16_TRAIL(n);
	564	}
	565	if(caseFlags!=NULL) {
	566	/* Case of last character determines uppercase flag: */
	567	caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
	568	if(cpLength==2) {
	569	caseFlags[codeUnitIndex+1]=FALSE;
	570	}
	571	}
	572	}
	573	destLength+=cpLength;
	574	++i;
	575	}
	576
	577	return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
	578	}
	579
	580	/* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
	581
	582	#endif /* #if !UCONFIG_NO_IDNA */