X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/374ca955a76ecab1204ca8bfa63ff9238d998416..refs/heads/master:/icuSources/common/uidna.cpp diff --git a/icuSources/common/uidna.cpp b/icuSources/common/uidna.cpp index 4fdb5521..ac2f9c3c 100644 --- a/icuSources/common/uidna.cpp +++ b/icuSources/common/uidna.cpp @@ -1,12 +1,14 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * - * Copyright (C) 2003-2004, International Business Machines + * Copyright (C) 2003-2014, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: uidna.cpp - * encoding: US-ASCII + * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * @@ -24,6 +26,7 @@ #include "punycode.h" #include "ustr_imp.h" #include "cmemory.h" +#include "uassert.h" #include "sprpimpl.h" /* it is official IDNA ACE Prefix is "xn--" */ @@ -31,16 +34,18 @@ static const UChar ACE_PREFIX[] ={ 0x0078,0x006E,0x002d,0x002d } ; #define ACE_PREFIX_LENGTH 4 #define MAX_LABEL_LENGTH 63 -#define HYPHEN 0x002D -/* The Max length of the labels should not be more than 64 */ -#define MAX_LABEL_BUFFER_SIZE 100 -#define MAX_IDN_BUFFER_SIZE 300 +/* The Max length of the labels should not be more than MAX_LABEL_LENGTH */ +#define MAX_LABEL_BUFFER_SIZE 100 + +#define MAX_DOMAIN_NAME_LENGTH 255 +/* The Max length of the domain names should not be more than MAX_DOMAIN_NAME_LENGTH */ +#define MAX_IDN_BUFFER_SIZE MAX_DOMAIN_NAME_LENGTH+1 -#define CAPITAL_A 0x0041 -#define CAPITAL_Z 0x005A #define LOWER_CASE_DELTA 0x0020 +#define HYPHEN 0x002D #define FULL_STOP 0x002E -#define DATA_FILE_NAME "uidna" +#define CAPITAL_A 0x0041 +#define CAPITAL_Z 0x005A inline static UChar toASCIILower(UChar ch){ @@ -52,26 +57,18 @@ toASCIILower(UChar ch){ inline static UBool startsWithPrefix(const UChar* src , int32_t srcLength){ - UBool startsWithPrefix = TRUE; - if(srcLength < ACE_PREFIX_LENGTH){ return FALSE; } for(int8_t i=0; i< ACE_PREFIX_LENGTH; i++){ if(toASCIILower(src[i]) != ACE_PREFIX[i]){ - startsWithPrefix = FALSE; + return FALSE; } } - return startsWithPrefix; + return TRUE; } -inline static void -toASCIILower(UChar* src, int32_t srcLen){ - for(int32_t i=0; i b1Capacity){ + b1 = (UChar*) uprv_malloc(srcLength * U_SIZEOF_UCHAR); if(b1==NULL){ *status = U_MEMORY_ALLOCATION_ERROR; goto CLEANUP; } + b1Capacity = srcLength; + } - *status = U_ZERO_ERROR; // reset error + // step 1 + for( j=0;j 0x7F){ + srcIsASCII = FALSE; + } + b1[b1Len++] = src[j]; + } + + // step 2 is performed only if the source contains non ASCII + if(srcIsASCII == FALSE){ - b1Len = usprep_prepare(nameprep, src, srcLength, b1, b1Len, namePrepOptions, parseError, status); + // step 2 + b1Len = usprep_prepare(nameprep, src, srcLength, b1, b1Capacity, namePrepOptions, parseError, status); + + if(*status == U_BUFFER_OVERFLOW_ERROR){ + // redo processing of string + // we do not have enough room so grow the buffer + if(b1 != b1Stack){ + uprv_free(b1); + } + b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR); + if(b1==NULL){ + *status = U_MEMORY_ALLOCATION_ERROR; + goto CLEANUP; + } + + *status = U_ZERO_ERROR; // reset error + + b1Len = usprep_prepare(nameprep, src, srcLength, b1, b1Len, namePrepOptions, parseError, status); + } } // error bail out if(U_FAILURE(*status)){ goto CLEANUP; } + if(b1Len == 0){ + *status = U_IDNA_ZERO_LENGTH_LABEL_ERROR; + goto CLEANUP; + } - // step 3 & 4 + // for step 3 & 4 + srcIsASCII = TRUE; for( j=0;j 0x7F){ srcIsASCII = FALSE; }else if(isLDHChar(b1[j])==FALSE){ // if the char is in ASCII range verify that it is an LDH character @@ -255,7 +287,6 @@ _internal_toASCII(const UChar* src, int32_t srcLength, failPos = j; } } - if(useSTD3ASCIIRules == TRUE){ // verify 3a and 3b // 3(a) Verify the absence of non-LDH ASCII code points; that is, the @@ -282,9 +313,10 @@ _internal_toASCII(const UChar* src, int32_t srcLength, goto CLEANUP; } } + // Step 4: if the source is ASCII then proceed to step 8 if(srcIsASCII){ if(b1Len <= destCapacity){ - uprv_memmove(dest, b1, b1Len * U_SIZEOF_UCHAR); + u_memmove(dest, b1, b1Len); reqLength = b1Len; }else{ reqLength = b1Len; @@ -330,9 +362,9 @@ _internal_toASCII(const UChar* src, int32_t srcLength, goto CLEANUP; } //Step 7: prepend the ACE prefix - uprv_memcpy(dest,ACE_PREFIX,ACE_PREFIX_LENGTH * U_SIZEOF_UCHAR); + u_memcpy(dest, ACE_PREFIX, ACE_PREFIX_LENGTH); //Step 6: copy the contents in b2 into dest - uprv_memcpy(dest+ACE_PREFIX_LENGTH, b2, b2Len * U_SIZEOF_UCHAR); + u_memcpy(dest+ACE_PREFIX_LENGTH, b2, b2Len); }else{ *status = U_IDNA_ACE_PREFIX_ERROR; @@ -341,7 +373,7 @@ _internal_toASCII(const UChar* src, int32_t srcLength, goto CLEANUP; } } - + // step 8: verify the length of label if(reqLength > MAX_LABEL_LENGTH){ *status = U_IDNA_LABEL_TOO_LONG_ERROR; } @@ -364,28 +396,29 @@ _internal_toUnicode(const UChar* src, int32_t srcLength, int32_t options, UStringPrepProfile* nameprep, UParseError* parseError, - UErrorCode* status){ + UErrorCode* status) +{ //get the options - UBool useSTD3ASCIIRules = (UBool)((options & UIDNA_USE_STD3_RULES) != 0); + //UBool useSTD3ASCIIRules = (UBool)((options & UIDNA_USE_STD3_RULES) != 0); int32_t namePrepOptions = ((options & UIDNA_ALLOW_UNASSIGNED) != 0) ? USPREP_ALLOW_UNASSIGNED: 0; - + + // TODO Revisit buffer handling. The label should not be over 63 ASCII characters. ICU4J may need to be updated too. UChar b1Stack[MAX_LABEL_BUFFER_SIZE], b2Stack[MAX_LABEL_BUFFER_SIZE], b3Stack[MAX_LABEL_BUFFER_SIZE]; //initialize pointers to stack buffers UChar *b1 = b1Stack, *b2 = b2Stack, *b1Prime=NULL, *b3=b3Stack; - int32_t b1Len, b2Len, b1PrimeLen, b3Len, + int32_t b1Len = 0, b2Len, b1PrimeLen, b3Len, b1Capacity = MAX_LABEL_BUFFER_SIZE, b2Capacity = MAX_LABEL_BUFFER_SIZE, b3Capacity = MAX_LABEL_BUFFER_SIZE, reqLength=0; - b1Len = 0; UBool* caseFlags = NULL; UBool srcIsASCII = TRUE; - UBool srcIsLDH = TRUE; - int32_t failPos =0; + /*UBool srcIsLDH = TRUE; + int32_t failPos =0;*/ // step 1: find out if all the codepoints in src are ASCII if(srcLength==-1){ @@ -393,31 +426,32 @@ _internal_toUnicode(const UChar* src, int32_t srcLength, for(;src[srcLength]!=0;){ if(src[srcLength]> 0x7f){ srcIsASCII = FALSE; - }else if(isLDHChar(src[srcLength])==FALSE){ + }/*else if(isLDHChar(src[srcLength])==FALSE){ // here we do not assemble surrogates // since we know that LDH code points // are in the ASCII range only srcIsLDH = FALSE; failPos = srcLength; - } + }*/ srcLength++; } }else if(srcLength > 0){ for(int32_t j=0; j 0x7f){ srcIsASCII = FALSE; - }else if(isLDHChar(src[j])==FALSE){ + break; + }/*else if(isLDHChar(src[j])==FALSE){ // here we do not assemble surrogates // since we know that LDH code points // are in the ASCII range only srcIsLDH = FALSE; failPos = j; - } + }*/ } }else{ return 0; } - + if(srcIsASCII == FALSE){ // step 2: process the string b1Len = usprep_prepare(nameprep, src, srcLength, b1, b1Capacity, namePrepOptions, parseError, status); @@ -445,8 +479,14 @@ _internal_toUnicode(const UChar* src, int32_t srcLength, b1Len = srcLength; } + // The RFC states that + // + // ToUnicode never fails. If any step fails, then the original input + // is returned immediately in that step. + // + //step 3: verify ACE Prefix - if(startsWithPrefix(src,srcLength)){ + if(startsWithPrefix(b1,b1Len)){ //step 4: Remove the ACE Prefix b1Prime = b1 + ACE_PREFIX_LENGTH; @@ -454,7 +494,7 @@ _internal_toUnicode(const UChar* src, int32_t srcLength, //step 5: Decode using punycode b2Len = u_strFromPunycode(b1Prime, b1PrimeLen, b2, b2Capacity, caseFlags,status); - + if(*status == U_BUFFER_OVERFLOW_ERROR){ // redo processing of string /* we do not have enough room so grow the buffer*/ @@ -465,14 +505,13 @@ _internal_toUnicode(const UChar* src, int32_t srcLength, } *status = U_ZERO_ERROR; // reset error - + b2Len = u_strFromPunycode(b1Prime, b1PrimeLen, b2, b2Len, caseFlags, status); - } - - + + //step 6:Apply toASCII - b3Len = uidna_toASCII(b2, b2Len, b3, b3Capacity,options,parseError, status); + b3Len = uidna_toASCII(b2, b2Len, b3, b3Capacity, options, parseError, status); if(*status == U_BUFFER_OVERFLOW_ERROR){ // redo processing of string @@ -484,9 +523,9 @@ _internal_toUnicode(const UChar* src, int32_t srcLength, } *status = U_ZERO_ERROR; // reset error - + b3Len = uidna_toASCII(b2,b2Len,b3,b3Len,options,parseError, status); - + } //bail out on error if(U_FAILURE(*status)){ @@ -495,23 +534,26 @@ _internal_toUnicode(const UChar* src, int32_t srcLength, //step 7: verify if(compareCaseInsensitiveASCII(b1, b1Len, b3, b3Len) !=0){ - *status = U_IDNA_VERIFICATION_ERROR; + // Cause the original to be returned. + *status = U_IDNA_VERIFICATION_ERROR; goto CLEANUP; } //step 8: return output of step 5 reqLength = b2Len; if(b2Len <= destCapacity) { - uprv_memmove(dest, b2, b2Len * U_SIZEOF_UCHAR); + u_memmove(dest, b2, b2Len); } - }else{ + } + else{ + // See the start of this if statement for why this is commented out. // verify that STD3 ASCII rules are satisfied - if(useSTD3ASCIIRules == TRUE){ - if( srcIsLDH == FALSE /* source contains some non-LDH characters */ + /*if(useSTD3ASCIIRules == TRUE){ + if( srcIsLDH == FALSE // source contains some non-LDH characters || src[0] == HYPHEN || src[srcLength-1] == HYPHEN){ *status = U_IDNA_STD3_ASCII_RULES_ERROR; - /* populate the parseError struct */ + // populate the parseError struct if(srcIsLDH==FALSE){ // failPos is always set the index of failure uprv_syntaxError(src,failPos, srcLength,parseError); @@ -525,14 +567,16 @@ _internal_toUnicode(const UChar* src, int32_t srcLength, goto CLEANUP; } - } + }*/ + // just return the source //copy the source to destination if(srcLength <= destCapacity){ - uprv_memmove(dest,src,srcLength * U_SIZEOF_UCHAR); + u_memmove(dest, src, srcLength); } reqLength = srcLength; } + CLEANUP: if(b1 != b1Stack && b1!=src){ @@ -542,8 +586,7 @@ CLEANUP: uprv_free(b2); } uprv_free(caseFlags); - - + // The RFC states that // // ToUnicode never fails. If any step fails, then the original input @@ -553,13 +596,12 @@ CLEANUP: if(U_FAILURE(*status)){ //copy the source to destination if(dest && srcLength <= destCapacity){ - if(srcLength == -1) { - uprv_memmove(dest,src,u_strlen(src)* U_SIZEOF_UCHAR); - } else { - uprv_memmove(dest,src,srcLength * U_SIZEOF_UCHAR); - } + // srcLength should have already been set earlier. + U_ASSERT(srcLength >= 0); + u_memmove(dest, src, srcLength); } reqLength = srcLength; + *status = U_ZERO_ERROR; } return u_terminateUChars(dest, destCapacity, reqLength, status); @@ -580,7 +622,7 @@ uidna_toASCII(const UChar* src, int32_t srcLength, return 0; } - UStringPrepProfile* nameprep = usprep_open(NULL,DATA_FILE_NAME, status); + UStringPrepProfile* nameprep = usprep_openByType(USPREP_RFC3491_NAMEPREP, status); if(U_FAILURE(*status)){ return -1; @@ -608,8 +650,8 @@ uidna_toUnicode(const UChar* src, int32_t srcLength, *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } - - UStringPrepProfile* nameprep = usprep_open(NULL, DATA_FILE_NAME, status); + + UStringPrepProfile* nameprep = usprep_openByType(USPREP_RFC3491_NAMEPREP, status); if(U_FAILURE(*status)){ return -1; @@ -640,7 +682,7 @@ uidna_IDNToASCII( const UChar *src, int32_t srcLength, int32_t reqLength = 0; - UStringPrepProfile* nameprep = usprep_open(NULL, DATA_FILE_NAME, status); + UStringPrepProfile* nameprep = usprep_openByType(USPREP_RFC3491_NAMEPREP, status); if(U_FAILURE(*status)){ return 0; @@ -659,16 +701,19 @@ uidna_IDNToASCII( const UChar *src, int32_t srcLength, for(;;){ labelLen = getNextSeparator(labelStart,remainingLen, &delimiter,&done); + labelReqLength = 0; + if(!(labelLen==0 && done)){// make sure this is not a root label separator. - labelReqLength = _internal_toASCII( labelStart, labelLen, - currentDest, remainingDestCapacity, - options, nameprep, - parseError, status); - - if(*status == U_BUFFER_OVERFLOW_ERROR){ - - *status = U_ZERO_ERROR; // reset error - remainingDestCapacity = 0; + labelReqLength = _internal_toASCII( labelStart, labelLen, + currentDest, remainingDestCapacity, + options, nameprep, + parseError, status); + + if(*status == U_BUFFER_OVERFLOW_ERROR){ + + *status = U_ZERO_ERROR; // reset error + remainingDestCapacity = 0; + } } @@ -685,6 +730,7 @@ uidna_IDNToASCII( const UChar *src, int32_t srcLength, // should never occur remainingDestCapacity = 0; } + if(done == TRUE){ break; } @@ -694,15 +740,19 @@ uidna_IDNToASCII( const UChar *src, int32_t srcLength, *currentDest++ = FULL_STOP; remainingDestCapacity--; } - reqLength++; + reqLength++; labelStart = delimiter; if(remainingLen >0 ){ - remainingLen = srcLength - (delimiter - src); + remainingLen = (int32_t)(srcLength - (delimiter - src)); } } - + + if(reqLength > MAX_DOMAIN_NAME_LENGTH){ + *status = U_IDNA_DOMAIN_NAME_TOO_LONG_ERROR; + } + usprep_close(nameprep); return u_terminateUChars(dest, destCapacity, reqLength, status); @@ -725,7 +775,7 @@ uidna_IDNToUnicode( const UChar* src, int32_t srcLength, int32_t reqLength = 0; - UStringPrepProfile* nameprep = usprep_open(NULL, DATA_FILE_NAME, status); + UStringPrepProfile* nameprep = usprep_openByType(USPREP_RFC3491_NAMEPREP, status); if(U_FAILURE(*status)){ return 0; @@ -740,23 +790,31 @@ uidna_IDNToUnicode( const UChar* src, int32_t srcLength, int32_t labelLen = 0, labelReqLength = 0; UBool done = FALSE; - for(;;){ labelLen = getNextSeparator(labelStart,remainingLen, &delimiter,&done); + // The RFC states that + // + // ToUnicode never fails. If any step fails, then the original input + // is returned immediately in that step. + // + // _internal_toUnicode will copy the label. + /*if(labelLen==0 && done==FALSE){ + *status = U_IDNA_ZERO_LENGTH_LABEL_ERROR; + break; + }*/ + labelReqLength = _internal_toUnicode(labelStart, labelLen, currentDest, remainingDestCapacity, options, nameprep, parseError, status); if(*status == U_BUFFER_OVERFLOW_ERROR){ - *status = U_ZERO_ERROR; // reset error remainingDestCapacity = 0; } - if(U_FAILURE(*status)){ break; } @@ -776,19 +834,24 @@ uidna_IDNToUnicode( const UChar* src, int32_t srcLength, } // add the label separator + // Unlike the ToASCII operation we don't normalize the label separators if(remainingDestCapacity > 0){ - *currentDest++ = FULL_STOP; + *currentDest++ = *(labelStart + labelLen); remainingDestCapacity--; } - reqLength++; + reqLength++; labelStart = delimiter; if(remainingLen >0 ){ - remainingLen = srcLength - (delimiter - src); + remainingLen = (int32_t)(srcLength - (delimiter - src)); } } - + + if(reqLength > MAX_DOMAIN_NAME_LENGTH){ + *status = U_IDNA_DOMAIN_NAME_TOO_LONG_ERROR; + } + usprep_close(nameprep); return u_terminateUChars(dest, destCapacity, reqLength, status);