icuSources/common/uts46.cpp

   1 /*
   2 *******************************************************************************
   3 *   Copyright (C) 2010-2011, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 *******************************************************************************
   6 *   file name:  uts46.cpp
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2010mar09
  12 *   created by: Markus W. Scherer
  13 */
  14
  15 #include "unicode/utypes.h"
  16
  17 #if !UCONFIG_NO_IDNA
  18
  19 #include "unicode/idna.h"
  20 #include "unicode/normalizer2.h"
  21 #include "unicode/uscript.h"
  22 #include "unicode/ustring.h"
  23 #include "unicode/utf16.h"
  24 #include "cmemory.h"
  25 #include "cstring.h"
  26 #include "punycode.h"
  27 #include "ubidi_props.h"
  28 #include "ustr_imp.h"
  29
     // Number of elements of a true array (not a pointer), as an int32_t.
     // processUTF8() uses it to pass the capacity of its stackArray[] to the ByteSink.
  30 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  31
  32 // Note about tests for UIDNA_ERROR_DOMAIN_NAME_TOO_LONG:
  33 //
  34 // The domain name length limit is 255 octets in an internal DNS representation
  35 // where the last ("root") label is the empty label
  36 // represented by length byte 0 alone.
  37 // In a conventional string, this translates to 253 characters, or 254
  38 // if there is a trailing dot for the root label.
  39
  40 U_NAMESPACE_BEGIN
  41
  42 // Severe errors which usually result in a U+FFFD replacement character in the result string.
  43 const uint32_t severeErrors=
  44     UIDNA_ERROR_LEADING_COMBINING_MARK|
  45     UIDNA_ERROR_DISALLOWED|
  46     UIDNA_ERROR_PUNYCODE|
  47     UIDNA_ERROR_LABEL_HAS_DOT|
  48     UIDNA_ERROR_INVALID_ACE_LABEL;
  49
  50 static inline UBool
  51 isASCIIString(const UnicodeString &dest) {
  52     const UChar *s=dest.getBuffer();
  53     const UChar *limit=s+dest.length();
  54     while(s<limit) {
  55         if(*s++>0x7f) {
  56             return FALSE;
  57         }
  58     }
  59     return TRUE;
  60 }
  61
     // Forward declarations; the definitions are not part of this section of the file.
     //
     // UTF-16 overload: process() calls it on the first labelStart UChars of dest,
     // that is, on the labels that were handled completely by its ASCII fastpath.
  62 static UBool
  63 isASCIIOkBiDi(const UChar *s, int32_t length);
  64
     // char overload: processUTF8() calls it on the first labelStart bytes of the
     // UTF-8 source string.
  65 static UBool
  66 isASCIIOkBiDi(const char *s, int32_t length);
  67
  68 // IDNA class default implementations -------------------------------------- ***
  69
     // Empty out-of-line destructor of the IDNA base class (UTS46 below derives from it).
  70 IDNA::~IDNA() {}
  71
  72 void
  73 IDNA::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest,
  74                         IDNAInfo &info, UErrorCode &errorCode) const {
  75     if(U_SUCCESS(errorCode)) {
  76         UnicodeString destString;
  77         labelToASCII(UnicodeString::fromUTF8(label), destString,
  78                      info, errorCode).toUTF8(dest);
  79     }
  80 }
  81
  82 void
  83 IDNA::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest,
  84                          IDNAInfo &info, UErrorCode &errorCode) const {
  85     if(U_SUCCESS(errorCode)) {
  86         UnicodeString destString;
  87         labelToUnicode(UnicodeString::fromUTF8(label), destString,
  88                        info, errorCode).toUTF8(dest);
  89     }
  90 }
  91
  92 void
  93 IDNA::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest,
  94                        IDNAInfo &info, UErrorCode &errorCode) const {
  95     if(U_SUCCESS(errorCode)) {
  96         UnicodeString destString;
  97         nameToASCII(UnicodeString::fromUTF8(name), destString,
  98                     info, errorCode).toUTF8(dest);
  99     }
 100 }
 101
 102 void
 103 IDNA::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest,
 104                         IDNAInfo &info, UErrorCode &errorCode) const {
 105     if(U_SUCCESS(errorCode)) {
 106         UnicodeString destString;
 107         nameToUnicode(UnicodeString::fromUTF8(name), destString,
 108                       info, errorCode).toUTF8(dest);
 109     }
 110 }
 111
 112 UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(IDNA)
 113
 114 // UTS46 class declaration ------------------------------------------------- ***
 115
     // IDNA implementation that IDNA::createUTS46Instance() instantiates.
     // All eight public conversion functions funnel into process() (UTF-16)
     // or processUTF8() (UTF-8); the isLabel/toASCII flags select the operation.
 116 class UTS46 : public IDNA {
 117 public:
     // options: UIDNA_* option bits (this file tests UIDNA_USE_STD3_RULES,
     // UIDNA_NONTRANSITIONAL_TO_ASCII and UIDNA_NONTRANSITIONAL_TO_UNICODE).
     // errorCode: receives the result of loading the "uts46" normalizer.
 118     UTS46(uint32_t options, UErrorCode &errorCode);
 119     virtual ~UTS46();
 120
     // UTF-16 API. Each function writes the result to dest and returns dest.
 121     virtual UnicodeString &
 122     labelToASCII(const UnicodeString &label, UnicodeString &dest,
 123                  IDNAInfo &info, UErrorCode &errorCode) const;
 124
 125     virtual UnicodeString &
 126     labelToUnicode(const UnicodeString &label, UnicodeString &dest,
 127                    IDNAInfo &info, UErrorCode &errorCode) const;
 128
 129     virtual UnicodeString &
 130     nameToASCII(const UnicodeString &name, UnicodeString &dest,
 131                 IDNAInfo &info, UErrorCode &errorCode) const;
 132
 133     virtual UnicodeString &
 134     nameToUnicode(const UnicodeString &name, UnicodeString &dest,
 135                   IDNAInfo &info, UErrorCode &errorCode) const;
 136
     // UTF-8 API. These replace the IDNA default implementations (which convert
     // to UTF-16 and back) with calls to processUTF8().
 137     virtual void
 138     labelToASCII_UTF8(const StringPiece &label, ByteSink &dest,
 139                       IDNAInfo &info, UErrorCode &errorCode) const;
 140
 141     virtual void
 142     labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest,
 143                        IDNAInfo &info, UErrorCode &errorCode) const;
 144
 145     virtual void
 146     nameToASCII_UTF8(const StringPiece &name, ByteSink &dest,
 147                      IDNAInfo &info, UErrorCode &errorCode) const;
 148
 149     virtual void
 150     nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest,
 151                       IDNAInfo &info, UErrorCode &errorCode) const;
 152
 153 private:
     // Common UTF-16 entry point: ASCII fastpath, then processUnicode() for the rest.
     // isLabel: src is a single label rather than a whole domain name.
     // toASCII: perform the ToASCII rather than the ToUnicode operation.
 154     UnicodeString &
 155     process(const UnicodeString &src,
 156             UBool isLabel, UBool toASCII,
 157             UnicodeString &dest,
 158             IDNAInfo &info, UErrorCode &errorCode) const;
 159
     // Common UTF-8 entry point; same flags as process(), output goes to a ByteSink.
 160     void
 161     processUTF8(const StringPiece &src,
 162                 UBool isLabel, UBool toASCII,
 163                 ByteSink &dest,
 164                 IDNAInfo &info, UErrorCode &errorCode) const;
 165
     // Normalizes src starting at index mappingStart into dest, then processes
     // the labels in dest starting at index labelStart. Returns dest.
 166     UnicodeString &
 167     processUnicode(const UnicodeString &src,
 168                    int32_t labelStart, int32_t mappingStart,
 169                    UBool isLabel, UBool toASCII,
 170                    UnicodeString &dest,
 171                    IDNAInfo &info, UErrorCode &errorCode) const;
 172
     // Maps the deviation characters in dest from index mappingStart on
     // (sharp s, final sigma, ZWNJ, ZWJ) and re-normalizes from labelStart.
 173     // returns the new dest.length()
 174     int32_t
 175     mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
 176                 UErrorCode &errorCode) const;
 177
     // Validates (and for "xn--" labels decodes) the label that occupies
     // labelLength UChars of dest starting at labelStart; sets info.labelErrors.
 178     // returns the new label length
 179     int32_t
 180     processLabel(UnicodeString &dest,
 181                  int32_t labelStart, int32_t labelLength,
 182                  UBool toASCII,
 183                  IDNAInfo &info, UErrorCode &errorCode) const;
     // Called by processLabel() when an "xn--" label fails Punycode decoding or
     // validation; processLabel() returns its result as the new label length.
     // Defined outside the section of the file shown here.
 184     int32_t
 185     markBadACELabel(UnicodeString &dest,
 186                     int32_t labelStart, int32_t labelLength,
 187                     UBool toASCII, IDNAInfo &info) const;
 188
     // The following three per-label checks are defined outside the section of
     // the file shown here.
 189     void
 190     checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const;
 191
 192     UBool
 193     isLabelOkContextJ(const UChar *label, int32_t labelLength) const;
 194
 195     void
 196     checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const;
 197
     // Declared (and therefore initialized) before options; the constructor's
     // initializer list relies on this order.
 198     const Normalizer2 &uts46Norm2;  // uts46.nrm
     // UIDNA_* option bits passed to the constructor.
 199     uint32_t options;
 200 };
 201
 202 IDNA *
 203 IDNA::createUTS46Instance(uint32_t options, UErrorCode &errorCode) {
 204     if(U_SUCCESS(errorCode)) {
 205         IDNA *idna=new UTS46(options, errorCode);
 206         if(idna==NULL) {
 207             errorCode=U_MEMORY_ALLOCATION_ERROR;
 208         } else if(U_FAILURE(errorCode)) {
 209             delete idna;
 210             idna=NULL;
 211         }
 212         return idna;
 213     } else {
 214         return NULL;
 215     }
 216 }
 217
 218 // UTS46 implementation ---------------------------------------------------- ***
 219
     // Binds uts46Norm2 to the "uts46" UNORM2_COMPOSE normalizer instance and stores opt.
     // Failure to get the normalizer is reported through errorCode;
     // IDNA::createUTS46Instance() deletes the object in that case.
     // NOTE(review): the pointer returned by Normalizer2::getInstance() is
     // dereferenced without a NULL check -- presumably it is NULL on failure,
     // which would bind the reference to a null pointer; confirm.
 220 UTS46::UTS46(uint32_t opt, UErrorCode &errorCode)
 221         : uts46Norm2(*Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, errorCode)),
 222           options(opt) {}
 223
     // Empty destructor: uts46Norm2 is a reference and options is a plain integer.
 224 UTS46::~UTS46() {}
 225
     // ToASCII for a single label: process() with isLabel=TRUE, toASCII=TRUE.
     // Returns dest.
 226 UnicodeString &
 227 UTS46::labelToASCII(const UnicodeString &label, UnicodeString &dest,
 228                     IDNAInfo &info, UErrorCode &errorCode) const {
 229     return process(label, TRUE, TRUE, dest, info, errorCode);
 230 }
 231
     // ToUnicode for a single label: process() with isLabel=TRUE, toASCII=FALSE.
     // Returns dest.
 232 UnicodeString &
 233 UTS46::labelToUnicode(const UnicodeString &label, UnicodeString &dest,
 234                       IDNAInfo &info, UErrorCode &errorCode) const {
 235     return process(label, TRUE, FALSE, dest, info, errorCode);
 236 }
 237
 238 UnicodeString &
 239 UTS46::nameToASCII(const UnicodeString &name, UnicodeString &dest,
 240                    IDNAInfo &info, UErrorCode &errorCode) const {
 241     process(name, FALSE, TRUE, dest, info, errorCode);
 242     if( dest.length()>=254 && (info.errors&UIDNA_ERROR_DOMAIN_NAME_TOO_LONG)==0 &&
 243         isASCIIString(dest) &&
 244         (dest.length()>254 || dest[253]!=0x2e)
 245     ) {
 246         info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
 247     }
 248     return dest;
 249 }
 250
     // ToUnicode for a whole domain name: process() with isLabel=FALSE, toASCII=FALSE.
     // Returns dest.
 251 UnicodeString &
 252 UTS46::nameToUnicode(const UnicodeString &name, UnicodeString &dest,
 253                      IDNAInfo &info, UErrorCode &errorCode) const {
 254     return process(name, FALSE, FALSE, dest, info, errorCode);
 255 }
 256
     // UTF-8 ToASCII for a single label: processUTF8() with isLabel=TRUE, toASCII=TRUE.
 257 void
 258 UTS46::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest,
 259                          IDNAInfo &info, UErrorCode &errorCode) const {
 260     processUTF8(label, TRUE, TRUE, dest, info, errorCode);
 261 }
 262
     // UTF-8 ToUnicode for a single label: processUTF8() with isLabel=TRUE, toASCII=FALSE.
 263 void
 264 UTS46::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest,
 265                           IDNAInfo &info, UErrorCode &errorCode) const {
 266     processUTF8(label, TRUE, FALSE, dest, info, errorCode);
 267 }
 268
     // UTF-8 ToASCII for a whole domain name: processUTF8() with isLabel=FALSE, toASCII=TRUE.
     // Unlike the UTF-16 nameToASCII(), there is no extra length check here;
     // processUTF8() performs it itself when toASCII && !isLabel.
 269 void
 270 UTS46::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest,
 271                         IDNAInfo &info, UErrorCode &errorCode) const {
 272     processUTF8(name, FALSE, TRUE, dest, info, errorCode);
 273 }
 274
     // UTF-8 ToUnicode for a whole domain name: processUTF8() with isLabel=FALSE, toASCII=FALSE.
 275 void
 276 UTS46::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest,
 277                          IDNAInfo &info, UErrorCode &errorCode) const {
 278     processUTF8(name, FALSE, FALSE, dest, info, errorCode);
 279 }
 280
 281 // UTS #46 data for ASCII characters.
 282 // The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase
 283 // and passes through all other ASCII characters.
 284 // If UIDNA_USE_STD3_RULES is set, then non-LDH characters are disallowed
 285 // using this data.
 286 // The ASCII fastpath also uses this data.
 287 // Values: -1==disallowed  0==valid  1==mapped (lowercase)
     // Indexed by an ASCII code point 0..7F, 16 entries per row.
     // Read by the ASCII fastpaths in process() and processUTF8() and by processLabel().
 288 static const int8_t asciiData[128]={
     // 0000..001F: all -1
 289     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 290     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 291     // 002D..002E; valid  #  HYPHEN-MINUS..FULL STOP
     // (0020..002C and 002F are -1)
 292     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1,
 293     // 0030..0039; valid  #  DIGIT ZERO..DIGIT NINE
     // (003A..003F are -1)
 294      0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
 295     // 0041..005A; mapped  #  LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
     // (0040 and 005B..005F are -1)
 296     -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
 297      1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1,
 298     // 0061..007A; valid  #  LATIN SMALL LETTER A..LATIN SMALL LETTER Z
     // (0060 and 007B..007F are -1)
 299     -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 300      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1
 301 };
 302
     // Shared UTF-16 implementation of labelToASCII(), labelToUnicode(),
     // nameToASCII() and nameToUnicode().
     //
     // src        input label or domain name; must not be the same object as dest
     // isLabel    TRUE if src is a single label, FALSE if it is a whole domain name
     // toASCII    TRUE for ToASCII (enables the empty-label and length checks)
     // dest       receives the result; set to bogus if errorCode is a failure on
     //            entry or if the arguments are illegal
     // info       reset, then filled with the accumulated error bits
     // errorCode  ICU error code, in/out
     // Returns dest.
     //
     // Strategy: copy src to dest (lowercasing A-Z) for as long as it is simple
     // ASCII, keeping track of label boundaries and the errors that can be
     // detected on the fly. At the first character that needs more work,
     // hand the remainder over to processUnicode().
 303 UnicodeString &
 304 UTS46::process(const UnicodeString &src,
 305                UBool isLabel, UBool toASCII,
 306                UnicodeString &dest,
 307                IDNAInfo &info, UErrorCode &errorCode) const {
 308     // uts46Norm2.normalize() would do all of this error checking and setup,
 309     // but with the ASCII fastpath we do not always call it, and do not
 310     // call it first.
 311     if(U_FAILURE(errorCode)) {
 312         dest.setToBogus();
 313         return dest;
 314     }
 315     const UChar *srcArray=src.getBuffer();
     // Reject in-place operation and a src without a readable buffer.
 316     if(&dest==&src || srcArray==NULL) {
 317         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 318         dest.setToBogus();
 319         return dest;
 320     }
 321     // Arguments are fine, reset output values.
 322     dest.remove();
 323     info.reset();
 324     int32_t srcLength=src.length();
 325     if(srcLength==0) {
 326         if(toASCII) {
 327             info.errors|=UIDNA_ERROR_EMPTY_LABEL;
 328         }
 329         return dest;
 330     }
     // The fastpath writes exactly one UChar per input UChar,
     // so a capacity of srcLength is sufficient for it.
 331     UChar *destArray=dest.getBuffer(srcLength);
 332     if(destArray==NULL) {
 333         errorCode=U_MEMORY_ALLOCATION_ERROR;
 334         return dest;
 335     }
 336     // ASCII fastpath
 337     UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
     // labelStart: index of the first character of the current label.
     // i: index into both srcArray and destArray.
 338     int32_t labelStart=0;
 339     int32_t i;
 340     for(i=0;; ++i) {
 341         if(i==srcLength) {
             // The whole input was handled by the fastpath:
             // finish the last label and return without calling processUnicode().
 342             if(toASCII) {
 343                 if((i-labelStart)>63) {
 344                     info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
 345                 }
 346                 // There is a trailing dot if labelStart==i.
                 // Too long: more than 254 UChars, or exactly 254 without a trailing dot.
 347                 if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
 348                     info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
 349                 }
 350             }
 351             info.errors|=info.labelErrors;
 352             dest.releaseBuffer(i);
 353             return dest;
 354         }
 355         UChar c=srcArray[i];
 356         if(c>0x7f) {
             // Non-ASCII: leave the fastpath; c has not been copied to dest.
 357             break;
 358         }
 359         int cData=asciiData[c];
 360         if(cData>0) {
 361             destArray[i]=c+0x20;  // Lowercase an uppercase ASCII letter.
 362         } else if(cData<0 && disallowNonLDHDot) {
 363             break;  // Replacing with U+FFFD can be complicated for toASCII.
 364         } else {
 365             destArray[i]=c;
 366             if(c==0x2d) {  // hyphen
                 // i==labelStart+3 guarantees that srcArray[i-1] is inside the label.
 367                 if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
 368                     // "??--..." is Punycode or forbidden.
 369                     ++i;  // '-' was copied to dest already
 370                     break;
 371                 }
 372                 if(i==labelStart) {
 373                     // label starts with "-"
 374                     info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
 375                 }
                 // The first test keeps srcArray[i+1] from reading past the end.
 376                 if((i+1)==srcLength || srcArray[i+1]==0x2e) {
 377                     // label ends with "-"
 378                     info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
 379                 }
 380             } else if(c==0x2e) {  // dot
 381                 if(isLabel) {
 382                     // Replacing with U+FFFD can be complicated for toASCII.
 383                     ++i;  // '.' was copied to dest already
 384                     break;
 385                 }
 386                 if(toASCII) {
 387                     // Permit an empty label at the end but not elsewhere.
 388                     if(i==labelStart && i<(srcLength-1)) {
 389                         info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
 390                     } else if((i-labelStart)>63) {
 391                         info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
 392                     }
 393                 }
                 // The label is complete: fold its errors into the overall result
                 // and start the next label after the dot.
 394                 info.errors|=info.labelErrors;
 395                 info.labelErrors=0;
 396                 labelStart=i+1;
 397             }
 398         }
 399     }
     // Slow path: dest holds the first i UChars from the fastpath.
     // processUnicode() maps and normalizes src from index i on
     // and processes the labels in dest starting at labelStart.
 400     info.errors|=info.labelErrors;
 401     dest.releaseBuffer(i);
 402     processUnicode(src, labelStart, i, isLabel, toASCII, dest, info, errorCode);
     // Final BiDi verdict, only if a BiDi label was seen and there is no severe error:
     // either a label failed its check (!isOkBiDi), or the labels before labelStart,
     // which were only seen by the fastpath, fail isASCIIOkBiDi().
     // info.isBiDi/isOkBiDi are not set in this function
     // (presumably by checkLabelBiDi() -- not visible here).
 403     if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
 404         (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(dest.getBuffer(), labelStart)))
 405     ) {
 406         info.errors|=UIDNA_ERROR_BIDI;
 407     }
 408     return dest;
 409 }
 410
     // Shared UTF-8 implementation of labelToASCII_UTF8(), labelToUnicodeUTF8(),
     // nameToASCII_UTF8() and nameToUnicodeUTF8().
     //
     // src        UTF-8 input; NULL data is only accepted together with length 0
     // isLabel    TRUE if src is a single label, FALSE if it is a whole domain name
     // toASCII    TRUE for ToASCII (enables the empty-label and length checks)
     // dest       sink that receives the UTF-8 result
     // info       reset, then filled with the accumulated error bits
     // errorCode  ICU error code, in/out; nothing is done if it is a failure on entry
     //
     // Inputs of up to 256 bytes go through an ASCII fastpath that mirrors the one
     // in process(); whatever it cannot handle is converted to UTF-16 and
     // passed to processUnicode().
 411 void
 412 UTS46::processUTF8(const StringPiece &src,
 413                    UBool isLabel, UBool toASCII,
 414                    ByteSink &dest,
 415                    IDNAInfo &info, UErrorCode &errorCode) const {
 416     if(U_FAILURE(errorCode)) {
 417         return;
 418     }
 419     const char *srcArray=src.data();
 420     int32_t srcLength=src.length();
 421     if(srcArray==NULL && srcLength!=0) {
 422         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 423         return;
 424     }
 425     // Arguments are fine, reset output values.
 426     info.reset();
 427     if(srcLength==0) {
 428         if(toASCII) {
 429             info.errors|=UIDNA_ERROR_EMPTY_LABEL;
 430         }
 431         dest.Flush();
 432         return;
 433     }
     // destString collects the UTF-16 result of the slow path.
     // labelStart counts the bytes of complete labels that the fastpath has
     // handled; it stays 0 if the fastpath is not used.
 434     UnicodeString destString;
 435     int32_t labelStart=0;
 436     if(srcLength<=256) {  // length of stackArray[]
 437         // ASCII fastpath
 438         char stackArray[256];
 439         int32_t destCapacity;
         // NOTE(review): presumably GetAppendBuffer() returns either the sink's own
         // buffer or stackArray, with room for at least srcLength bytes -- see the
         // ByteSink documentation. destCapacity is not used afterwards.
 440         char *destArray=dest.GetAppendBuffer(srcLength, srcLength+20,
 441                                              stackArray, LENGTHOF(stackArray), &destCapacity);
 442         UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
 443         int32_t i;
 444         for(i=0;; ++i) {
 445             if(i==srcLength) {
                 // The whole input was handled by the fastpath:
                 // finish the last label, output everything and return.
 446                 if(toASCII) {
 447                     if((i-labelStart)>63) {
 448                         info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
 449                     }
 450                     // There is a trailing dot if labelStart==i.
 451                     if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
 452                         info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
 453                     }
 454                 }
 455                 info.errors|=info.labelErrors;
 456                 dest.Append(destArray, i);
 457                 dest.Flush();
 458                 return;
 459             }
 460             char c=srcArray[i];
 461             if((int8_t)c<0) {  // (uint8_t)c>0x7f
 462                 break;
 463             }
 464             int cData=asciiData[(int)c];  // Cast: gcc warns about indexing with a char.
 465             if(cData>0) {
 466                 destArray[i]=c+0x20;  // Lowercase an uppercase ASCII letter.
 467             } else if(cData<0 && disallowNonLDHDot) {
 468                 break;  // Replacing with U+FFFD can be complicated for toASCII.
 469             } else {
 470                 destArray[i]=c;
 471                 if(c==0x2d) {  // hyphen
 472                     if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
 473                         // "??--..." is Punycode or forbidden.
                         // Unlike in process(), i is not incremented before the break:
                         // only destArray[labelStart..i) is handed over below,
                         // and processUnicode() reads this character again from src.
 474                         break;
 475                     }
 476                     if(i==labelStart) {
 477                         // label starts with "-"
 478                         info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
 479                     }
                     // The first test keeps srcArray[i+1] from reading past the end.
 480                     if((i+1)==srcLength || srcArray[i+1]==0x2e) {
 481                         // label ends with "-"
 482                         info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
 483                     }
 484                 } else if(c==0x2e) {  // dot
 485                     if(isLabel) {
 486                         break;  // Replacing with U+FFFD can be complicated for toASCII.
 487                     }
 488                     if(toASCII) {
 489                         // Permit an empty label at the end but not elsewhere.
 490                         if(i==labelStart && i<(srcLength-1)) {
 491                             info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
 492                         } else if((i-labelStart)>63) {
 493                             info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
 494                         }
 495                     }
                     // The label is complete: fold its errors into the overall result
                     // and start the next label after the dot.
 496                     info.errors|=info.labelErrors;
 497                     info.labelErrors=0;
 498                     labelStart=i+1;
 499                 }
 500             }
 501         }
 502         info.errors|=info.labelErrors;
 503         // Convert the processed ASCII prefix of the current label to UTF-16.
         // The prefix is all ASCII, so its byte count (mappingStart) is also the
         // number of UChars and thus the index where mapping continues.
 504         int32_t mappingStart=i-labelStart;
 505         destString=UnicodeString::fromUTF8(StringPiece(destArray+labelStart, mappingStart));
 506         // Output the previous ASCII labels and process the rest of src in UTF-16.
 507         dest.Append(destArray, labelStart);
         // NOTE(review): StringPiece(src, labelStart) is presumably the tail of src
         // starting at byte offset labelStart -- confirm against the StringPiece API.
 508         processUnicode(UnicodeString::fromUTF8(StringPiece(src, labelStart)), 0, mappingStart,
 509                        isLabel, toASCII,
 510                        destString, info, errorCode);
 511     } else {
 512         // src is too long for the ASCII fastpath implementation.
 513         processUnicode(UnicodeString::fromUTF8(src), 0, 0,
 514                        isLabel, toASCII,
 515                        destString, info, errorCode);
 516     }
 517     destString.toUTF8(dest);  // calls dest.Flush()
 518     if(toASCII && !isLabel) {
 519         // length==labelStart==254 means that there is a trailing dot (ok) and
 520         // destString is empty (do not index at 253-labelStart).
         // Total length = bytes already appended by the fastpath + the rest in destString.
 521         int32_t length=labelStart+destString.length();
 522         if( length>=254 && isASCIIString(destString) &&
 523             (length>254 ||
 524              (labelStart<254 && destString[253-labelStart]!=0x2e))
 525         ) {
 526             info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
 527         }
 528     }
     // Final BiDi verdict as in process(), except that the labels handled only by
     // the fastpath are checked in the UTF-8 source (first labelStart bytes of
     // srcArray) with the char overload of isASCIIOkBiDi().
 529     if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
 530         (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(srcArray, labelStart)))
 531     ) {
 532         info.errors|=UIDNA_ERROR_BIDI;
 533     }
 534 }
 535
     // Slow path behind process() and processUTF8(): maps and normalizes src with
     // the UTS #46 normalizer and then processes the result label by label.
     //
     // src           the input, in UTF-16
     // labelStart    index in dest of the first label that still needs processing
     // mappingStart  index in src where mapping continues; if it is not 0 then
     //               dest already holds the fastpath output and the mapped rest is appended
     // isLabel       TRUE: treat dest as one label and do not split at dots
     // toASCII       selects which UIDNA_NONTRANSITIONAL_* option applies and is
     //               passed on to processLabel()
     // dest          in/out result string
     // info          label errors are folded into info.errors; info.isTransDiff is
     //               set if a deviation character is found
     // errorCode     ICU error code, in/out
     // Returns dest.
 536 UnicodeString &
 537 UTS46::processUnicode(const UnicodeString &src,
 538                       int32_t labelStart, int32_t mappingStart,
 539                       UBool isLabel, UBool toASCII,
 540                       UnicodeString &dest,
 541                       IDNAInfo &info, UErrorCode &errorCode) const {
 542     if(mappingStart==0) {
         // Nothing was handled by a fastpath: normalize all of src into dest.
 543         uts46Norm2.normalize(src, dest, errorCode);
 544     } else {
         // Keep the fastpath output in dest and append the normalized rest of src.
 545         uts46Norm2.normalizeSecondAndAppend(dest, src.tempSubString(mappingStart), errorCode);
 546     }
 547     if(U_FAILURE(errorCode)) {
 548         return dest;
 549     }
     // Deviation characters are mapped unless the nontransitional option for
     // the current direction is set.
 550     UBool doMapDevChars=
 551         toASCII ? (options&UIDNA_NONTRANSITIONAL_TO_ASCII)==0 :
 552                   (options&UIDNA_NONTRANSITIONAL_TO_UNICODE)==0;
     // destArray and destLength are local copies; they are refreshed below
     // whenever processLabel() or mapDevChars() may have modified dest.
 553     const UChar *destArray=dest.getBuffer();
 554     int32_t destLength=dest.length();
     // [labelStart, labelLimit) is the part of the current label scanned so far.
 555     int32_t labelLimit=labelStart;
 556     while(labelLimit<destLength) {
 557         UChar c=destArray[labelLimit];
 558         if(c==0x2e && !isLabel) {
             // A dot ends the current label: process it, then continue after the dot.
 559             int32_t labelLength=labelLimit-labelStart;
 560             int32_t newLength=processLabel(dest, labelStart, labelLength,
 561                                             toASCII, info, errorCode);
 562             info.errors|=info.labelErrors;
 563             info.labelErrors=0;
 564             if(U_FAILURE(errorCode)) {
 565                 return dest;
 566             }
 567             destArray=dest.getBuffer();
             // processLabel() may have changed the length of the label.
 568             destLength+=newLength-labelLength;
             // Skip the (possibly resized) label and the dot.
 569             labelLimit=labelStart+=newLength+1;
         // The next condition is true exactly for the deviation characters
         // U+00DF, U+03C2, U+200C and U+200D (see mapDevChars()).
 570         } else if(0xdf<=c && c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) {
 571             info.isTransDiff=TRUE;
 572             if(doMapDevChars) {
 573                 destLength=mapDevChars(dest, labelStart, labelLimit, errorCode);
 574                 if(U_FAILURE(errorCode)) {
 575                     return dest;
 576                 }
 577                 destArray=dest.getBuffer();
 578                 // Do not increment labelLimit in case c was removed.
 579                 // All deviation characters have been mapped, no need to check for them again.
 580                 doMapDevChars=FALSE;
 581             } else {
 582                 ++labelLimit;
 583             }
 584         } else {
 585             ++labelLimit;
 586         }
 587     }
 588     // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)
 589     // but not an empty label elsewhere nor a completely empty domain name.
 590     // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0.
 591     if(0==labelStart || labelStart<labelLimit) {
 592         processLabel(dest, labelStart, labelLimit-labelStart,
 593                       toASCII, info, errorCode);
 594         info.errors|=info.labelErrors;
 595     }
 596     return dest;
 597 }
 598
     // Maps the deviation characters in dest, in place, from index mappingStart
     // to the end of the string:
     //   U+00DF sharp s      -> "ss"
     //   U+03C2 final sigma  -> U+03C3
     //   U+200C ZWNJ, U+200D ZWJ -> removed
     // If anything was mapped, the text from labelStart to the end is normalized again.
     //
     // dest          string to modify
     // labelStart    start index of the label that contains mappingStart
     // mappingStart  index of the first deviation character; processUnicode()
     //               passes the index of the character that it just found
     // errorCode     set to U_MEMORY_ALLOCATION_ERROR if a buffer cannot be obtained,
     //               or to whatever the normalizer reports
     // Returns the new dest.length().
 599 int32_t
 600 UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
 601                    UErrorCode &errorCode) const {
 602     int32_t length=dest.length();
     // A sharp s at mappingStart expands to two characters:
     // ask for one more UChar right away.
 603     UChar *s=dest.getBuffer(dest[mappingStart]==0xdf ? length+1 : length);
 604     if(s==NULL) {
 605         errorCode=U_MEMORY_ALLOCATION_ERROR;
 606         return length;
 607     }
 608     int32_t capacity=dest.getCapacity();
 609     UBool didMapDevChars=FALSE;
     // readIndex runs ahead of writeIndex by the net number of removed
     // characters; length is kept equal to the final length of the output.
 610     int32_t readIndex=mappingStart, writeIndex=mappingStart;
 611     do {
 612         UChar c=s[readIndex++];
 613         switch(c) {
 614         case 0xdf:
 615             // Map sharp s to ss.
 616             didMapDevChars=TRUE;
 617             s[writeIndex++]=0x73;  // Replace sharp s with first s.
 618             // Insert second s and account for possible buffer reallocation.
             // If writeIndex has caught up with readIndex then there is no gap left
             // by a removed character: make room by moving the unread rest up by one.
             // Otherwise the second s goes into the existing gap.
 619             if(writeIndex==readIndex) {
 620                 if(length==capacity) {
 621                     dest.releaseBuffer(length);
 622                     s=dest.getBuffer(length+1);
 623                     if(s==NULL) {
 624                         errorCode=U_MEMORY_ALLOCATION_ERROR;
 625                         return length;
 626                     }
 627                     capacity=dest.getCapacity();
 628                 }
 629                 u_memmove(s+writeIndex+1, s+writeIndex, length-writeIndex);
                 // The unread text starts one position later now.
 630                 ++readIndex;
 631             }
 632             s[writeIndex++]=0x73;
 633             ++length;
 634             break;
 635         case 0x3c2:  // Map final sigma to nonfinal sigma.
 636             didMapDevChars=TRUE;
 637             s[writeIndex++]=0x3c3;
 638             break;
 639         case 0x200c:  // Ignore/remove ZWNJ.
 640         case 0x200d:  // Ignore/remove ZWJ.
 641             didMapDevChars=TRUE;
             // Nothing is written: the output becomes one character shorter.
 642             --length;
 643             break;
 644         default:
 645             // Only really necessary if writeIndex was different from readIndex.
 646             s[writeIndex++]=c;
 647             break;
 648         }
     // length is the final output length, so this stops when all output is written.
 649     } while(writeIndex<length);
 650     dest.releaseBuffer(length);
 651     if(didMapDevChars) {
 652         // Mapping deviation characters might have resulted in an un-NFC string.
 653         // We could use either the NFC or the UTS #46 normalizer.
 654         // By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file.
 655         UnicodeString normalized;
 656         uts46Norm2.normalize(dest.tempSubString(labelStart), normalized, errorCode);
 657         if(U_SUCCESS(errorCode)) {
             // Replace everything from labelStart to the end with the normalized text.
 658             dest.replace(labelStart, 0x7fffffff, normalized);
 659             return dest.length();
 660         }
 661     }
 662     return length;
 663 }
 664
 665 // Some non-ASCII characters are equivalent to sequences with
 666 // non-LDH ASCII characters. To find them:
 667 // grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt)
 668 static inline UBool
 669 isNonASCIIDisallowedSTD3Valid(UChar32 c) {
 670     return c==0x2260 || c==0x226E || c==0x226F;
 671 }
 672
 673 // Replace the label in dest with the label string, if the label was modified.
 674 // If &label==&dest then the label was modified in-place and labelLength
 675 // is the new label length, different from label.length().
 676 // If &label!=&dest then labelLength==label.length().
 677 // Returns labelLength (= the new label length).
 678 static int32_t
 679 replaceLabel(UnicodeString &dest, int32_t destLabelStart, int32_t destLabelLength,
 680              const UnicodeString &label, int32_t labelLength) {
 681     if(&label!=&dest) {
 682         dest.replace(destLabelStart, destLabelLength, label);
 683     }
 684     return labelLength;
 685 }
 686
 687 int32_t
 688 UTS46::processLabel(UnicodeString &dest,
 689                     int32_t labelStart, int32_t labelLength,
 690                     UBool toASCII,
 691                     IDNAInfo &info, UErrorCode &errorCode) const {
 692     UnicodeString fromPunycode;
 693     UnicodeString *labelString;
 694     const UChar *label=dest.getBuffer()+labelStart;
 695     int32_t destLabelStart=labelStart;
 696     int32_t destLabelLength=labelLength;
 697     UBool wasPunycode;
 698     if(labelLength>=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && label[3]==0x2d) {
 699         // Label starts with "xn--", try to un-Punycode it.
 700         wasPunycode=TRUE;
 701         UChar *unicodeBuffer=fromPunycode.getBuffer(-1);  // capacity==-1: most labels should fit
 702         if(unicodeBuffer==NULL) {
 703             // Should never occur if we used capacity==-1 which uses the internal buffer.
 704             errorCode=U_MEMORY_ALLOCATION_ERROR;
 705             return labelLength;
 706         }
 707         UErrorCode punycodeErrorCode=U_ZERO_ERROR;
 708         int32_t unicodeLength=u_strFromPunycode(label+4, labelLength-4,
 709                                                 unicodeBuffer, fromPunycode.getCapacity(),
 710                                                 NULL, &punycodeErrorCode);
 711         if(punycodeErrorCode==U_BUFFER_OVERFLOW_ERROR) {
 712             fromPunycode.releaseBuffer(0);
 713             unicodeBuffer=fromPunycode.getBuffer(unicodeLength);
 714             if(unicodeBuffer==NULL) {
 715                 errorCode=U_MEMORY_ALLOCATION_ERROR;
 716                 return labelLength;
 717             }
 718             punycodeErrorCode=U_ZERO_ERROR;
 719             unicodeLength=u_strFromPunycode(label+4, labelLength-4,
 720                                             unicodeBuffer, fromPunycode.getCapacity(),
 721                                             NULL, &punycodeErrorCode);
 722         }
 723         fromPunycode.releaseBuffer(unicodeLength);
 724         if(U_FAILURE(punycodeErrorCode)) {
 725             info.labelErrors|=UIDNA_ERROR_PUNYCODE;
 726             return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
 727         }
 728         // Check for NFC, and for characters that are not
 729         // valid or deviation characters according to the normalizer.
 730         // If there is something wrong, then the string will change.
 731         // Note that the normalizer passes through non-LDH ASCII and deviation characters.
 732         // Deviation characters are ok in Punycode even in transitional processing.
 733         // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
 734         // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
 735         UBool isValid=uts46Norm2.isNormalized(fromPunycode, errorCode);
 736         if(U_FAILURE(errorCode)) {
 737             return labelLength;
 738         }
 739         if(!isValid) {
 740             info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
 741             return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
 742         }
 743         labelString=&fromPunycode;
 744         label=fromPunycode.getBuffer();
 745         labelStart=0;
 746         labelLength=fromPunycode.length();
 747     } else {
 748         wasPunycode=FALSE;
 749         labelString=&dest;
 750     }
 751     // Validity check
 752     if(labelLength==0) {
 753         if(toASCII) {
 754             info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
 755         }
 756         return replaceLabel(dest, destLabelStart, destLabelLength, *labelString, labelLength);
 757     }
 758     // labelLength>0
 759     if(labelLength>=4 && label[2]==0x2d && label[3]==0x2d) {
 760         // label starts with "??--"
 761         info.labelErrors|=UIDNA_ERROR_HYPHEN_3_4;
 762     }
 763     if(label[0]==0x2d) {
 764         // label starts with "-"
 765         info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
 766     }
 767     if(label[labelLength-1]==0x2d) {
 768         // label ends with "-"
 769         info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
 770     }
 771     // If the label was not a Punycode label, then it was the result of
 772     // mapping, normalization and label segmentation.
 773     // If the label was in Punycode, then we mapped it again above
 774     // and checked its validity.
 775     // Now we handle the STD3 restriction to LDH characters (if set)
 776     // and we look for U+FFFD which indicates disallowed characters
 777     // in a non-Punycode label or U+FFFD itself in a Punycode label.
 778     // We also check for dots which can come from the input to a single-label function.
 779     // Ok to cast away const because we own the UnicodeString.
 780     UChar *s=(UChar *)label;
 781     const UChar *limit=label+labelLength;
 782     UChar oredChars=0;
 783     // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
 784     UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
 785     do {
 786         UChar c=*s;
 787         if(c<=0x7f) {
 788             if(c==0x2e) {
 789                 info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;
 790                 *s=0xfffd;
 791             } else if(disallowNonLDHDot && asciiData[c]<0) {
 792                 info.labelErrors|=UIDNA_ERROR_DISALLOWED;
 793                 *s=0xfffd;
 794             }
 795         } else {
 796             oredChars|=c;
 797             if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) {
 798                 info.labelErrors|=UIDNA_ERROR_DISALLOWED;
 799                 *s=0xfffd;
 800             } else if(c==0xfffd) {
 801                 info.labelErrors|=UIDNA_ERROR_DISALLOWED;
 802             }
 803         }
 804         ++s;
 805     } while(s<limit);
 806     // Check for a leading combining mark after other validity checks
 807     // so that we don't report UIDNA_ERROR_DISALLOWED for the U+FFFD from here.
 808     UChar32 c;
 809     int32_t cpLength=0;
 810     // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD.
 811     U16_NEXT_UNSAFE(label, cpLength, c);
 812     if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) {
 813         info.labelErrors|=UIDNA_ERROR_LEADING_COMBINING_MARK;
 814         labelString->replace(labelStart, cpLength, (UChar)0xfffd);
 815         label=labelString->getBuffer()+labelStart;
 816         labelLength+=1-cpLength;
 817         if(labelString==&dest) {
 818             destLabelLength=labelLength;
 819         }
 820     }
 821     if((info.labelErrors&severeErrors)==0) {
 822         // Do contextual checks only if we do not have U+FFFD from a severe error
 823         // because U+FFFD can make these checks fail.
 824         if((options&UIDNA_CHECK_BIDI)!=0 && (!info.isBiDi || info.isOkBiDi)) {
 825             checkLabelBiDi(label, labelLength, info);
 826         }
 827         if( (options&UIDNA_CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c &&
 828             !isLabelOkContextJ(label, labelLength)
 829         ) {
 830             info.labelErrors|=UIDNA_ERROR_CONTEXTJ;
 831         }
 832         if((options&UIDNA_CHECK_CONTEXTO)!=0 && oredChars>=0xb7) {
 833             checkLabelContextO(label, labelLength, info);
 834         }
 835         if(toASCII) {
 836             if(wasPunycode) {
 837                 // Leave a Punycode label unchanged if it has no severe errors.
 838                 if(destLabelLength>63) {
 839                     info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
 840                 }
 841                 return destLabelLength;
 842             } else if(oredChars>=0x80) {
 843                 // Contains non-ASCII characters.
 844                 UnicodeString punycode;
 845                 UChar *buffer=punycode.getBuffer(63);  // 63==maximum DNS label length
 846                 if(buffer==NULL) {
 847                     errorCode=U_MEMORY_ALLOCATION_ERROR;
 848                     return destLabelLength;
 849                 }
 850                 buffer[0]=0x78;  // Write "xn--".
 851                 buffer[1]=0x6e;
 852                 buffer[2]=0x2d;
 853                 buffer[3]=0x2d;
 854                 int32_t punycodeLength=u_strToPunycode(label, labelLength,
 855                                                       buffer+4, punycode.getCapacity()-4,
 856                                                       NULL, &errorCode);
 857                 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
 858                     errorCode=U_ZERO_ERROR;
 859                     punycode.releaseBuffer(4);
 860                     buffer=punycode.getBuffer(4+punycodeLength);
 861                     if(buffer==NULL) {
 862                         errorCode=U_MEMORY_ALLOCATION_ERROR;
 863                         return destLabelLength;
 864                     }
 865                     punycodeLength=u_strToPunycode(label, labelLength,
 866                                                   buffer+4, punycode.getCapacity()-4,
 867                                                   NULL, &errorCode);
 868                 }
 869                 punycodeLength+=4;
 870                 punycode.releaseBuffer(punycodeLength);
 871                 if(U_FAILURE(errorCode)) {
 872                     return destLabelLength;
 873                 }
 874                 if(punycodeLength>63) {
 875                     info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
 876                 }
 877                 return replaceLabel(dest, destLabelStart, destLabelLength,
 878                                     punycode, punycodeLength);
 879             } else {
 880                 // all-ASCII label
 881                 if(labelLength>63) {
 882                     info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
 883                 }
 884             }
 885         }
 886     } else {
 887         // If a Punycode label has severe errors,
 888         // then leave it but make sure it does not look valid.
 889         if(wasPunycode) {
 890             info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
 891             return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info);
 892         }
 893     }
 894     return replaceLabel(dest, destLabelStart, destLabelLength, *labelString, labelLength);
 895 }
 896
 897 // Make sure an ACE label does not look valid.
 898 // Append U+FFFD if the label has only LDH characters.
 899 // If UIDNA_USE_STD3_RULES, also replace disallowed ASCII characters with U+FFFD.
 900 int32_t
 901 UTS46::markBadACELabel(UnicodeString &dest,
 902                        int32_t labelStart, int32_t labelLength,
 903                        UBool toASCII, IDNAInfo &info) const {
 904     UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
 905     UBool isASCII=TRUE;
 906     UBool onlyLDH=TRUE;
 907     const UChar *label=dest.getBuffer()+labelStart;
 908     // Ok to cast away const because we own the UnicodeString.
 909     UChar *s=(UChar *)label+4;  // After the initial "xn--".
 910     const UChar *limit=label+labelLength;
 911     do {
 912         UChar c=*s;
 913         if(c<=0x7f) {
 914             if(c==0x2e) {
 915                 info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;
 916                 *s=0xfffd;
 917                 isASCII=onlyLDH=FALSE;
 918             } else if(asciiData[c]<0) {
 919                 onlyLDH=FALSE;
 920                 if(disallowNonLDHDot) {
 921                     *s=0xfffd;
 922                     isASCII=FALSE;
 923                 }
 924             }
 925         } else {
 926             isASCII=onlyLDH=FALSE;
 927         }
 928     } while(++s<limit);
 929     if(onlyLDH) {
 930         dest.insert(labelStart+labelLength, (UChar)0xfffd);
 931         ++labelLength;
 932     } else {
 933         if(toASCII && isASCII && labelLength>63) {
 934             info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
 935         }
 936     }
 937     return labelLength;
 938 }
 939
 940 const uint32_t L_MASK=U_MASK(U_LEFT_TO_RIGHT);
 941 const uint32_t R_AL_MASK=U_MASK(U_RIGHT_TO_LEFT)|U_MASK(U_RIGHT_TO_LEFT_ARABIC);
 942 const uint32_t L_R_AL_MASK=L_MASK|R_AL_MASK;
 943
 944 const uint32_t R_AL_AN_MASK=R_AL_MASK|U_MASK(U_ARABIC_NUMBER);
 945
 946 const uint32_t EN_AN_MASK=U_MASK(U_EUROPEAN_NUMBER)|U_MASK(U_ARABIC_NUMBER);
 947 const uint32_t R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK;
 948 const uint32_t L_EN_MASK=L_MASK|U_MASK(U_EUROPEAN_NUMBER);
 949
 950 const uint32_t ES_CS_ET_ON_BN_NSM_MASK=
 951     U_MASK(U_EUROPEAN_NUMBER_SEPARATOR)|
 952     U_MASK(U_COMMON_NUMBER_SEPARATOR)|
 953     U_MASK(U_EUROPEAN_NUMBER_TERMINATOR)|
 954     U_MASK(U_OTHER_NEUTRAL)|
 955     U_MASK(U_BOUNDARY_NEUTRAL)|
 956     U_MASK(U_DIR_NON_SPACING_MARK);
 957 const uint32_t L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
 958 const uint32_t R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
 959
 960 // We scan the whole label and check both for whether it contains RTL characters
 961 // and whether it passes the BiDi Rule.
 962 // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find
 963 // that a domain name is a BiDi domain name (has an RTL label) only after
 964 // processing several earlier labels.
 965 void
 966 UTS46::checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const {
 967     // IDNA2008 BiDi rule
 968     // Get the directionality of the first character.
 969     UChar32 c;
 970     int32_t i=0;
 971     U16_NEXT_UNSAFE(label, i, c);
 972     uint32_t firstMask=U_MASK(u_charDirection(c));
 973     // 1. The first character must be a character with BIDI property L, R
 974     // or AL.  If it has the R or AL property, it is an RTL label; if it
 975     // has the L property, it is an LTR label.
 976     if((firstMask&~L_R_AL_MASK)!=0) {
 977         info.isOkBiDi=FALSE;
 978     }
 979     // Get the directionality of the last non-NSM character.
 980     uint32_t lastMask;
 981     for(;;) {
 982         if(i>=labelLength) {
 983             lastMask=firstMask;
 984             break;
 985         }
 986         U16_PREV_UNSAFE(label, labelLength, c);
 987         UCharDirection dir=u_charDirection(c);
 988         if(dir!=U_DIR_NON_SPACING_MARK) {
 989             lastMask=U_MASK(dir);
 990             break;
 991         }
 992     }
 993     // 3. In an RTL label, the end of the label must be a character with
 994     // BIDI property R, AL, EN or AN, followed by zero or more
 995     // characters with BIDI property NSM.
 996     // 6. In an LTR label, the end of the label must be a character with
 997     // BIDI property L or EN, followed by zero or more characters with
 998     // BIDI property NSM.
 999     if( (firstMask&L_MASK)!=0 ?
1000             (lastMask&~L_EN_MASK)!=0 :
1001             (lastMask&~R_AL_EN_AN_MASK)!=0
1002     ) {
1003         info.isOkBiDi=FALSE;
1004     }
1005     // Get the directionalities of the intervening characters.
1006     uint32_t mask=0;
1007     while(i<labelLength) {
1008         U16_NEXT_UNSAFE(label, i, c);
1009         mask|=U_MASK(u_charDirection(c));
1010     }
1011     if(firstMask&L_MASK) {
1012         // 5. In an LTR label, only characters with the BIDI properties L, EN,
1013         // ES, CS, ET, ON, BN and NSM are allowed.
1014         if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
1015             info.isOkBiDi=FALSE;
1016         }
1017     } else {
1018         // 2. In an RTL label, only characters with the BIDI properties R, AL,
1019         // AN, EN, ES, CS, ET, ON, BN and NSM are allowed.
1020         if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
1021             info.isOkBiDi=FALSE;
1022         }
1023         // 4. In an RTL label, if an EN is present, no AN may be present, and
1024         // vice versa.
1025         if((mask&EN_AN_MASK)==EN_AN_MASK) {
1026             info.isOkBiDi=FALSE;
1027         }
1028     }
1029     // An RTL label is a label that contains at least one character of type
1030     // R, AL or AN. [...]
1031     // A "BIDI domain name" is a domain name that contains at least one RTL
1032     // label. [...]
1033     // The following rule, consisting of six conditions, applies to labels
1034     // in BIDI domain names.
1035     if(((firstMask|mask|lastMask)&R_AL_AN_MASK)!=0) {
1036         info.isBiDi=TRUE;
1037     }
1038 }
1039
1040 // Special code for the ASCII prefix of a BiDi domain name.
1041 // The ASCII prefix is all-LTR.
1042
1043 // IDNA2008 BiDi rule, parts relevant to ASCII labels:
1044 // 1. The first character must be a character with BIDI property L [...]
1045 // 5. In an LTR label, only characters with the BIDI properties L, EN,
1046 // ES, CS, ET, ON, BN and NSM are allowed.
1047 // 6. In an LTR label, the end of the label must be a character with
1048 // BIDI property L or EN [...]
1049
1050 // UTF-16 version, called for mapped ASCII prefix.
1051 // Cannot contain uppercase A-Z.
1052 // s[length-1] must be the trailing dot.
1053 static UBool
1054 isASCIIOkBiDi(const UChar *s, int32_t length) {
1055     int32_t labelStart=0;
1056     for(int32_t i=0; i<length; ++i) {
1057         UChar c=s[i];
1058         if(c==0x2e) {  // dot
1059             if(i>labelStart) {
1060                 c=s[i-1];
1061                 if(!(0x61<=c && c<=0x7a) && !(0x30<=c && c<=0x39)) {
1062                     // Last character in the label is not an L or EN.
1063                     return FALSE;
1064                 }
1065             }
1066             labelStart=i+1;
1067         } else if(i==labelStart) {
1068             if(!(0x61<=c && c<=0x7a)) {
1069                 // First character in the label is not an L.
1070                 return FALSE;
1071             }
1072         } else {
1073             if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {
1074                 // Intermediate character in the label is a B, S or WS.
1075                 return FALSE;
1076             }
1077         }
1078     }
1079     return TRUE;
1080 }
1081
1082 // UTF-8 version, called for source ASCII prefix.
1083 // Can contain uppercase A-Z.
1084 // s[length-1] must be the trailing dot.
1085 static UBool
1086 isASCIIOkBiDi(const char *s, int32_t length) {
1087     int32_t labelStart=0;
1088     for(int32_t i=0; i<length; ++i) {
1089         char c=s[i];
1090         if(c==0x2e) {  // dot
1091             if(i>labelStart) {
1092                 c=s[i-1];
1093                 if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a) && !(0x30<=c && c<=0x39)) {
1094                     // Last character in the label is not an L or EN.
1095                     return FALSE;
1096                 }
1097             }
1098             labelStart=i+1;
1099         } else if(i==labelStart) {
1100             if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a)) {
1101                 // First character in the label is not an L.
1102                 return FALSE;
1103             }
1104         } else {
1105             if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {
1106                 // Intermediate character in the label is a B, S or WS.
1107                 return FALSE;
1108             }
1109         }
1110     }
1111     return TRUE;
1112 }
1113
1114 UBool
1115 UTS46::isLabelOkContextJ(const UChar *label, int32_t labelLength) const {
1116     const UBiDiProps *bdp=ubidi_getSingleton();
1117     // [IDNA2008-Tables]
1118     // 200C..200D  ; CONTEXTJ    # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
1119     for(int32_t i=0; i<labelLength; ++i) {
1120         if(label[i]==0x200c) {
1121             // Appendix A.1. ZERO WIDTH NON-JOINER
1122             // Rule Set:
1123             //  False;
1124             //  If Canonical_Combining_Class(Before(cp)) .eq.  Virama Then True;
1125             //  If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C
1126             //     (Joining_Type:T)*(Joining_Type:{R,D})) Then True;
1127             if(i==0) {
1128                 return FALSE;
1129             }
1130             UChar32 c;
1131             int32_t j=i;
1132             U16_PREV_UNSAFE(label, j, c);
1133             if(uts46Norm2.getCombiningClass(c)==9) {
1134                 continue;
1135             }
1136             // check precontext (Joining_Type:{L,D})(Joining_Type:T)*
1137             for(;;) {
1138                 UJoiningType type=ubidi_getJoiningType(bdp, c);
1139                 if(type==U_JT_TRANSPARENT) {
1140                     if(j==0) {
1141                         return FALSE;
1142                     }
1143                     U16_PREV_UNSAFE(label, j, c);
1144                 } else if(type==U_JT_LEFT_JOINING || type==U_JT_DUAL_JOINING) {
1145                     break;  // precontext fulfilled
1146                 } else {
1147                     return FALSE;
1148                 }
1149             }
1150             // check postcontext (Joining_Type:T)*(Joining_Type:{R,D})
1151             for(j=i+1;;) {
1152                 if(j==labelLength) {
1153                     return FALSE;
1154                 }
1155                 U16_NEXT_UNSAFE(label, j, c);
1156                 UJoiningType type=ubidi_getJoiningType(bdp, c);
1157                 if(type==U_JT_TRANSPARENT) {
1158                     // just skip this character
1159                 } else if(type==U_JT_RIGHT_JOINING || type==U_JT_DUAL_JOINING) {
1160                     break;  // postcontext fulfilled
1161                 } else {
1162                     return FALSE;
1163                 }
1164             }
1165         } else if(label[i]==0x200d) {
1166             // Appendix A.2. ZERO WIDTH JOINER (U+200D)
1167             // Rule Set:
1168             //  False;
1169             //  If Canonical_Combining_Class(Before(cp)) .eq.  Virama Then True;
1170             if(i==0) {
1171                 return FALSE;
1172             }
1173             UChar32 c;
1174             int32_t j=i;
1175             U16_PREV_UNSAFE(label, j, c);
1176             if(uts46Norm2.getCombiningClass(c)!=9) {
1177                 return FALSE;
1178             }
1179         }
1180     }
1181     return TRUE;
1182 }
1183
1184 void
1185 UTS46::checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const {
1186     int32_t labelEnd=labelLength-1;  // inclusive
1187     int32_t arabicDigits=0;  // -1 for 066x, +1 for 06Fx
1188     for(int32_t i=0; i<=labelEnd; ++i) {
1189         UChar32 c=label[i];
1190         if(c<0xb7) {
1191             // ASCII fastpath
1192         } else if(c<=0x6f9) {
1193             if(c==0xb7) {
1194                 // Appendix A.3. MIDDLE DOT (U+00B7)
1195                 // Rule Set:
1196                 //  False;
1197                 //  If Before(cp) .eq.  U+006C And
1198                 //     After(cp) .eq.  U+006C Then True;
1199                 if(!(0<i && label[i-1]==0x6c &&
1200                      i<labelEnd && label[i+1]==0x6c)) {
1201                     info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1202                 }
1203             } else if(c==0x375) {
1204                 // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
1205                 // Rule Set:
1206                 //  False;
1207                 //  If Script(After(cp)) .eq.  Greek Then True;
1208                 UScriptCode script=USCRIPT_INVALID_CODE;
1209                 if(i<labelEnd) {
1210                     UErrorCode errorCode=U_ZERO_ERROR;
1211                     int32_t j=i+1;
1212                     U16_NEXT(label, j, labelLength, c);
1213                     script=uscript_getScript(c, &errorCode);
1214                 }
1215                 if(script!=USCRIPT_GREEK) {
1216                     info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1217                 }
1218             } else if(c==0x5f3 || c==0x5f4) {
1219                 // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
1220                 // Rule Set:
1221                 //  False;
1222                 //  If Script(Before(cp)) .eq.  Hebrew Then True;
1223                 //
1224                 // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
1225                 // Rule Set:
1226                 //  False;
1227                 //  If Script(Before(cp)) .eq.  Hebrew Then True;
1228                 UScriptCode script=USCRIPT_INVALID_CODE;
1229                 if(0<i) {
1230                     UErrorCode errorCode=U_ZERO_ERROR;
1231                     int32_t j=i;
1232                     U16_PREV(label, 0, j, c);
1233                     script=uscript_getScript(c, &errorCode);
1234                 }
1235                 if(script!=USCRIPT_HEBREW) {
1236                     info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1237                 }
1238             } else if(0x660<=c /* && c<=0x6f9 */) {
1239                 // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
1240                 // Rule Set:
1241                 //  True;
1242                 //  For All Characters:
1243                 //    If cp .in. 06F0..06F9 Then False;
1244                 //  End For;
1245                 //
1246                 // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
1247                 // Rule Set:
1248                 //  True;
1249                 //  For All Characters:
1250                 //    If cp .in. 0660..0669 Then False;
1251                 //  End For;
1252                 if(c<=0x669) {
1253                     if(arabicDigits>0) {
1254                         info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS;
1255                     }
1256                     arabicDigits=-1;
1257                 } else if(0x6f0<=c) {
1258                     if(arabicDigits<0) {
1259                         info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS;
1260                     }
1261                     arabicDigits=1;
1262                 }
1263             }
1264         } else if(c==0x30fb) {
1265             // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
1266             // Rule Set:
1267             //  False;
1268             //  For All Characters:
1269             //    If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
1270             //  End For;
1271             UErrorCode errorCode=U_ZERO_ERROR;
1272             for(int j=0;;) {
1273                 if(j>labelEnd) {
1274                     info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1275                     break;
1276                 }
1277                 U16_NEXT(label, j, labelLength, c);
1278                 UScriptCode script=uscript_getScript(c, &errorCode);
1279                 if(script==USCRIPT_HIRAGANA || script==USCRIPT_KATAKANA || script==USCRIPT_HAN) {
1280                     break;
1281                 }
1282             }
1283         }
1284     }
1285 }
1286
1287 U_NAMESPACE_END
1288
1289 // C API ------------------------------------------------------------------- ***
1290
1291 U_NAMESPACE_USE
1292
1293 U_DRAFT UIDNA * U_EXPORT2
1294 uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode) {
1295     return reinterpret_cast<UIDNA *>(IDNA::createUTS46Instance(options, *pErrorCode));
1296 }
1297
1298 U_DRAFT void U_EXPORT2
1299 uidna_close(UIDNA *idna) {
1300     delete reinterpret_cast<IDNA *>(idna);
1301 }
1302
1303 static UBool
1304 checkArgs(const void *label, int32_t length,
1305           void *dest, int32_t capacity,
1306           UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1307     if(U_FAILURE(*pErrorCode)) {
1308         return FALSE;
1309     }
1310     // sizeof(UIDNAInfo)=16 in the first API version.
1311     if(pInfo==NULL || pInfo->size<16) {
1312         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1313         return FALSE;
1314     }
1315     if( (label==NULL ? length!=0 : length<-1) ||
1316         (dest==NULL ? capacity!=0 : capacity<0) ||
1317         (dest==label && label!=NULL)
1318     ) {
1319         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1320         return FALSE;
1321     }
1322     // Set all *pInfo bytes to 0 except for the size field itself.
1323     uprv_memset(&pInfo->size+1, 0, pInfo->size-sizeof(pInfo->size));
1324     return TRUE;
1325 }
1326
1327 static void
1328 idnaInfoToStruct(IDNAInfo &info, UIDNAInfo *pInfo) {
1329     pInfo->isTransitionalDifferent=info.isTransitionalDifferent();
1330     pInfo->errors=info.getErrors();
1331 }
1332
1333 U_DRAFT int32_t U_EXPORT2
1334 uidna_labelToASCII(const UIDNA *idna,
1335                    const UChar *label, int32_t length,
1336                    UChar *dest, int32_t capacity,
1337                    UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1338     if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1339         return 0;
1340     }
1341     UnicodeString src((UBool)(length<0), label, length);
1342     UnicodeString destString(dest, 0, capacity);
1343     IDNAInfo info;
1344     reinterpret_cast<const IDNA *>(idna)->labelToASCII(src, destString, info, *pErrorCode);
1345     idnaInfoToStruct(info, pInfo);
1346     return destString.extract(dest, capacity, *pErrorCode);
1347 }
1348
1349 U_DRAFT int32_t U_EXPORT2
1350 uidna_labelToUnicode(const UIDNA *idna,
1351                      const UChar *label, int32_t length,
1352                      UChar *dest, int32_t capacity,
1353                      UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1354     if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1355         return 0;
1356     }
1357     UnicodeString src((UBool)(length<0), label, length);
1358     UnicodeString destString(dest, 0, capacity);
1359     IDNAInfo info;
1360     reinterpret_cast<const IDNA *>(idna)->labelToUnicode(src, destString, info, *pErrorCode);
1361     idnaInfoToStruct(info, pInfo);
1362     return destString.extract(dest, capacity, *pErrorCode);
1363 }
1364
1365 U_DRAFT int32_t U_EXPORT2
1366 uidna_nameToASCII(const UIDNA *idna,
1367                   const UChar *name, int32_t length,
1368                   UChar *dest, int32_t capacity,
1369                   UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1370     if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1371         return 0;
1372     }
1373     UnicodeString src((UBool)(length<0), name, length);
1374     UnicodeString destString(dest, 0, capacity);
1375     IDNAInfo info;
1376     reinterpret_cast<const IDNA *>(idna)->nameToASCII(src, destString, info, *pErrorCode);
1377     idnaInfoToStruct(info, pInfo);
1378     return destString.extract(dest, capacity, *pErrorCode);
1379 }
1380
1381 U_DRAFT int32_t U_EXPORT2
1382 uidna_nameToUnicode(const UIDNA *idna,
1383                     const UChar *name, int32_t length,
1384                     UChar *dest, int32_t capacity,
1385                     UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1386     if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1387         return 0;
1388     }
1389     UnicodeString src((UBool)(length<0), name, length);
1390     UnicodeString destString(dest, 0, capacity);
1391     IDNAInfo info;
1392     reinterpret_cast<const IDNA *>(idna)->nameToUnicode(src, destString, info, *pErrorCode);
1393     idnaInfoToStruct(info, pInfo);
1394     return destString.extract(dest, capacity, *pErrorCode);
1395 }
1396
1397 U_DRAFT int32_t U_EXPORT2
1398 uidna_labelToASCII_UTF8(const UIDNA *idna,
1399                         const char *label, int32_t length,
1400                         char *dest, int32_t capacity,
1401                         UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1402     if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1403         return 0;
1404     }
1405     StringPiece src(label, length<0 ? uprv_strlen(label) : length);
1406     CheckedArrayByteSink sink(dest, capacity);
1407     IDNAInfo info;
1408     reinterpret_cast<const IDNA *>(idna)->labelToASCII_UTF8(src, sink, info, *pErrorCode);
1409     idnaInfoToStruct(info, pInfo);
1410     return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
1411 }
1412
1413 U_DRAFT int32_t U_EXPORT2
1414 uidna_labelToUnicodeUTF8(const UIDNA *idna,
1415                          const char *label, int32_t length,
1416                          char *dest, int32_t capacity,
1417                          UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1418     if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1419         return 0;
1420     }
1421     StringPiece src(label, length<0 ? uprv_strlen(label) : length);
1422     CheckedArrayByteSink sink(dest, capacity);
1423     IDNAInfo info;
1424     reinterpret_cast<const IDNA *>(idna)->labelToUnicodeUTF8(src, sink, info, *pErrorCode);
1425     idnaInfoToStruct(info, pInfo);
1426     return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
1427 }
1428
1429 U_DRAFT int32_t U_EXPORT2
1430 uidna_nameToASCII_UTF8(const UIDNA *idna,
1431                        const char *name, int32_t length,
1432                        char *dest, int32_t capacity,
1433                        UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1434     if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1435         return 0;
1436     }
1437     StringPiece src(name, length<0 ? uprv_strlen(name) : length);
1438     CheckedArrayByteSink sink(dest, capacity);
1439     IDNAInfo info;
1440     reinterpret_cast<const IDNA *>(idna)->nameToASCII_UTF8(src, sink, info, *pErrorCode);
1441     idnaInfoToStruct(info, pInfo);
1442     return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
1443 }
1444
1445 U_DRAFT int32_t U_EXPORT2
1446 uidna_nameToUnicodeUTF8(const UIDNA *idna,
1447                         const char *name, int32_t length,
1448                         char *dest, int32_t capacity,
1449                         UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1450     if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1451         return 0;
1452     }
1453     StringPiece src(name, length<0 ? uprv_strlen(name) : length);
1454     CheckedArrayByteSink sink(dest, capacity);
1455     IDNAInfo info;
1456     reinterpret_cast<const IDNA *>(idna)->nameToUnicodeUTF8(src, sink, info, *pErrorCode);
1457     idnaInfoToStruct(info, pInfo);
1458     return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
1459 }
1460
1461 #endif  // UCONFIG_NO_IDNA