icuSources/common/punycode.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 *
   6 *   Copyright (C) 2002-2011, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 *******************************************************************************
  10 *   file name:  punycode.cpp
  11 *   encoding:   UTF-8
  12 *   tab size:   8 (not used)
  13 *   indentation:4
  14 *
  15 *   created on: 2002jan31
  16 *   created by: Markus W. Scherer
  17 */
  18
  19
  20 /* This ICU code derived from: */
  21 /*
  22 punycode.c 0.4.0 (2001-Nov-17-Sat)
  23 http://www.cs.berkeley.edu/~amc/idn/
  24 Adam M. Costello
  25 http://www.nicemice.net/amc/
  26
  27 Disclaimer and license
  28
  29     Regarding this entire document or any portion of it (including
  30     the pseudocode and C code), the author makes no guarantees and
  31     is not responsible for any damage resulting from its use.  The
  32     author grants irrevocable permission to anyone to use, modify,
  33     and distribute it in any way that does not diminish the rights
  34     of anyone else to use, modify, and distribute it, provided that
  35     redistributed derivative works do not contain misleading author or
  36     version information.  Derivative works need not be licensed under
  37     similar terms.
  38 */
  39 /*
  40  * ICU modifications:
  41  * - ICU data types and coding conventions
  42  * - ICU string buffer handling with implicit source lengths
  43  *   and destination preflighting
  44  * - UTF-16 handling
  45  */
  46
  47 #include "unicode/utypes.h"
  48
  49 #if !UCONFIG_NO_IDNA
  50
  51 #include "unicode/ustring.h"
  52 #include "unicode/utf.h"
  53 #include "unicode/utf16.h"
  54 #include "ustr_imp.h"
  55 #include "cstring.h"
  56 #include "cmemory.h"
  57 #include "punycode.h"
  58 #include "uassert.h"
  59
  60
  61 /* Punycode ----------------------------------------------------------------- */
  62
  63 /* Punycode parameters for Bootstring */
  64 #define BASE            36
  65 #define TMIN            1
  66 #define TMAX            26
  67 #define SKEW            38
  68 #define DAMP            700
  69 #define INITIAL_BIAS    72
  70 #define INITIAL_N       0x80
  71
  72 /* "Basic" Unicode/ASCII code points */
  73 #define _HYPHEN         0X2d
  74 #define DELIMITER       _HYPHEN
  75
  76 #define _ZERO_          0X30
  77 #define _NINE           0x39
  78
  79 #define _SMALL_A        0X61
  80 #define _SMALL_Z        0X7a
  81
  82 #define _CAPITAL_A      0X41
  83 #define _CAPITAL_Z      0X5a
  84
  85 #define IS_BASIC(c) ((c)<0x80)
  86 #define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
  87
  88 /**
  89  * digitToBasic() returns the basic code point whose value
  90  * (when used for representing integers) is d, which must be in the
  91  * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
  92  * nonzero, in which case the uppercase form is used.
  93  */
  94 static inline char
  95 digitToBasic(int32_t digit, UBool uppercase) {
  96     /*  0..25 map to ASCII a..z or A..Z */
  97     /* 26..35 map to ASCII 0..9         */
  98     if(digit<26) {
  99         if(uppercase) {
 100             return (char)(_CAPITAL_A+digit);
 101         } else {
 102             return (char)(_SMALL_A+digit);
 103         }
 104     } else {
 105         return (char)((_ZERO_-26)+digit);
 106     }
 107 }
 108
 109 /**
 110  * basicToDigit[] contains the numeric value of a basic code
 111  * point (for use in representing integers) in the range 0 to
 112  * BASE-1, or -1 if b is does not represent a value.
 113  */
 114 static const int8_t
 115 basicToDigit[256]={
 116     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 117     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 118
 119     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 120     26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
 121
 122     -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
 123     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
 124
 125     -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
 126     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
 127
 128     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 129     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 130
 131     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 132     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 133
 134     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 135     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 136
 137     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 138     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
 139 };
 140
 141 static inline char
 142 asciiCaseMap(char b, UBool uppercase) {
 143     if(uppercase) {
 144         if(_SMALL_A<=b && b<=_SMALL_Z) {
 145             b-=(_SMALL_A-_CAPITAL_A);
 146         }
 147     } else {
 148         if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
 149             b+=(_SMALL_A-_CAPITAL_A);
 150         }
 151     }
 152     return b;
 153 }
 154
 155 /* Punycode-specific Bootstring code ---------------------------------------- */
 156
 157 /*
 158  * The following code omits the {parts} of the pseudo-algorithm in the spec
 159  * that are not used with the Punycode parameter set.
 160  */
 161
 162 /* Bias adaptation function. */
 163 static int32_t
 164 adaptBias(int32_t delta, int32_t length, UBool firstTime) {
 165     int32_t count;
 166
 167     if(firstTime) {
 168         delta/=DAMP;
 169     } else {
 170         delta/=2;
 171     }
 172
 173     delta+=delta/length;
 174     for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
 175         delta/=(BASE-TMIN);
 176     }
 177
 178     return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
 179 }
 180
 181 #define MAX_CP_COUNT    200
 182
 183 U_CFUNC int32_t
 184 u_strToPunycode(const UChar *src, int32_t srcLength,
 185                 UChar *dest, int32_t destCapacity,
 186                 const UBool *caseFlags,
 187                 UErrorCode *pErrorCode) {
 188
 189     int32_t cpBuffer[MAX_CP_COUNT];
 190     int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
 191     UChar c, c2;
 192
 193     /* argument checking */
 194     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 195         return 0;
 196     }
 197
 198     if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
 199         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 200         return 0;
 201     }
 202
 203     /*
 204      * Handle the basic code points and
 205      * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
 206      */
 207     srcCPCount=destLength=0;
 208     if(srcLength==-1) {
 209         /* NUL-terminated input */
 210         for(j=0; /* no condition */; ++j) {
 211             if((c=src[j])==0) {
 212                 break;
 213             }
 214             if(srcCPCount==MAX_CP_COUNT) {
 215                 /* too many input code points */
 216                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 217                 return 0;
 218             }
 219             if(IS_BASIC(c)) {
 220                 cpBuffer[srcCPCount++]=0;
 221                 if(destLength<destCapacity) {
 222                     dest[destLength]=
 223                         caseFlags!=NULL ?
 224                             asciiCaseMap((char)c, caseFlags[j]) :
 225                             (char)c;
 226                 }
 227                 ++destLength;
 228             } else {
 229                 n=(caseFlags!=NULL && caseFlags[j])<<31L;
 230                 if(U16_IS_SINGLE(c)) {
 231                     n|=c;
 232                 } else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
 233                     ++j;
 234                     n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
 235                 } else {
 236                     /* error: unmatched surrogate */
 237                     *pErrorCode=U_INVALID_CHAR_FOUND;
 238                     return 0;
 239                 }
 240                 cpBuffer[srcCPCount++]=n;
 241             }
 242         }
 243     } else {
 244         /* length-specified input */
 245         for(j=0; j<srcLength; ++j) {
 246             if(srcCPCount==MAX_CP_COUNT) {
 247                 /* too many input code points */
 248                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 249                 return 0;
 250             }
 251             c=src[j];
 252             if(IS_BASIC(c)) {
 253                 cpBuffer[srcCPCount++]=0;
 254                 if(destLength<destCapacity) {
 255                     dest[destLength]=
 256                         caseFlags!=NULL ?
 257                             asciiCaseMap((char)c, caseFlags[j]) :
 258                             (char)c;
 259                 }
 260                 ++destLength;
 261             } else {
 262                 n=(caseFlags!=NULL && caseFlags[j])<<31L;
 263                 if(U16_IS_SINGLE(c)) {
 264                     n|=c;
 265                 } else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
 266                     ++j;
 267                     n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
 268                 } else {
 269                     /* error: unmatched surrogate */
 270                     *pErrorCode=U_INVALID_CHAR_FOUND;
 271                     return 0;
 272                 }
 273                 cpBuffer[srcCPCount++]=n;
 274             }
 275         }
 276     }
 277
 278     /* Finish the basic string - if it is not empty - with a delimiter. */
 279     basicLength=destLength;
 280     if(basicLength>0) {
 281         if(destLength<destCapacity) {
 282             dest[destLength]=DELIMITER;
 283         }
 284         ++destLength;
 285     }
 286
 287     /*
 288      * handledCPCount is the number of code points that have been handled
 289      * basicLength is the number of basic code points
 290      * destLength is the number of chars that have been output
 291      */
 292
 293     /* Initialize the state: */
 294     n=INITIAL_N;
 295     delta=0;
 296     bias=INITIAL_BIAS;
 297
 298     /* Main encoding loop: */
 299     for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
 300         /*
 301          * All non-basic code points < n have been handled already.
 302          * Find the next larger one:
 303          */
 304         for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
 305             q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
 306             if(n<=q && q<m) {
 307                 m=q;
 308             }
 309         }
 310
 311         /*
 312          * Increase delta enough to advance the decoder's
 313          * <n,i> state to <m,0>, but guard against overflow:
 314          */
 315         if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
 316             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
 317             return 0;
 318         }
 319         delta+=(m-n)*(handledCPCount+1);
 320         n=m;
 321
 322         /* Encode a sequence of same code points n */
 323         for(j=0; j<srcCPCount; ++j) {
 324             q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
 325             if(q<n) {
 326                 ++delta;
 327             } else if(q==n) {
 328                 /* Represent delta as a generalized variable-length integer: */
 329                 for(q=delta, k=BASE; /* no condition */; k+=BASE) {
 330
 331                     /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
 332
 333                     t=k-bias;
 334                     if(t<TMIN) {
 335                         t=TMIN;
 336                     } else if(t>TMAX) {
 337                         t=TMAX;
 338                     }
 339                     */
 340
 341                     t=k-bias;
 342                     if(t<TMIN) {
 343                         t=TMIN;
 344                     } else if(k>=(bias+TMAX)) {
 345                         t=TMAX;
 346                     }
 347
 348                     if(q<t) {
 349                         break;
 350                     }
 351
 352                     if(destLength<destCapacity) {
 353                         dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
 354                     }
 355                     ++destLength;
 356                     q=(q-t)/(BASE-t);
 357                 }
 358
 359                 if(destLength<destCapacity) {
 360                     dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
 361                 }
 362                 ++destLength;
 363                 bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
 364                 delta=0;
 365                 ++handledCPCount;
 366             }
 367         }
 368
 369         ++delta;
 370         ++n;
 371     }
 372
 373     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
 374 }
 375
 376 U_CFUNC int32_t
 377 u_strFromPunycode(const UChar *src, int32_t srcLength,
 378                   UChar *dest, int32_t destCapacity,
 379                   UBool *caseFlags,
 380                   UErrorCode *pErrorCode) {
 381     int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
 382             destCPCount, firstSupplementaryIndex, cpLength;
 383     UChar b;
 384
 385     /* argument checking */
 386     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 387         return 0;
 388     }
 389
 390     if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
 391         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 392         return 0;
 393     }
 394
 395     if(srcLength==-1) {
 396         srcLength=u_strlen(src);
 397     }
 398
 399     /*
 400      * Handle the basic code points:
 401      * Let basicLength be the number of input code points
 402      * before the last delimiter, or 0 if there is none,
 403      * then copy the first basicLength code points to the output.
 404      *
 405      * The two following loops iterate backward.
 406      */
 407     for(j=srcLength; j>0;) {
 408         if(src[--j]==DELIMITER) {
 409             break;
 410         }
 411     }
 412     destLength=basicLength=destCPCount=j;
 413     U_ASSERT(destLength>=0);
 414
 415     while(j>0) {
 416         b=src[--j];
 417         if(!IS_BASIC(b)) {
 418             *pErrorCode=U_INVALID_CHAR_FOUND;
 419             return 0;
 420         }
 421
 422         if(j<destCapacity) {
 423             dest[j]=(UChar)b;
 424
 425             if(caseFlags!=NULL) {
 426                 caseFlags[j]=IS_BASIC_UPPERCASE(b);
 427             }
 428         }
 429     }
 430
 431     /* Initialize the state: */
 432     n=INITIAL_N;
 433     i=0;
 434     bias=INITIAL_BIAS;
 435     firstSupplementaryIndex=1000000000;
 436
 437     /*
 438      * Main decoding loop:
 439      * Start just after the last delimiter if any
 440      * basic code points were copied; start at the beginning otherwise.
 441      */
 442     for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
 443         /*
 444          * in is the index of the next character to be consumed, and
 445          * destCPCount is the number of code points in the output array.
 446          *
 447          * Decode a generalized variable-length integer into delta,
 448          * which gets added to i.  The overflow checking is easier
 449          * if we increase i as we go, then subtract off its starting
 450          * value at the end to obtain delta.
 451          */
 452         for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
 453             if(in>=srcLength) {
 454                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 455                 return 0;
 456             }
 457
 458             digit=basicToDigit[(uint8_t)src[in++]];
 459             if(digit<0) {
 460                 *pErrorCode=U_INVALID_CHAR_FOUND;
 461                 return 0;
 462             }
 463             if(digit>(0x7fffffff-i)/w) {
 464                 /* integer overflow */
 465                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 466                 return 0;
 467             }
 468
 469             i+=digit*w;
 470             /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
 471             t=k-bias;
 472             if(t<TMIN) {
 473                 t=TMIN;
 474             } else if(t>TMAX) {
 475                 t=TMAX;
 476             }
 477             */
 478             t=k-bias;
 479             if(t<TMIN) {
 480                 t=TMIN;
 481             } else if(k>=(bias+TMAX)) {
 482                 t=TMAX;
 483             }
 484             if(digit<t) {
 485                 break;
 486             }
 487
 488             if(w>0x7fffffff/(BASE-t)) {
 489                 /* integer overflow */
 490                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 491                 return 0;
 492             }
 493             w*=BASE-t;
 494         }
 495
 496         /*
 497          * Modification from sample code:
 498          * Increments destCPCount here,
 499          * where needed instead of in for() loop tail.
 500          */
 501         ++destCPCount;
 502         bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
 503
 504         /*
 505          * i was supposed to wrap around from (incremented) destCPCount to 0,
 506          * incrementing n each time, so we'll fix that now:
 507          */
 508         if(i/destCPCount>(0x7fffffff-n)) {
 509             /* integer overflow */
 510             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 511             return 0;
 512         }
 513
 514         n+=i/destCPCount;
 515         i%=destCPCount;
 516         /* not needed for Punycode: */
 517         /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
 518
 519         if(n>0x10ffff || U_IS_SURROGATE(n)) {
 520             /* Unicode code point overflow */
 521             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 522             return 0;
 523         }
 524
 525         /* Insert n at position i of the output: */
 526         cpLength=U16_LENGTH(n);
 527         if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
 528             int32_t codeUnitIndex;
 529
 530             /*
 531              * Handle indexes when supplementary code points are present.
 532              *
 533              * In almost all cases, there will be only BMP code points before i
 534              * and even in the entire string.
 535              * This is handled with the same efficiency as with UTF-32.
 536              *
 537              * Only the rare cases with supplementary code points are handled
 538              * more slowly - but not too bad since this is an insertion anyway.
 539              */
 540             if(i<=firstSupplementaryIndex) {
 541                 codeUnitIndex=i;
 542                 if(cpLength>1) {
 543                     firstSupplementaryIndex=codeUnitIndex;
 544                 } else {
 545                     ++firstSupplementaryIndex;
 546                 }
 547             } else {
 548                 codeUnitIndex=firstSupplementaryIndex;
 549                 U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
 550             }
 551
 552             /* use the UChar index codeUnitIndex instead of the code point index i */
 553             if(codeUnitIndex<destLength) {
 554                 uprv_memmove(dest+codeUnitIndex+cpLength,
 555                              dest+codeUnitIndex,
 556                              (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
 557                 if(caseFlags!=NULL) {
 558                     uprv_memmove(caseFlags+codeUnitIndex+cpLength,
 559                                  caseFlags+codeUnitIndex,
 560                                  destLength-codeUnitIndex);
 561                 }
 562             }
 563             if(cpLength==1) {
 564                 /* BMP, insert one code unit */
 565                 dest[codeUnitIndex]=(UChar)n;
 566             } else {
 567                 /* supplementary character, insert two code units */
 568                 dest[codeUnitIndex]=U16_LEAD(n);
 569                 dest[codeUnitIndex+1]=U16_TRAIL(n);
 570             }
 571             if(caseFlags!=NULL) {
 572                 /* Case of last character determines uppercase flag: */
 573                 caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
 574                 if(cpLength==2) {
 575                     caseFlags[codeUnitIndex+1]=FALSE;
 576                 }
 577             }
 578         }
 579         destLength+=cpLength;
 580         U_ASSERT(destLength>=0);
 581         ++i;
 582     }
 583
 584     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
 585 }
 586
 587 /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
 588
 589 #endif /* #if !UCONFIG_NO_IDNA */