icuSources/common/punycode.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2002-2003, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  punycode.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2002jan31
  14 *   created by: Markus W. Scherer
  15 */
  16
  17
  18 /* This ICU code derived from: */
  19 /*
  20 punycode.c 0.4.0 (2001-Nov-17-Sat)
  21 http://www.cs.berkeley.edu/~amc/idn/
  22 Adam M. Costello
  23 http://www.nicemice.net/amc/
  24
  25 Disclaimer and license
  26
  27     Regarding this entire document or any portion of it (including
  28     the pseudocode and C code), the author makes no guarantees and
  29     is not responsible for any damage resulting from its use.  The
  30     author grants irrevocable permission to anyone to use, modify,
  31     and distribute it in any way that does not diminish the rights
  32     of anyone else to use, modify, and distribute it, provided that
  33     redistributed derivative works do not contain misleading author or
  34     version information.  Derivative works need not be licensed under
  35     similar terms.
  36 */
  37 /*
  38  * ICU modifications:
  39  * - ICU data types and coding conventions
  40  * - ICU string buffer handling with implicit source lengths
  41  *   and destination preflighting
  42  * - UTF-16 handling
  43  */
  44
  45 #include "unicode/utypes.h"
  46
  47 #if !UCONFIG_NO_IDNA
  48
  49 #include "ustr_imp.h"
  50 #include "cstring.h"
  51 #include "cmemory.h"
  52 #include "punycode.h"
  53 #include "unicode/ustring.h"
  54
  55
  56 /* Punycode ----------------------------------------------------------------- */
  57
  58 /* Punycode parameters for Bootstring */
  59 #define BASE            36
  60 #define TMIN            1
  61 #define TMAX            26
  62 #define SKEW            38
  63 #define DAMP            700
  64 #define INITIAL_BIAS    72
  65 #define INITIAL_N       0x80
  66
  67 /* "Basic" Unicode/ASCII code points */
  68 #define _HYPHEN         0X2d
  69 #define DELIMITER       _HYPHEN
  70
  71 #define _ZERO_          0X30
  72 #define _NINE           0x39
  73
  74 #define _SMALL_A        0X61
  75 #define _SMALL_Z        0X7a
  76
  77 #define _CAPITAL_A      0X41
  78 #define _CAPITAL_Z      0X5a
  79
  80 #define IS_BASIC(c) ((c)<0x80)
  81 #define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
  82
  83 /**
  84  * digitToBasic() returns the basic code point whose value
  85  * (when used for representing integers) is d, which must be in the
  86  * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
  87  * nonzero, in which case the uppercase form is used.
  88  */
  89 static U_INLINE char
  90 digitToBasic(int32_t digit, UBool uppercase) {
  91     /*  0..25 map to ASCII a..z or A..Z */
  92     /* 26..35 map to ASCII 0..9         */
  93     if(digit<26) {
  94         if(uppercase) {
  95             return (char)(_CAPITAL_A+digit);
  96         } else {
  97             return (char)(_SMALL_A+digit);
  98         }
  99     } else {
 100         return (char)((_ZERO_-26)+digit);
 101     }
 102 }
 103
 104 /**
 105  * basicToDigit[] contains the numeric value of a basic code
 106  * point (for use in representing integers) in the range 0 to
 107  * BASE-1, or -1 if b is does not represent a value.
 108  */
 109 static const int8_t
 110 basicToDigit[256]={
 111     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 112     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 113
 114     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 115     26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
 116
 117     -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
 118     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
 119
 120     -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
 121     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
 122
 123     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 124     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 125
 126     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 127     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 128
 129     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 130     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 131
 132     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 133     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
 134 };
 135
 136 static U_INLINE char
 137 asciiCaseMap(char b, UBool uppercase) {
 138     if(uppercase) {
 139         if(_SMALL_A<=b && b<=_SMALL_Z) {
 140             b-=(_SMALL_A-_CAPITAL_A);
 141         }
 142     } else {
 143         if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
 144             b+=(_SMALL_A-_CAPITAL_A);
 145         }
 146     }
 147     return b;
 148 }
 149
 150 /* Punycode-specific Bootstring code ---------------------------------------- */
 151
 152 /*
 153  * The following code omits the {parts} of the pseudo-algorithm in the spec
 154  * that are not used with the Punycode parameter set.
 155  */
 156
 157 /* Bias adaptation function. */
 158 static int32_t
 159 adaptBias(int32_t delta, int32_t length, UBool firstTime) {
 160     int32_t count;
 161
 162     if(firstTime) {
 163         delta/=DAMP;
 164     } else {
 165         delta/=2;
 166     }
 167
 168     delta+=delta/length;
 169     for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
 170         delta/=(BASE-TMIN);
 171     }
 172
 173     return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
 174 }
 175
 176 #define MAX_CP_COUNT    200
 177
 178 U_CFUNC int32_t
 179 u_strToPunycode(const UChar *src, int32_t srcLength,
 180                 UChar *dest, int32_t destCapacity,
 181                 const UBool *caseFlags,
 182                 UErrorCode *pErrorCode) {
 183
 184     int32_t cpBuffer[MAX_CP_COUNT];
 185     int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
 186     UChar c, c2;
 187
 188     /* argument checking */
 189     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 190         return 0;
 191     }
 192
 193     if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
 194         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 195         return 0;
 196     }
 197
 198     /*
 199      * Handle the basic code points and
 200      * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
 201      */
 202     srcCPCount=destLength=0;
 203     if(srcLength==-1) {
 204         /* NUL-terminated input */
 205         for(j=0; /* no condition */; ++j) {
 206             if((c=src[j])==0) {
 207                 break;
 208             }
 209             if(srcCPCount==MAX_CP_COUNT) {
 210                 /* too many input code points */
 211                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 212                 return 0;
 213             }
 214             if(IS_BASIC(c)) {
 215                 cpBuffer[srcCPCount++]=0;
 216                 if(destLength<destCapacity) {
 217                     dest[destLength]=
 218                         caseFlags!=NULL ?
 219                             asciiCaseMap((char)c, caseFlags[j]) :
 220                             (char)c;
 221                 }
 222                 ++destLength;
 223             } else {
 224                 n=(caseFlags!=NULL && caseFlags[j])<<31L;
 225                 if(UTF_IS_SINGLE(c)) {
 226                     n|=c;
 227                 } else if(UTF_IS_LEAD(c) && UTF_IS_TRAIL(c2=src[j+1])) {
 228                     ++j;
 229                     n|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2);
 230                 } else {
 231                     /* error: unmatched surrogate */
 232                     *pErrorCode=U_INVALID_CHAR_FOUND;
 233                     return 0;
 234                 }
 235                 cpBuffer[srcCPCount++]=n;
 236             }
 237         }
 238     } else {
 239         /* length-specified input */
 240         for(j=0; j<srcLength; ++j) {
 241             if(srcCPCount==MAX_CP_COUNT) {
 242                 /* too many input code points */
 243                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 244                 return 0;
 245             }
 246             c=src[j];
 247             if(IS_BASIC(c)) {
 248                 if(destLength<destCapacity) {
 249                     cpBuffer[srcCPCount++]=0;
 250                     dest[destLength]=
 251                         caseFlags!=NULL ?
 252                             asciiCaseMap((char)c, caseFlags[j]) :
 253                             (char)c;
 254                 }
 255                 ++destLength;
 256             } else {
 257                 n=(caseFlags!=NULL && caseFlags[j])<<31L;
 258                 if(UTF_IS_SINGLE(c)) {
 259                     n|=c;
 260                 } else if(UTF_IS_LEAD(c) && (j+1)<srcLength && UTF_IS_TRAIL(c2=src[j+1])) {
 261                     ++j;
 262                     n|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2);
 263                 } else {
 264                     /* error: unmatched surrogate */
 265                     *pErrorCode=U_INVALID_CHAR_FOUND;
 266                     return 0;
 267                 }
 268                 cpBuffer[srcCPCount++]=n;
 269             }
 270         }
 271     }
 272
 273     /* Finish the basic string - if it is not empty - with a delimiter. */
 274     basicLength=destLength;
 275     if(basicLength>0) {
 276         if(destLength<destCapacity) {
 277             dest[destLength]=DELIMITER;
 278         }
 279         ++destLength;
 280     }
 281
 282     /*
 283      * handledCPCount is the number of code points that have been handled
 284      * basicLength is the number of basic code points
 285      * destLength is the number of chars that have been output
 286      */
 287
 288     /* Initialize the state: */
 289     n=INITIAL_N;
 290     delta=0;
 291     bias=INITIAL_BIAS;
 292
 293     /* Main encoding loop: */
 294     for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
 295         /*
 296          * All non-basic code points < n have been handled already.
 297          * Find the next larger one:
 298          */
 299         for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
 300             q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
 301             if(n<=q && q<m) {
 302                 m=q;
 303             }
 304         }
 305
 306         /*
 307          * Increase delta enough to advance the decoder's
 308          * <n,i> state to <m,0>, but guard against overflow:
 309          */
 310         if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
 311             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
 312             return 0;
 313         }
 314         delta+=(m-n)*(handledCPCount+1);
 315         n=m;
 316
 317         /* Encode a sequence of same code points n */
 318         for(j=0; j<srcCPCount; ++j) {
 319             q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
 320             if(q<n) {
 321                 ++delta;
 322             } else if(q==n) {
 323                 /* Represent delta as a generalized variable-length integer: */
 324                 for(q=delta, k=BASE; /* no condition */; k+=BASE) {
 325
 326                     /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
 327
 328                     t=k-bias;
 329                     if(t<TMIN) {
 330                         t=TMIN;
 331                     } else if(t>TMAX) {
 332                         t=TMAX;
 333                     }
 334                     */
 335
 336                     t=k-bias;
 337                     if(t<TMIN) {
 338                         t=TMIN;
 339                     } else if(k>=(bias+TMAX)) {
 340                         t=TMAX;
 341                     }
 342
 343                     if(q<t) {
 344                         break;
 345                     }
 346
 347                     if(destLength<destCapacity) {
 348                         dest[destLength++]=digitToBasic(t+(q-t)%(BASE-t), 0);
 349                     }
 350                     q=(q-t)/(BASE-t);
 351                 }
 352
 353                 if(destLength<destCapacity) {
 354                     dest[destLength++]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
 355                 }
 356                 bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
 357                 delta=0;
 358                 ++handledCPCount;
 359             }
 360         }
 361
 362         ++delta;
 363         ++n;
 364     }
 365
 366     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
 367 }
 368
 369 U_CFUNC int32_t
 370 u_strFromPunycode(const UChar *src, int32_t srcLength,
 371                   UChar *dest, int32_t destCapacity,
 372                   UBool *caseFlags,
 373                   UErrorCode *pErrorCode) {
 374     int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
 375             destCPCount, firstSupplementaryIndex, cpLength;
 376     UChar b;
 377
 378     /* argument checking */
 379     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 380         return 0;
 381     }
 382
 383     if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
 384         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 385         return 0;
 386     }
 387
 388     if(srcLength==-1) {
 389         srcLength=u_strlen(src);
 390     }
 391
 392     /*
 393      * Handle the basic code points:
 394      * Let basicLength be the number of input code points
 395      * before the last delimiter, or 0 if there is none,
 396      * then copy the first basicLength code points to the output.
 397      *
 398      * The two following loops iterate backward.
 399      */
 400     for(j=srcLength; j>0;) {
 401         if(src[--j]==DELIMITER) {
 402             break;
 403         }
 404     }
 405     destLength=basicLength=destCPCount=j;
 406
 407     while(j>0) {
 408         b=src[--j];
 409         if(!IS_BASIC(b)) {
 410             *pErrorCode=U_INVALID_CHAR_FOUND;
 411             return 0;
 412         }
 413
 414         if(j<destCapacity) {
 415             dest[j]=(UChar)b;
 416
 417             if(caseFlags!=NULL) {
 418                 caseFlags[j]=IS_BASIC_UPPERCASE(b);
 419             }
 420         }
 421     }
 422
 423     /* Initialize the state: */
 424     n=INITIAL_N;
 425     i=0;
 426     bias=INITIAL_BIAS;
 427     firstSupplementaryIndex=1000000000;
 428
 429     /*
 430      * Main decoding loop:
 431      * Start just after the last delimiter if any
 432      * basic code points were copied; start at the beginning otherwise.
 433      */
 434     for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
 435         /*
 436          * in is the index of the next character to be consumed, and
 437          * destCPCount is the number of code points in the output array.
 438          *
 439          * Decode a generalized variable-length integer into delta,
 440          * which gets added to i.  The overflow checking is easier
 441          * if we increase i as we go, then subtract off its starting
 442          * value at the end to obtain delta.
 443          */
 444         for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
 445             if(in>=srcLength) {
 446                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 447                 return 0;
 448             }
 449
 450             digit=basicToDigit[(uint8_t)src[in++]];
 451             if(digit<0) {
 452                 *pErrorCode=U_INVALID_CHAR_FOUND;
 453                 return 0;
 454             }
 455             if(digit>(0x7fffffff-i)/w) {
 456                 /* integer overflow */
 457                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 458                 return 0;
 459             }
 460
 461             i+=digit*w;
 462             /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
 463             t=k-bias;
 464             if(t<TMIN) {
 465                 t=TMIN;
 466             } else if(t>TMAX) {
 467                 t=TMAX;
 468             }
 469             */
 470             t=k-bias;
 471             if(t<TMIN) {
 472                 t=TMIN;
 473             } else if(k>=(bias+TMAX)) {
 474                 t=TMAX;
 475             }
 476             if(digit<t) {
 477                 break;
 478             }
 479
 480             if(w>0x7fffffff/(BASE-t)) {
 481                 /* integer overflow */
 482                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 483                 return 0;
 484             }
 485             w*=BASE-t;
 486         }
 487
 488         /*
 489          * Modification from sample code:
 490          * Increments destCPCount here,
 491          * where needed instead of in for() loop tail.
 492          */
 493         ++destCPCount;
 494         bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
 495
 496         /*
 497          * i was supposed to wrap around from (incremented) destCPCount to 0,
 498          * incrementing n each time, so we'll fix that now:
 499          */
 500         if(i/destCPCount>(0x7fffffff-n)) {
 501             /* integer overflow */
 502             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 503             return 0;
 504         }
 505
 506         n+=i/destCPCount;
 507         i%=destCPCount;
 508         /* not needed for Punycode: */
 509         /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
 510
 511         if(n>0x10ffff || UTF_IS_SURROGATE(n)) {
 512             /* Unicode code point overflow */
 513             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 514             return 0;
 515         }
 516
 517         /* Insert n at position i of the output: */
 518         cpLength=UTF_CHAR_LENGTH(n);
 519         if((destLength+cpLength)<destCapacity) {
 520             int32_t codeUnitIndex;
 521
 522             /*
 523              * Handle indexes when supplementary code points are present.
 524              *
 525              * In almost all cases, there will be only BMP code points before i
 526              * and even in the entire string.
 527              * This is handled with the same efficiency as with UTF-32.
 528              *
 529              * Only the rare cases with supplementary code points are handled
 530              * more slowly - but not too bad since this is an insertion anyway.
 531              */
 532             if(i<=firstSupplementaryIndex) {
 533                 codeUnitIndex=i;
 534                 if(cpLength>1) {
 535                     firstSupplementaryIndex=codeUnitIndex;
 536                 } else {
 537                     ++firstSupplementaryIndex;
 538                 }
 539             } else {
 540                 codeUnitIndex=firstSupplementaryIndex;
 541                 UTF_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
 542             }
 543
 544             /* use the UChar index codeUnitIndex instead of the code point index i */
 545             if(codeUnitIndex<destLength) {
 546                 uprv_memmove(dest+codeUnitIndex+cpLength,
 547                              dest+codeUnitIndex,
 548                              (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
 549                 if(caseFlags!=NULL) {
 550                     uprv_memmove(caseFlags+codeUnitIndex+cpLength,
 551                                  caseFlags+codeUnitIndex,
 552                                  destLength-codeUnitIndex);
 553                 }
 554             }
 555             if(cpLength==1) {
 556                 /* BMP, insert one code unit */
 557                 dest[codeUnitIndex]=(UChar)n;
 558             } else {
 559                 /* supplementary character, insert two code units */
 560                 dest[codeUnitIndex]=UTF16_LEAD(n);
 561                 dest[codeUnitIndex+1]=UTF16_TRAIL(n);
 562             }
 563             if(caseFlags!=NULL) {
 564                 /* Case of last character determines uppercase flag: */
 565                 caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
 566                 if(cpLength==2) {
 567                     caseFlags[codeUnitIndex+1]=FALSE;
 568                 }
 569             }
 570         }
 571         destLength+=cpLength;
 572         ++i;
 573     }
 574
 575     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
 576 }
 577
 578 /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
 579
 580 #endif /* #if !UCONFIG_NO_IDNA */