icuSources/common/loclikely.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 *
   6 *   Copyright (C) 1997-2016, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 *******************************************************************************
  10 *   file name:  loclikely.cpp
  11 *   encoding:   UTF-8
  12 *   tab size:   8 (not used)
  13 *   indentation:4
  14 *
  15 *   created on: 2010feb25
  16 *   created by: Markus W. Scherer
  17 *
  18 *   Code for likely and minimized locale subtags, separated out from other .cpp files
  19 *   that then do not depend on resource bundle code and likely-subtags data.
  20 */
  21
  22 #include "unicode/utypes.h"
  23 #include "unicode/locid.h"
  24 #include "unicode/putil.h"
  25 #include "unicode/uchar.h"
  26 #include "unicode/uloc.h"
  27 #include "unicode/ures.h"
  28 #include "unicode/uscript.h"
  29 #include "cmemory.h"
  30 #include "cstring.h"
  31 #include "ulocimp.h"
  32 #include "ustr_imp.h"
  33
  34 /**
  35  * This function looks for the localeID in the likelySubtags resource.
  36  *
  37  * @param localeID The tag to find.
  38  * @param buffer A buffer to hold the matching entry
  39  * @param bufferLength The length of the output buffer
  40  * @return A pointer to "buffer" if found, or a null pointer if not.
  41  */
  42 static const char*  U_CALLCONV
  43 findLikelySubtags(const char* localeID,
  44                   char* buffer,
  45                   int32_t bufferLength,
  46                   UErrorCode* err) {
  47     const char* result = NULL;
  48
  49     if (!U_FAILURE(*err)) {
  50         int32_t resLen = 0;
  51         const UChar* s = NULL;
  52         UErrorCode tmpErr = U_ZERO_ERROR;
  53         UResourceBundle* subtags = ures_openDirect(NULL, "likelySubtags", &tmpErr);
  54         if (U_SUCCESS(tmpErr)) {
  55             s = ures_getStringByKey(subtags, localeID, &resLen, &tmpErr);
  56
  57             if (U_FAILURE(tmpErr)) {
  58                 /*
  59                  * If a resource is missing, it's not really an error, it's
  60                  * just that we don't have any data for that particular locale ID.
  61                  */
  62                 if (tmpErr != U_MISSING_RESOURCE_ERROR) {
  63                     *err = tmpErr;
  64                 }
  65             }
  66             else if (resLen >= bufferLength) {
  67                 /* The buffer should never overflow. */
  68                 *err = U_INTERNAL_PROGRAM_ERROR;
  69             }
  70             else {
  71                 u_UCharsToChars(s, buffer, resLen + 1);
  72                 result = buffer;
  73             }
  74
  75             ures_close(subtags);
  76         } else {
  77             *err = tmpErr;
  78         }
  79     }
  80
  81     return result;
  82 }
  83
  84 /**
  85  * Append a tag to a buffer, adding the separator if necessary.  The buffer
  86  * must be large enough to contain the resulting tag plus any separator
  87  * necessary. The tag must not be a zero-length string.
  88  *
  89  * @param tag The tag to add.
  90  * @param tagLength The length of the tag.
  91  * @param buffer The output buffer.
  92  * @param bufferLength The length of the output buffer.  This is an input/ouput parameter.
  93  **/
  94 static void U_CALLCONV
  95 appendTag(
  96     const char* tag,
  97     int32_t tagLength,
  98     char* buffer,
  99     int32_t* bufferLength) {
 100
 101     if (*bufferLength > 0) {
 102         buffer[*bufferLength] = '_';
 103         ++(*bufferLength);
 104     }
 105
 106     uprv_memmove(
 107         &buffer[*bufferLength],
 108         tag,
 109         tagLength);
 110
 111     *bufferLength += tagLength;
 112 }
 113
 114 /**
 115  * These are the canonical strings for unknown languages, scripts and regions.
 116  **/
 117 static const char* const unknownLanguage = "und";
 118 static const char* const unknownScript = "Zzzz";
 119 static const char* const unknownRegion = "ZZ";
 120
 121 /**
 122  * Create a tag string from the supplied parameters.  The lang, script and region
 123  * parameters may be NULL pointers. If they are, their corresponding length parameters
 124  * must be less than or equal to 0.
 125  *
 126  * If any of the language, script or region parameters are empty, and the alternateTags
 127  * parameter is not NULL, it will be parsed for potential language, script and region tags
 128  * to be used when constructing the new tag.  If the alternateTags parameter is NULL, or
 129  * it contains no language tag, the default tag for the unknown language is used.
 130  *
 131  * If the length of the new string exceeds the capacity of the output buffer,
 132  * the function copies as many bytes to the output buffer as it can, and returns
 133  * the error U_BUFFER_OVERFLOW_ERROR.
 134  *
 135  * If an illegal argument is provided, the function returns the error
 136  * U_ILLEGAL_ARGUMENT_ERROR.
 137  *
 138  * Note that this function can return the warning U_STRING_NOT_TERMINATED_WARNING if
 139  * the tag string fits in the output buffer, but the null terminator doesn't.
 140  *
 141  * @param lang The language tag to use.
 142  * @param langLength The length of the language tag.
 143  * @param script The script tag to use.
 144  * @param scriptLength The length of the script tag.
 145  * @param region The region tag to use.
 146  * @param regionLength The length of the region tag.
 147  * @param trailing Any trailing data to append to the new tag.
 148  * @param trailingLength The length of the trailing data.
 149  * @param alternateTags A string containing any alternate tags.
 150  * @param tag The output buffer.
 151  * @param tagCapacity The capacity of the output buffer.
 152  * @param err A pointer to a UErrorCode for error reporting.
 153  * @return The length of the tag string, which may be greater than tagCapacity, or -1 on error.
 154  **/
 155 static int32_t U_CALLCONV
 156 createTagStringWithAlternates(
 157     const char* lang,
 158     int32_t langLength,
 159     const char* script,
 160     int32_t scriptLength,
 161     const char* region,
 162     int32_t regionLength,
 163     const char* trailing,
 164     int32_t trailingLength,
 165     const char* alternateTags,
 166     char* tag,
 167     int32_t tagCapacity,
 168     UErrorCode* err) {
 169
 170     if (U_FAILURE(*err)) {
 171         goto error;
 172     }
 173     else if (tag == NULL ||
 174              tagCapacity <= 0 ||
 175              langLength >= ULOC_LANG_CAPACITY ||
 176              scriptLength >= ULOC_SCRIPT_CAPACITY ||
 177              regionLength >= ULOC_COUNTRY_CAPACITY) {
 178         goto error;
 179     }
 180     else {
 181         /**
 182          * ULOC_FULLNAME_CAPACITY will provide enough capacity
 183          * that we can build a string that contains the language,
 184          * script and region code without worrying about overrunning
 185          * the user-supplied buffer.
 186          **/
 187         char tagBuffer[ULOC_FULLNAME_CAPACITY];
 188         int32_t tagLength = 0;
 189         int32_t capacityRemaining = tagCapacity;
 190         UBool regionAppended = FALSE;
 191
 192         if (langLength > 0) {
 193             appendTag(
 194                 lang,
 195                 langLength,
 196                 tagBuffer,
 197                 &tagLength);
 198         }
 199         else if (alternateTags == NULL) {
 200             /*
 201              * Append the value for an unknown language, if
 202              * we found no language.
 203              */
 204             appendTag(
 205                 unknownLanguage,
 206                 (int32_t)uprv_strlen(unknownLanguage),
 207                 tagBuffer,
 208                 &tagLength);
 209         }
 210         else {
 211             /*
 212              * Parse the alternateTags string for the language.
 213              */
 214             char alternateLang[ULOC_LANG_CAPACITY];
 215             int32_t alternateLangLength = sizeof(alternateLang);
 216
 217             alternateLangLength =
 218                 uloc_getLanguage(
 219                     alternateTags,
 220                     alternateLang,
 221                     alternateLangLength,
 222                     err);
 223             if(U_FAILURE(*err) ||
 224                 alternateLangLength >= ULOC_LANG_CAPACITY) {
 225                 goto error;
 226             }
 227             else if (alternateLangLength == 0) {
 228                 /*
 229                  * Append the value for an unknown language, if
 230                  * we found no language.
 231                  */
 232                 appendTag(
 233                     unknownLanguage,
 234                     (int32_t)uprv_strlen(unknownLanguage),
 235                     tagBuffer,
 236                     &tagLength);
 237             }
 238             else {
 239                 appendTag(
 240                     alternateLang,
 241                     alternateLangLength,
 242                     tagBuffer,
 243                     &tagLength);
 244             }
 245         }
 246
 247         if (scriptLength > 0) {
 248             appendTag(
 249                 script,
 250                 scriptLength,
 251                 tagBuffer,
 252                 &tagLength);
 253         }
 254         else if (alternateTags != NULL) {
 255             /*
 256              * Parse the alternateTags string for the script.
 257              */
 258             char alternateScript[ULOC_SCRIPT_CAPACITY];
 259
 260             const int32_t alternateScriptLength =
 261                 uloc_getScript(
 262                     alternateTags,
 263                     alternateScript,
 264                     sizeof(alternateScript),
 265                     err);
 266
 267             if (U_FAILURE(*err) ||
 268                 alternateScriptLength >= ULOC_SCRIPT_CAPACITY) {
 269                 goto error;
 270             }
 271             else if (alternateScriptLength > 0) {
 272                 appendTag(
 273                     alternateScript,
 274                     alternateScriptLength,
 275                     tagBuffer,
 276                     &tagLength);
 277             }
 278         }
 279
 280         if (regionLength > 0) {
 281             appendTag(
 282                 region,
 283                 regionLength,
 284                 tagBuffer,
 285                 &tagLength);
 286
 287             regionAppended = TRUE;
 288         }
 289         else if (alternateTags != NULL) {
 290             /*
 291              * Parse the alternateTags string for the region.
 292              */
 293             char alternateRegion[ULOC_COUNTRY_CAPACITY];
 294
 295             const int32_t alternateRegionLength =
 296                 uloc_getCountry(
 297                     alternateTags,
 298                     alternateRegion,
 299                     sizeof(alternateRegion),
 300                     err);
 301             if (U_FAILURE(*err) ||
 302                 alternateRegionLength >= ULOC_COUNTRY_CAPACITY) {
 303                 goto error;
 304             }
 305             else if (alternateRegionLength > 0) {
 306                 appendTag(
 307                     alternateRegion,
 308                     alternateRegionLength,
 309                     tagBuffer,
 310                     &tagLength);
 311
 312                 regionAppended = TRUE;
 313             }
 314         }
 315
 316         {
 317             const int32_t toCopy =
 318                 tagLength >= tagCapacity ? tagCapacity : tagLength;
 319
 320             /**
 321              * Copy the partial tag from our internal buffer to the supplied
 322              * target.
 323              **/
 324             uprv_memcpy(
 325                 tag,
 326                 tagBuffer,
 327                 toCopy);
 328
 329             capacityRemaining -= toCopy;
 330         }
 331
 332         if (trailingLength > 0) {
 333             if (*trailing != '@' && capacityRemaining > 0) {
 334                 tag[tagLength++] = '_';
 335                 --capacityRemaining;
 336                 if (capacityRemaining > 0 && !regionAppended) {
 337                     /* extra separator is required */
 338                     tag[tagLength++] = '_';
 339                     --capacityRemaining;
 340                 }
 341             }
 342
 343             if (capacityRemaining > 0) {
 344                 /*
 345                  * Copy the trailing data into the supplied buffer.  Use uprv_memmove, since we
 346                  * don't know if the user-supplied buffers overlap.
 347                  */
 348                 const int32_t toCopy =
 349                     trailingLength >= capacityRemaining ? capacityRemaining : trailingLength;
 350
 351                 uprv_memmove(
 352                     &tag[tagLength],
 353                     trailing,
 354                     toCopy);
 355             }
 356         }
 357
 358         tagLength += trailingLength;
 359
 360         return u_terminateChars(
 361                     tag,
 362                     tagCapacity,
 363                     tagLength,
 364                     err);
 365     }
 366
 367 error:
 368
 369     /**
 370      * An overflow indicates the locale ID passed in
 371      * is ill-formed.  If we got here, and there was
 372      * no previous error, it's an implicit overflow.
 373      **/
 374     if (*err ==  U_BUFFER_OVERFLOW_ERROR ||
 375         U_SUCCESS(*err)) {
 376         *err = U_ILLEGAL_ARGUMENT_ERROR;
 377     }
 378
 379     return -1;
 380 }
 381
 382 /**
 383  * Create a tag string from the supplied parameters.  The lang, script and region
 384  * parameters may be NULL pointers. If they are, their corresponding length parameters
 385  * must be less than or equal to 0.  If the lang parameter is an empty string, the
 386  * default value for an unknown language is written to the output buffer.
 387  *
 388  * If the length of the new string exceeds the capacity of the output buffer,
 389  * the function copies as many bytes to the output buffer as it can, and returns
 390  * the error U_BUFFER_OVERFLOW_ERROR.
 391  *
 392  * If an illegal argument is provided, the function returns the error
 393  * U_ILLEGAL_ARGUMENT_ERROR.
 394  *
 395  * @param lang The language tag to use.
 396  * @param langLength The length of the language tag.
 397  * @param script The script tag to use.
 398  * @param scriptLength The length of the script tag.
 399  * @param region The region tag to use.
 400  * @param regionLength The length of the region tag.
 401  * @param trailing Any trailing data to append to the new tag.
 402  * @param trailingLength The length of the trailing data.
 403  * @param tag The output buffer.
 404  * @param tagCapacity The capacity of the output buffer.
 405  * @param err A pointer to a UErrorCode for error reporting.
 406  * @return The length of the tag string, which may be greater than tagCapacity.
 407  **/
 408 static int32_t U_CALLCONV
 409 createTagString(
 410     const char* lang,
 411     int32_t langLength,
 412     const char* script,
 413     int32_t scriptLength,
 414     const char* region,
 415     int32_t regionLength,
 416     const char* trailing,
 417     int32_t trailingLength,
 418     char* tag,
 419     int32_t tagCapacity,
 420     UErrorCode* err)
 421 {
 422     return createTagStringWithAlternates(
 423                 lang,
 424                 langLength,
 425                 script,
 426                 scriptLength,
 427                 region,
 428                 regionLength,
 429                 trailing,
 430                 trailingLength,
 431                 NULL,
 432                 tag,
 433                 tagCapacity,
 434                 err);
 435 }
 436
 437 /**
 438  * Parse the language, script, and region subtags from a tag string, and copy the
 439  * results into the corresponding output parameters. The buffers are null-terminated,
 440  * unless overflow occurs.
 441  *
 442  * The langLength, scriptLength, and regionLength parameters are input/output
 443  * parameters, and must contain the capacity of their corresponding buffers on
 444  * input.  On output, they will contain the actual length of the buffers, not
 445  * including the null terminator.
 446  *
 447  * If the length of any of the output subtags exceeds the capacity of the corresponding
 448  * buffer, the function copies as many bytes to the output buffer as it can, and returns
 449  * the error U_BUFFER_OVERFLOW_ERROR.  It will not parse any more subtags once overflow
 450  * occurs.
 451  *
 452  * If an illegal argument is provided, the function returns the error
 453  * U_ILLEGAL_ARGUMENT_ERROR.
 454  *
 455  * @param localeID The locale ID to parse.
 456  * @param lang The language tag buffer.
 457  * @param langLength The length of the language tag.
 458  * @param script The script tag buffer.
 459  * @param scriptLength The length of the script tag.
 460  * @param region The region tag buffer.
 461  * @param regionLength The length of the region tag.
 462  * @param err A pointer to a UErrorCode for error reporting.
 463  * @return The number of chars of the localeID parameter consumed.
 464  **/
 465 static int32_t U_CALLCONV
 466 parseTagString(
 467     const char* localeID,
 468     char* lang,
 469     int32_t* langLength,
 470     char* script,
 471     int32_t* scriptLength,
 472     char* region,
 473     int32_t* regionLength,
 474     UErrorCode* err)
 475 {
 476     const char* position = localeID;
 477     int32_t subtagLength = 0;
 478
 479     if(U_FAILURE(*err) ||
 480        localeID == NULL ||
 481        lang == NULL ||
 482        langLength == NULL ||
 483        script == NULL ||
 484        scriptLength == NULL ||
 485        region == NULL ||
 486        regionLength == NULL) {
 487         goto error;
 488     }
 489
 490     subtagLength = ulocimp_getLanguage(position, lang, *langLength, &position);
 491     u_terminateChars(lang, *langLength, subtagLength, err);
 492
 493     /*
 494      * Note that we explicit consider U_STRING_NOT_TERMINATED_WARNING
 495      * to be an error, because it indicates the user-supplied tag is
 496      * not well-formed.
 497      */
 498     if(U_FAILURE(*err)) {
 499         goto error;
 500     }
 501
 502     *langLength = subtagLength;
 503
 504     /*
 505      * If no language was present, use the value of unknownLanguage
 506      * instead.  Otherwise, move past any separator.
 507      */
 508     if (*langLength == 0) {
 509         uprv_strcpy(
 510             lang,
 511             unknownLanguage);
 512         *langLength = (int32_t)uprv_strlen(lang);
 513     }
 514     else if (_isIDSeparator(*position)) {
 515         ++position;
 516     }
 517
 518     subtagLength = ulocimp_getScript(position, script, *scriptLength, &position);
 519     u_terminateChars(script, *scriptLength, subtagLength, err);
 520
 521     if(U_FAILURE(*err)) {
 522         goto error;
 523     }
 524
 525     *scriptLength = subtagLength;
 526
 527     if (*scriptLength > 0) {
 528         if (uprv_strnicmp(script, unknownScript, *scriptLength) == 0) {
 529             /**
 530              * If the script part is the "unknown" script, then don't return it.
 531              **/
 532             *scriptLength = 0;
 533         }
 534
 535         /*
 536          * Move past any separator.
 537          */
 538         if (_isIDSeparator(*position)) {
 539             ++position;
 540         }
 541     }
 542
 543     subtagLength = ulocimp_getCountry(position, region, *regionLength, &position);
 544     u_terminateChars(region, *regionLength, subtagLength, err);
 545
 546     if(U_FAILURE(*err)) {
 547         goto error;
 548     }
 549
 550     *regionLength = subtagLength;
 551
 552     if (*regionLength > 0) {
 553         if (uprv_strnicmp(region, unknownRegion, *regionLength) == 0) {
 554             /**
 555              * If the region part is the "unknown" region, then don't return it.
 556              **/
 557             *regionLength = 0;
 558         }
 559     } else if (*position != 0 && *position != '@') {
 560         /* back up over consumed trailing separator */
 561         --position;
 562     }
 563
 564 exit:
 565
 566     return (int32_t)(position - localeID);
 567
 568 error:
 569
 570     /**
 571      * If we get here, we have no explicit error, it's the result of an
 572      * illegal argument.
 573      **/
 574     if (!U_FAILURE(*err)) {
 575         *err = U_ILLEGAL_ARGUMENT_ERROR;
 576     }
 577
 578     goto exit;
 579 }
 580
 581 static int32_t U_CALLCONV
 582 createLikelySubtagsString(
 583     const char* lang,
 584     int32_t langLength,
 585     const char* script,
 586     int32_t scriptLength,
 587     const char* region,
 588     int32_t regionLength,
 589     const char* variants,
 590     int32_t variantsLength,
 591     char* tag,
 592     int32_t tagCapacity,
 593     UErrorCode* err)
 594 {
 595     /**
 596      * ULOC_FULLNAME_CAPACITY will provide enough capacity
 597      * that we can build a string that contains the language,
 598      * script and region code without worrying about overrunning
 599      * the user-supplied buffer.
 600      **/
 601     char tagBuffer[ULOC_FULLNAME_CAPACITY];
 602     char likelySubtagsBuffer[ULOC_FULLNAME_CAPACITY];
 603
 604     if(U_FAILURE(*err)) {
 605         goto error;
 606     }
 607
 608     /**
 609      * Try the language with the script and region first.
 610      **/
 611     if (scriptLength > 0 && regionLength > 0) {
 612
 613         const char* likelySubtags = NULL;
 614
 615         createTagString(
 616             lang,
 617             langLength,
 618             script,
 619             scriptLength,
 620             region,
 621             regionLength,
 622             NULL,
 623             0,
 624             tagBuffer,
 625             sizeof(tagBuffer),
 626             err);
 627         if(U_FAILURE(*err)) {
 628             goto error;
 629         }
 630
 631         likelySubtags =
 632             findLikelySubtags(
 633                 tagBuffer,
 634                 likelySubtagsBuffer,
 635                 sizeof(likelySubtagsBuffer),
 636                 err);
 637         if(U_FAILURE(*err)) {
 638             goto error;
 639         }
 640
 641         if (likelySubtags != NULL) {
 642             /* Always use the language tag from the
 643                maximal string, since it may be more
 644                specific than the one provided. */
 645             return createTagStringWithAlternates(
 646                         NULL,
 647                         0,
 648                         NULL,
 649                         0,
 650                         NULL,
 651                         0,
 652                         variants,
 653                         variantsLength,
 654                         likelySubtags,
 655                         tag,
 656                         tagCapacity,
 657                         err);
 658         }
 659     }
 660
 661     /**
 662      * Try the language with just the script.
 663      **/
 664     if (scriptLength > 0) {
 665
 666         const char* likelySubtags = NULL;
 667
 668         createTagString(
 669             lang,
 670             langLength,
 671             script,
 672             scriptLength,
 673             NULL,
 674             0,
 675             NULL,
 676             0,
 677             tagBuffer,
 678             sizeof(tagBuffer),
 679             err);
 680         if(U_FAILURE(*err)) {
 681             goto error;
 682         }
 683
 684         likelySubtags =
 685             findLikelySubtags(
 686                 tagBuffer,
 687                 likelySubtagsBuffer,
 688                 sizeof(likelySubtagsBuffer),
 689                 err);
 690         if(U_FAILURE(*err)) {
 691             goto error;
 692         }
 693
 694         if (likelySubtags != NULL) {
 695             /* Always use the language tag from the
 696                maximal string, since it may be more
 697                specific than the one provided. */
 698             return createTagStringWithAlternates(
 699                         NULL,
 700                         0,
 701                         NULL,
 702                         0,
 703                         region,
 704                         regionLength,
 705                         variants,
 706                         variantsLength,
 707                         likelySubtags,
 708                         tag,
 709                         tagCapacity,
 710                         err);
 711         }
 712     }
 713
 714     /**
 715      * Try the language with just the region.
 716      **/
 717     if (regionLength > 0) {
 718
 719         const char* likelySubtags = NULL;
 720
 721         createTagString(
 722             lang,
 723             langLength,
 724             NULL,
 725             0,
 726             region,
 727             regionLength,
 728             NULL,
 729             0,
 730             tagBuffer,
 731             sizeof(tagBuffer),
 732             err);
 733         if(U_FAILURE(*err)) {
 734             goto error;
 735         }
 736
 737         likelySubtags =
 738             findLikelySubtags(
 739                 tagBuffer,
 740                 likelySubtagsBuffer,
 741                 sizeof(likelySubtagsBuffer),
 742                 err);
 743         if(U_FAILURE(*err)) {
 744             goto error;
 745         }
 746
 747         if (likelySubtags != NULL) {
 748             /* Always use the language tag from the
 749                maximal string, since it may be more
 750                specific than the one provided. */
 751             return createTagStringWithAlternates(
 752                         NULL,
 753                         0,
 754                         script,
 755                         scriptLength,
 756                         NULL,
 757                         0,
 758                         variants,
 759                         variantsLength,
 760                         likelySubtags,
 761                         tag,
 762                         tagCapacity,
 763                         err);
 764         }
 765     }
 766
 767     /**
 768      * Finally, try just the language.
 769      **/
 770     {
 771         const char* likelySubtags = NULL;
 772
 773         createTagString(
 774             lang,
 775             langLength,
 776             NULL,
 777             0,
 778             NULL,
 779             0,
 780             NULL,
 781             0,
 782             tagBuffer,
 783             sizeof(tagBuffer),
 784             err);
 785         if(U_FAILURE(*err)) {
 786             goto error;
 787         }
 788
 789         likelySubtags =
 790             findLikelySubtags(
 791                 tagBuffer,
 792                 likelySubtagsBuffer,
 793                 sizeof(likelySubtagsBuffer),
 794                 err);
 795         if(U_FAILURE(*err)) {
 796             goto error;
 797         }
 798
 799         if (likelySubtags != NULL) {
 800             /* Always use the language tag from the
 801                maximal string, since it may be more
 802                specific than the one provided. */
 803             return createTagStringWithAlternates(
 804                         NULL,
 805                         0,
 806                         script,
 807                         scriptLength,
 808                         region,
 809                         regionLength,
 810                         variants,
 811                         variantsLength,
 812                         likelySubtags,
 813                         tag,
 814                         tagCapacity,
 815                         err);
 816         }
 817     }
 818
 819     return u_terminateChars(
 820                 tag,
 821                 tagCapacity,
 822                 0,
 823                 err);
 824
 825 error:
 826
 827     if (!U_FAILURE(*err)) {
 828         *err = U_ILLEGAL_ARGUMENT_ERROR;
 829     }
 830
 831     return -1;
 832 }
 833
 834 #define CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength) \
 835     {   int32_t count = 0; \
 836         int32_t i; \
 837         for (i = 0; i < trailingLength; i++) { \
 838             if (trailing[i] == '-' || trailing[i] == '_') { \
 839                 count = 0; \
 840                 if (count > 8) { \
 841                     goto error; \
 842                 } \
 843             } else if (trailing[i] == '@') { \
 844                 break; \
 845             } else if (count > 8) { \
 846                 goto error; \
 847             } else { \
 848                 count++; \
 849             } \
 850         } \
 851     }
 852
 853 static int32_t
 854 _uloc_addLikelySubtags(const char*    localeID,
 855          char* maximizedLocaleID,
 856          int32_t maximizedLocaleIDCapacity,
 857          UErrorCode* err)
 858 {
 859     char lang[ULOC_LANG_CAPACITY];
 860     int32_t langLength = sizeof(lang);
 861     char script[ULOC_SCRIPT_CAPACITY];
 862     int32_t scriptLength = sizeof(script);
 863     char region[ULOC_COUNTRY_CAPACITY];
 864     int32_t regionLength = sizeof(region);
 865     const char* trailing = "";
 866     int32_t trailingLength = 0;
 867     int32_t trailingIndex = 0;
 868     int32_t resultLength = 0;
 869
 870     if(U_FAILURE(*err)) {
 871         goto error;
 872     }
 873     else if (localeID == NULL ||
 874              maximizedLocaleID == NULL ||
 875              maximizedLocaleIDCapacity <= 0) {
 876         goto error;
 877     }
 878
 879     trailingIndex = parseTagString(
 880         localeID,
 881         lang,
 882         &langLength,
 883         script,
 884         &scriptLength,
 885         region,
 886         &regionLength,
 887         err);
 888     if(U_FAILURE(*err)) {
 889         /* Overflow indicates an illegal argument error */
 890         if (*err == U_BUFFER_OVERFLOW_ERROR) {
 891             *err = U_ILLEGAL_ARGUMENT_ERROR;
 892         }
 893
 894         goto error;
 895     }
 896
 897     /* Find the length of the trailing portion. */
 898     while (_isIDSeparator(localeID[trailingIndex])) {
 899         trailingIndex++;
 900     }
 901     trailing = &localeID[trailingIndex];
 902     trailingLength = (int32_t)uprv_strlen(trailing);
 903
 904     CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
 905
 906     resultLength =
 907         createLikelySubtagsString(
 908             lang,
 909             langLength,
 910             script,
 911             scriptLength,
 912             region,
 913             regionLength,
 914             trailing,
 915             trailingLength,
 916             maximizedLocaleID,
 917             maximizedLocaleIDCapacity,
 918             err);
 919
 920     if (resultLength == 0) {
 921         const int32_t localIDLength = (int32_t)uprv_strlen(localeID);
 922
 923         /*
 924          * If we get here, we need to return localeID.
 925          */
 926         uprv_memcpy(
 927             maximizedLocaleID,
 928             localeID,
 929             localIDLength <= maximizedLocaleIDCapacity ?
 930                 localIDLength : maximizedLocaleIDCapacity);
 931
 932         resultLength =
 933             u_terminateChars(
 934                 maximizedLocaleID,
 935                 maximizedLocaleIDCapacity,
 936                 localIDLength,
 937                 err);
 938     }
 939
 940     return resultLength;
 941
 942 error:
 943
 944     if (!U_FAILURE(*err)) {
 945         *err = U_ILLEGAL_ARGUMENT_ERROR;
 946     }
 947
 948     return -1;
 949 }
 950
 951 static int32_t
 952 _uloc_minimizeSubtags(const char*    localeID,
 953          char* minimizedLocaleID,
 954          int32_t minimizedLocaleIDCapacity,
 955          UErrorCode* err)
 956 {
 957     /**
 958      * ULOC_FULLNAME_CAPACITY will provide enough capacity
 959      * that we can build a string that contains the language,
 960      * script and region code without worrying about overrunning
 961      * the user-supplied buffer.
 962      **/
 963     char maximizedTagBuffer[ULOC_FULLNAME_CAPACITY];
 964     int32_t maximizedTagBufferLength = sizeof(maximizedTagBuffer);
 965
 966     char lang[ULOC_LANG_CAPACITY];
 967     int32_t langLength = sizeof(lang);
 968     char script[ULOC_SCRIPT_CAPACITY];
 969     int32_t scriptLength = sizeof(script);
 970     char region[ULOC_COUNTRY_CAPACITY];
 971     int32_t regionLength = sizeof(region);
 972     const char* trailing = "";
 973     int32_t trailingLength = 0;
 974     int32_t trailingIndex = 0;
 975
 976     if(U_FAILURE(*err)) {
 977         goto error;
 978     }
 979     else if (localeID == NULL ||
 980              minimizedLocaleID == NULL ||
 981              minimizedLocaleIDCapacity <= 0) {
 982         goto error;
 983     }
 984
 985     trailingIndex =
 986         parseTagString(
 987             localeID,
 988             lang,
 989             &langLength,
 990             script,
 991             &scriptLength,
 992             region,
 993             &regionLength,
 994             err);
 995     if(U_FAILURE(*err)) {
 996
 997         /* Overflow indicates an illegal argument error */
 998         if (*err == U_BUFFER_OVERFLOW_ERROR) {
 999             *err = U_ILLEGAL_ARGUMENT_ERROR;
1000         }
1001
1002         goto error;
1003     }
1004
1005     /* Find the spot where the variants or the keywords begin, if any. */
1006     while (_isIDSeparator(localeID[trailingIndex])) {
1007         trailingIndex++;
1008     }
1009     trailing = &localeID[trailingIndex];
1010     trailingLength = (int32_t)uprv_strlen(trailing);
1011
1012     CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
1013
1014     createTagString(
1015         lang,
1016         langLength,
1017         script,
1018         scriptLength,
1019         region,
1020         regionLength,
1021         NULL,
1022         0,
1023         maximizedTagBuffer,
1024         maximizedTagBufferLength,
1025         err);
1026     if(U_FAILURE(*err)) {
1027         goto error;
1028     }
1029
1030     /**
1031      * First, we need to first get the maximization
1032      * from AddLikelySubtags.
1033      **/
1034     maximizedTagBufferLength =
1035         uloc_addLikelySubtags(
1036             maximizedTagBuffer,
1037             maximizedTagBuffer,
1038             maximizedTagBufferLength,
1039             err);
1040
1041     if(U_FAILURE(*err)) {
1042         goto error;
1043     }
1044
1045     /**
1046      * Start first with just the language.
1047      **/
1048     {
1049         char tagBuffer[ULOC_FULLNAME_CAPACITY];
1050
1051         const int32_t tagBufferLength =
1052             createLikelySubtagsString(
1053                 lang,
1054                 langLength,
1055                 NULL,
1056                 0,
1057                 NULL,
1058                 0,
1059                 NULL,
1060                 0,
1061                 tagBuffer,
1062                 sizeof(tagBuffer),
1063                 err);
1064
1065         if(U_FAILURE(*err)) {
1066             goto error;
1067         }
1068         else if (uprv_strnicmp(
1069                     maximizedTagBuffer,
1070                     tagBuffer,
1071                     tagBufferLength) == 0) {
1072
1073             return createTagString(
1074                         lang,
1075                         langLength,
1076                         NULL,
1077                         0,
1078                         NULL,
1079                         0,
1080                         trailing,
1081                         trailingLength,
1082                         minimizedLocaleID,
1083                         minimizedLocaleIDCapacity,
1084                         err);
1085         }
1086     }
1087
1088     /**
1089      * Next, try the language and region.
1090      **/
1091     if (regionLength > 0) {
1092
1093         char tagBuffer[ULOC_FULLNAME_CAPACITY];
1094
1095         const int32_t tagBufferLength =
1096             createLikelySubtagsString(
1097                 lang,
1098                 langLength,
1099                 NULL,
1100                 0,
1101                 region,
1102                 regionLength,
1103                 NULL,
1104                 0,
1105                 tagBuffer,
1106                 sizeof(tagBuffer),
1107                 err);
1108
1109         if(U_FAILURE(*err)) {
1110             goto error;
1111         }
1112         else if (uprv_strnicmp(
1113                     maximizedTagBuffer,
1114                     tagBuffer,
1115                     tagBufferLength) == 0) {
1116
1117             return createTagString(
1118                         lang,
1119                         langLength,
1120                         NULL,
1121                         0,
1122                         region,
1123                         regionLength,
1124                         trailing,
1125                         trailingLength,
1126                         minimizedLocaleID,
1127                         minimizedLocaleIDCapacity,
1128                         err);
1129         }
1130     }
1131
1132     /**
1133      * Finally, try the language and script.  This is our last chance,
1134      * since trying with all three subtags would only yield the
1135      * maximal version that we already have.
1136      **/
1137     if (scriptLength > 0 && regionLength > 0) {
1138         char tagBuffer[ULOC_FULLNAME_CAPACITY];
1139
1140         const int32_t tagBufferLength =
1141             createLikelySubtagsString(
1142                 lang,
1143                 langLength,
1144                 script,
1145                 scriptLength,
1146                 NULL,
1147                 0,
1148                 NULL,
1149                 0,
1150                 tagBuffer,
1151                 sizeof(tagBuffer),
1152                 err);
1153
1154         if(U_FAILURE(*err)) {
1155             goto error;
1156         }
1157         else if (uprv_strnicmp(
1158                     maximizedTagBuffer,
1159                     tagBuffer,
1160                     tagBufferLength) == 0) {
1161
1162             return createTagString(
1163                         lang,
1164                         langLength,
1165                         script,
1166                         scriptLength,
1167                         NULL,
1168                         0,
1169                         trailing,
1170                         trailingLength,
1171                         minimizedLocaleID,
1172                         minimizedLocaleIDCapacity,
1173                         err);
1174         }
1175     }
1176
1177     {
1178         /**
1179          * If we got here, return the locale ID parameter.
1180          **/
1181         const int32_t localeIDLength = (int32_t)uprv_strlen(localeID);
1182
1183         uprv_memcpy(
1184             minimizedLocaleID,
1185             localeID,
1186             localeIDLength <= minimizedLocaleIDCapacity ?
1187                 localeIDLength : minimizedLocaleIDCapacity);
1188
1189         return u_terminateChars(
1190                     minimizedLocaleID,
1191                     minimizedLocaleIDCapacity,
1192                     localeIDLength,
1193                     err);
1194     }
1195
1196 error:
1197
1198     if (!U_FAILURE(*err)) {
1199         *err = U_ILLEGAL_ARGUMENT_ERROR;
1200     }
1201
1202     return -1;
1203
1204
1205 }
1206
1207 static UBool
1208 do_canonicalize(const char*    localeID,
1209          char* buffer,
1210          int32_t bufferCapacity,
1211          UErrorCode* err)
1212 {
1213     uloc_canonicalize(
1214         localeID,
1215         buffer,
1216         bufferCapacity,
1217         err);
1218
1219     if (*err == U_STRING_NOT_TERMINATED_WARNING ||
1220         *err == U_BUFFER_OVERFLOW_ERROR) {
1221         *err = U_ILLEGAL_ARGUMENT_ERROR;
1222
1223         return FALSE;
1224     }
1225     else if (U_FAILURE(*err)) {
1226
1227         return FALSE;
1228     }
1229     else {
1230         return TRUE;
1231     }
1232 }
1233
1234 U_CAPI int32_t U_EXPORT2
1235 uloc_addLikelySubtags(const char*    localeID,
1236          char* maximizedLocaleID,
1237          int32_t maximizedLocaleIDCapacity,
1238          UErrorCode* err)
1239 {
1240     char localeBuffer[ULOC_FULLNAME_CAPACITY];
1241
1242     if (!do_canonicalize(
1243         localeID,
1244         localeBuffer,
1245         sizeof(localeBuffer),
1246         err)) {
1247         return -1;
1248     }
1249     else {
1250         return _uloc_addLikelySubtags(
1251                     localeBuffer,
1252                     maximizedLocaleID,
1253                     maximizedLocaleIDCapacity,
1254                     err);
1255     }
1256 }
1257
1258 U_CAPI int32_t U_EXPORT2
1259 uloc_minimizeSubtags(const char*    localeID,
1260          char* minimizedLocaleID,
1261          int32_t minimizedLocaleIDCapacity,
1262          UErrorCode* err)
1263 {
1264     char localeBuffer[ULOC_FULLNAME_CAPACITY];
1265
1266     if (!do_canonicalize(
1267         localeID,
1268         localeBuffer,
1269         sizeof(localeBuffer),
1270         err)) {
1271         return -1;
1272     }
1273     else {
1274         return _uloc_minimizeSubtags(
1275                     localeBuffer,
1276                     minimizedLocaleID,
1277                     minimizedLocaleIDCapacity,
1278                     err);
1279     }
1280 }
1281
// Lookup string for the writing direction of some common languages.
// Each entry is a language subtag immediately followed by one marker
// character: '-' means left-to-right, '+' means right-to-left.
static const char* LANG_DIR_STRING =
        "root-en-es-pt-zh-ja-ko-de-fr-it-"
        "ar+he+fa+"
        "ru-nl-pl-th-tr-";
1286
1287 // Implemented here because this calls uloc_addLikelySubtags().
1288 U_CAPI UBool U_EXPORT2
1289 uloc_isRightToLeft(const char *locale) {
1290     UErrorCode errorCode = U_ZERO_ERROR;
1291     char script[8];
1292     int32_t scriptLength = uloc_getScript(locale, script, UPRV_LENGTHOF(script), &errorCode);
1293     if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING ||
1294             scriptLength == 0) {
1295         // Fastpath: We know the likely scripts and their writing direction
1296         // for some common languages.
1297         errorCode = U_ZERO_ERROR;
1298         char lang[8];
1299         int32_t langLength = uloc_getLanguage(locale, lang, UPRV_LENGTHOF(lang), &errorCode);
1300         if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING ||
1301                 langLength == 0) {
1302             return FALSE;
1303         }
1304         const char* langPtr = uprv_strstr(LANG_DIR_STRING, lang);
1305         if (langPtr != NULL) {
1306             switch (langPtr[langLength]) {
1307             case '-': return FALSE;
1308             case '+': return TRUE;
1309             default: break;  // partial match of a longer code
1310             }
1311         }
1312         // Otherwise, find the likely script.
1313         errorCode = U_ZERO_ERROR;
1314         char likely[ULOC_FULLNAME_CAPACITY];
1315         (void)uloc_addLikelySubtags(locale, likely, UPRV_LENGTHOF(likely), &errorCode);
1316         if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
1317             return FALSE;
1318         }
1319         scriptLength = uloc_getScript(likely, script, UPRV_LENGTHOF(script), &errorCode);
1320         if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING ||
1321                 scriptLength == 0) {
1322             return FALSE;
1323         }
1324     }
1325     UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script);
1326     return uscript_isRightToLeft(scriptCode);
1327 }
1328
1329 U_NAMESPACE_BEGIN
1330
1331 UBool
1332 Locale::isRightToLeft() const {
1333     return uloc_isRightToLeft(getBaseName());
1334 }
1335
1336 U_NAMESPACE_END
1337
1338 // The following must at least allow for rg key value (6) plus terminator (1).
1339 #define ULOC_RG_BUFLEN 8
1340
1341 U_CAPI int32_t U_EXPORT2
1342 ulocimp_getRegionForSupplementalData(const char *localeID, UBool inferRegion,
1343                                      char *region, int32_t regionCapacity, UErrorCode* status) {
1344     if (U_FAILURE(*status)) {
1345         return 0;
1346     }
1347     char rgBuf[ULOC_RG_BUFLEN];
1348     UErrorCode rgStatus = U_ZERO_ERROR;
1349
1350     // First check for rg keyword value
1351     int32_t rgLen = uloc_getKeywordValue(localeID, "rg", rgBuf, ULOC_RG_BUFLEN, &rgStatus);
1352     if (U_FAILURE(rgStatus) || rgLen != 6) {
1353         rgLen = 0;
1354     } else {
1355         // rgBuf guaranteed to be zero terminated here, with text len 6
1356         char *rgPtr = rgBuf;
1357         for (; *rgPtr!= 0; rgPtr++) {
1358             *rgPtr = uprv_toupper(*rgPtr);
1359         }
1360         rgLen = (uprv_strcmp(rgBuf+2, "ZZZZ") == 0)? 2: 0;
1361     }
1362
1363     if (rgLen == 0) {
1364         // No valid rg keyword value, try for unicode_region_subtag
1365         rgLen = uloc_getCountry(localeID, rgBuf, ULOC_RG_BUFLEN, status);
1366         if (U_FAILURE(*status)) {
1367             rgLen = 0;
1368         } else if (rgLen == 0 && inferRegion) {
1369             // no unicode_region_subtag but inferRegion TRUE, try likely subtags
1370             char locBuf[ULOC_FULLNAME_CAPACITY];
1371             rgStatus = U_ZERO_ERROR;
1372             (void)uloc_addLikelySubtags(localeID, locBuf, ULOC_FULLNAME_CAPACITY, &rgStatus);
1373             if (U_SUCCESS(rgStatus)) {
1374                 rgLen = uloc_getCountry(locBuf, rgBuf, ULOC_RG_BUFLEN, status);
1375                 if (U_FAILURE(*status)) {
1376                     rgLen = 0;
1377                 }
1378             }
1379         }
1380     }
1381
1382     rgBuf[rgLen] = 0;
1383     uprv_strncpy(region, rgBuf, regionCapacity);
1384     return u_terminateChars(region, regionCapacity, rgLen, status);
1385 }
1386