icuSources/i18n/tridpars.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 *   Copyright (c) 2002-2014, International Business Machines Corporation
   6 *   and others.  All Rights Reserved.
   7 **********************************************************************
   8 *   Date        Name        Description
   9 *   01/14/2002  aliu        Creation.
  10 **********************************************************************
  11 */
  12
  13 #include "unicode/utypes.h"
  14
  15 #if !UCONFIG_NO_TRANSLITERATION
  16
  17 #include "tridpars.h"
  18 #include "hash.h"
  19 #include "mutex.h"
  20 #include "transreg.h"
  21 #include "uassert.h"
  22 #include "ucln_in.h"
  23 #include "unicode/parsepos.h"
  24 #include "unicode/translit.h"
  25 #include "unicode/uchar.h"
  26 #include "unicode/uniset.h"
  27 #include "unicode/unistr.h"
  28 #include "unicode/utrans.h"
  29 #include "util.h"
  30 #include "uvector.h"
  31
  32 U_NAMESPACE_BEGIN
  33
  34 static const UChar ID_DELIM    = 0x003B; // ;
  35 static const UChar TARGET_SEP  = 0x002D; // -
  36 static const UChar VARIANT_SEP = 0x002F; // /
  37 static const UChar OPEN_REV    = 0x0028; // (
  38 static const UChar CLOSE_REV   = 0x0029; // )
  39
  40 //static const UChar EMPTY[]     = {0}; // ""
  41 static const UChar ANY[]       = {65,110,121,0}; // "Any"
  42 static const UChar ANY_NULL[]  = {65,110,121,45,78,117,108,108,0}; // "Any-Null"
  43
  44 static const int32_t FORWARD = UTRANS_FORWARD;
  45 static const int32_t REVERSE = UTRANS_REVERSE;
  46
  47 static Hashtable* SPECIAL_INVERSES = NULL;
  48 static UInitOnce gSpecialInversesInitOnce = U_INITONCE_INITIALIZER;
  49
  50 /**
  51  * The mutex controlling access to SPECIAL_INVERSES
  52  */
  53 static UMutex *LOCK() {
  54     static UMutex *m = STATIC_NEW(UMutex);
  55     return m;
  56 }
  57
  58 TransliteratorIDParser::Specs::Specs(const UnicodeString& s, const UnicodeString& t,
  59                                      const UnicodeString& v, UBool sawS,
  60                                      const UnicodeString& f) {
  61     source = s;
  62     target = t;
  63     variant = v;
  64     sawSource = sawS;
  65     filter = f;
  66 }
  67
  68 TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b,
  69                                            const UnicodeString& f) {
  70     canonID = c;
  71     basicID = b;
  72     filter = f;
  73 }
  74
  75 TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b) {
  76     canonID = c;
  77     basicID = b;
  78 }
  79
  80 Transliterator* TransliteratorIDParser::SingleID::createInstance() {
  81     Transliterator* t;
  82     if (basicID.length() == 0) {
  83         t = createBasicInstance(UnicodeString(TRUE, ANY_NULL, 8), &canonID);
  84     } else {
  85         t = createBasicInstance(basicID, &canonID);
  86     }
  87     if (t != NULL) {
  88         if (filter.length() != 0) {
  89             UErrorCode ec = U_ZERO_ERROR;
  90             UnicodeSet *set = new UnicodeSet(filter, ec);
  91             if (U_FAILURE(ec)) {
  92                 delete set;
  93             } else {
  94                 t->adoptFilter(set);
  95             }
  96         }
  97     }
  98     return t;
  99 }
 100
 101
 102 /**
 103  * Parse a single ID, that is, an ID of the general form
 104  * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element
 105  * optional, the filters optional, and the variants optional.
 106  * @param id the id to be parsed
 107  * @param pos INPUT-OUTPUT parameter.  On input, the position of
 108  * the first character to parse.  On output, the position after
 109  * the last character parsed.
 110  * @param dir the direction.  If the direction is REVERSE then the
 111  * SingleID is constructed for the reverse direction.
 112  * @return a SingleID object or NULL
 113  */
 114 TransliteratorIDParser::SingleID*
 115 TransliteratorIDParser::parseSingleID(const UnicodeString& id, int32_t& pos,
 116                                       int32_t dir, UErrorCode& status) {
 117
 118     int32_t start = pos;
 119
 120     // The ID will be of the form A, A(), A(B), or (B), where
 121     // A and B are filter IDs.
 122     Specs* specsA = NULL;
 123     Specs* specsB = NULL;
 124     UBool sawParen = FALSE;
 125
 126     // On the first pass, look for (B) or ().  If this fails, then
 127     // on the second pass, look for A, A(B), or A().
 128     for (int32_t pass=1; pass<=2; ++pass) {
 129         if (pass == 2) {
 130             specsA = parseFilterID(id, pos, TRUE);
 131             if (specsA == NULL) {
 132                 pos = start;
 133                 return NULL;
 134             }
 135         }
 136         if (ICU_Utility::parseChar(id, pos, OPEN_REV)) {
 137             sawParen = TRUE;
 138             if (!ICU_Utility::parseChar(id, pos, CLOSE_REV)) {
 139                 specsB = parseFilterID(id, pos, TRUE);
 140                 // Must close with a ')'
 141                 if (specsB == NULL || !ICU_Utility::parseChar(id, pos, CLOSE_REV)) {
 142                     delete specsA;
 143                     pos = start;
 144                     return NULL;
 145                 }
 146             }
 147             break;
 148         }
 149     }
 150
 151     // Assemble return results
 152     SingleID* single;
 153     if (sawParen) {
 154         if (dir == FORWARD) {
 155             SingleID* b = specsToID(specsB, FORWARD);
 156             single = specsToID(specsA, FORWARD);
 157             // Null pointers check
 158             if (b == NULL || single == NULL) {
 159                 delete b;
 160                 delete single;
 161                 status = U_MEMORY_ALLOCATION_ERROR;
 162                 return NULL;
 163             }
 164             single->canonID.append(OPEN_REV)
 165                 .append(b->canonID).append(CLOSE_REV);
 166             if (specsA != NULL) {
 167                 single->filter = specsA->filter;
 168             }
 169             delete b;
 170         } else {
 171             SingleID* a = specsToID(specsA, FORWARD);
 172             single = specsToID(specsB, FORWARD);
 173             // Check for null pointer.
 174             if (a == NULL || single == NULL) {
 175                 delete a;
 176                 delete single;
 177                 status = U_MEMORY_ALLOCATION_ERROR;
 178                 return NULL;
 179             }
 180             single->canonID.append(OPEN_REV)
 181                 .append(a->canonID).append(CLOSE_REV);
 182             if (specsB != NULL) {
 183                 single->filter = specsB->filter;
 184             }
 185             delete a;
 186         }
 187     } else {
 188         // assert(specsA != NULL);
 189         if (dir == FORWARD) {
 190             single = specsToID(specsA, FORWARD);
 191         } else {
 192             single = specsToSpecialInverse(*specsA, status);
 193             if (single == NULL) {
 194                 single = specsToID(specsA, REVERSE);
 195             }
 196         }
 197         // Check for NULL pointer
 198         if (single == NULL) {
 199                 status = U_MEMORY_ALLOCATION_ERROR;
 200                 return NULL;
 201         }
 202         single->filter = specsA->filter;
 203     }
 204
 205     delete specsA;
 206     delete specsB;
 207
 208     return single;
 209 }
 210
 211 /**
 212  * Parse a filter ID, that is, an ID of the general form
 213  * "[f1] s1-t1/v1", with the filters optional, and the variants optional.
 214  * @param id the id to be parsed
 215  * @param pos INPUT-OUTPUT parameter.  On input, the position of
 216  * the first character to parse.  On output, the position after
 217  * the last character parsed.
 218  * @return a SingleID object or null if the parse fails
 219  */
 220 TransliteratorIDParser::SingleID*
 221 TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos) {
 222
 223     int32_t start = pos;
 224
 225     Specs* specs = parseFilterID(id, pos, TRUE);
 226     if (specs == NULL) {
 227         pos = start;
 228         return NULL;
 229     }
 230
 231     // Assemble return results
 232     SingleID* single = specsToID(specs, FORWARD);
 233     if (single != NULL) {
 234         single->filter = specs->filter;
 235     }
 236     delete specs;
 237     return single;
 238 }
 239
 240 /**
 241  * Parse a global filter of the form "[f]" or "([f])", depending
 242  * on 'withParens'.
 243  * @param id the pattern the parse
 244  * @param pos INPUT-OUTPUT parameter.  On input, the position of
 245  * the first character to parse.  On output, the position after
 246  * the last character parsed.
 247  * @param dir the direction.
 248  * @param withParens INPUT-OUTPUT parameter.  On entry, if
 249  * withParens is 0, then parens are disallowed.  If it is 1,
 250  * then parens are requires.  If it is -1, then parens are
 251  * optional, and the return result will be set to 0 or 1.
 252  * @param canonID OUTPUT parameter.  The pattern for the filter
 253  * added to the canonID, either at the end, if dir is FORWARD, or
 254  * at the start, if dir is REVERSE.  The pattern will be enclosed
 255  * in parentheses if appropriate, and will be suffixed with an
 256  * ID_DELIM character.  May be NULL.
 257  * @return a UnicodeSet object or NULL.  A non-NULL results
 258  * indicates a successful parse, regardless of whether the filter
 259  * applies to the given direction.  The caller should discard it
 260  * if withParens != (dir == REVERSE).
 261  */
 262 UnicodeSet* TransliteratorIDParser::parseGlobalFilter(const UnicodeString& id, int32_t& pos,
 263                                                       int32_t dir,
 264                                                       int32_t& withParens,
 265                                                       UnicodeString* canonID) {
 266     UnicodeSet* filter = NULL;
 267     int32_t start = pos;
 268
 269     if (withParens == -1) {
 270         withParens = ICU_Utility::parseChar(id, pos, OPEN_REV) ? 1 : 0;
 271     } else if (withParens == 1) {
 272         if (!ICU_Utility::parseChar(id, pos, OPEN_REV)) {
 273             pos = start;
 274             return NULL;
 275         }
 276     }
 277
 278     ICU_Utility::skipWhitespace(id, pos, TRUE);
 279
 280     if (UnicodeSet::resemblesPattern(id, pos)) {
 281         ParsePosition ppos(pos);
 282         UErrorCode ec = U_ZERO_ERROR;
 283         filter = new UnicodeSet(id, ppos, USET_IGNORE_SPACE, NULL, ec);
 284         /* test for NULL */
 285         if (filter == 0) {
 286             pos = start;
 287             return 0;
 288         }
 289         if (U_FAILURE(ec)) {
 290             delete filter;
 291             pos = start;
 292             return NULL;
 293         }
 294
 295         UnicodeString pattern;
 296         id.extractBetween(pos, ppos.getIndex(), pattern);
 297         pos = ppos.getIndex();
 298
 299         if (withParens == 1 && !ICU_Utility::parseChar(id, pos, CLOSE_REV)) {
 300             pos = start;
 301             return NULL;
 302         }
 303
 304         // In the forward direction, append the pattern to the
 305         // canonID.  In the reverse, insert it at zero, and invert
 306         // the presence of parens ("A" <-> "(A)").
 307         if (canonID != NULL) {
 308             if (dir == FORWARD) {
 309                 if (withParens == 1) {
 310                     pattern.insert(0, OPEN_REV);
 311                     pattern.append(CLOSE_REV);
 312                 }
 313                 canonID->append(pattern).append(ID_DELIM);
 314             } else {
 315                 if (withParens == 0) {
 316                     pattern.insert(0, OPEN_REV);
 317                     pattern.append(CLOSE_REV);
 318                 }
 319                 canonID->insert(0, pattern);
 320                 canonID->insert(pattern.length(), ID_DELIM);
 321             }
 322         }
 323     }
 324
 325     return filter;
 326 }
 327
 328 U_CDECL_BEGIN
 329 static void U_CALLCONV _deleteSingleID(void* obj) {
 330     delete (TransliteratorIDParser::SingleID*) obj;
 331 }
 332
 333 static void U_CALLCONV _deleteTransliteratorTrIDPars(void* obj) {
 334     delete (Transliterator*) obj;
 335 }
 336 U_CDECL_END
 337
 338 /**
 339  * Parse a compound ID, consisting of an optional forward global
 340  * filter, a separator, one or more single IDs delimited by
 341  * separators, an an optional reverse global filter.  The
 342  * separator is a semicolon.  The global filters are UnicodeSet
 343  * patterns.  The reverse global filter must be enclosed in
 344  * parentheses.
 345  * @param id the pattern the parse
 346  * @param dir the direction.
 347  * @param canonID OUTPUT parameter that receives the canonical ID,
 348  * consisting of canonical IDs for all elements, as returned by
 349  * parseSingleID(), separated by semicolons.  Previous contents
 350  * are discarded.
 351  * @param list OUTPUT parameter that receives a list of SingleID
 352  * objects representing the parsed IDs.  Previous contents are
 353  * discarded.
 354  * @param globalFilter OUTPUT parameter that receives a pointer to
 355  * a newly created global filter for this ID in this direction, or
 356  * NULL if there is none.
 357  * @return TRUE if the parse succeeds, that is, if the entire
 358  * id is consumed without syntax error.
 359  */
 360 UBool TransliteratorIDParser::parseCompoundID(const UnicodeString& id, int32_t dir,
 361                                               UnicodeString& canonID,
 362                                               UVector& list,
 363                                               UnicodeSet*& globalFilter) {
 364     UErrorCode ec = U_ZERO_ERROR;
 365     int32_t i;
 366     int32_t pos = 0;
 367     int32_t withParens = 1;
 368     list.removeAllElements();
 369     UnicodeSet* filter;
 370     globalFilter = NULL;
 371     canonID.truncate(0);
 372
 373     // Parse leading global filter, if any
 374     withParens = 0; // parens disallowed
 375     filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
 376     if (filter != NULL) {
 377         if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) {
 378             // Not a global filter; backup and resume
 379             canonID.truncate(0);
 380             pos = 0;
 381         }
 382         if (dir == FORWARD) {
 383             globalFilter = filter;
 384         } else {
 385             delete filter;
 386         }
 387         filter = NULL;
 388     }
 389
 390     UBool sawDelimiter = TRUE;
 391     for (;;) {
 392         SingleID* single = parseSingleID(id, pos, dir, ec);
 393         if (single == NULL) {
 394             break;
 395         }
 396         if (dir == FORWARD) {
 397             list.addElement(single, ec);
 398         } else {
 399             list.insertElementAt(single, 0, ec);
 400         }
 401         if (U_FAILURE(ec)) {
 402             goto FAIL;
 403         }
 404         if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) {
 405             sawDelimiter = FALSE;
 406             break;
 407         }
 408     }
 409
 410     if (list.size() == 0) {
 411         goto FAIL;
 412     }
 413
 414     // Construct canonical ID
 415     for (i=0; i<list.size(); ++i) {
 416         SingleID* single = (SingleID*) list.elementAt(i);
 417         canonID.append(single->canonID);
 418         if (i != (list.size()-1)) {
 419             canonID.append(ID_DELIM);
 420         }
 421     }
 422
 423     // Parse trailing global filter, if any, and only if we saw
 424     // a trailing delimiter after the IDs.
 425     if (sawDelimiter) {
 426         withParens = 1; // parens required
 427         filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
 428         if (filter != NULL) {
 429             // Don't require trailing ';', but parse it if present
 430             ICU_Utility::parseChar(id, pos, ID_DELIM);
 431
 432             if (dir == REVERSE) {
 433                 globalFilter = filter;
 434             } else {
 435                 delete filter;
 436             }
 437             filter = NULL;
 438         }
 439     }
 440
 441     // Trailing unparsed text is a syntax error
 442     ICU_Utility::skipWhitespace(id, pos, TRUE);
 443     if (pos != id.length()) {
 444         goto FAIL;
 445     }
 446
 447     return TRUE;
 448
 449  FAIL:
 450     UObjectDeleter *save = list.setDeleter(_deleteSingleID);
 451     list.removeAllElements();
 452     list.setDeleter(save);
 453     delete globalFilter;
 454     globalFilter = NULL;
 455     return FALSE;
 456 }
 457
 458 /**
 459  * Convert the elements of the 'list' vector, which are SingleID
 460  * objects, into actual Transliterator objects.  In the course of
 461  * this, some (or all) entries may be removed.  If all entries
 462  * are removed, the NULL transliterator will be added.
 463  *
 464  * Delete entries with empty basicIDs; these are generated by
 465  * elements like "(A)" in the forward direction, or "A()" in
 466  * the reverse.  THIS MAY RESULT IN AN EMPTY VECTOR.  Convert
 467  * SingleID entries to actual transliterators.
 468  *
 469  * @param list vector of SingleID objects.  On exit, vector
 470  * of one or more Transliterators.
 471  * @return new value of insertIndex.  The index will shift if
 472  * there are empty items, like "(Lower)", with indices less than
 473  * insertIndex.
 474  */
 475 void TransliteratorIDParser::instantiateList(UVector& list,
 476                                                 UErrorCode& ec) {
 477     UVector tlist(ec);
 478     if (U_FAILURE(ec)) {
 479         goto RETURN;
 480     }
 481     tlist.setDeleter(_deleteTransliteratorTrIDPars);
 482
 483     Transliterator* t;
 484     int32_t i;
 485     for (i=0; i<=list.size(); ++i) { // [sic]: i<=list.size()
 486         // We run the loop too long by one, so we can
 487         // do an insert after the last element
 488         if (i==list.size()) {
 489             break;
 490         }
 491
 492         SingleID* single = (SingleID*) list.elementAt(i);
 493         if (single->basicID.length() != 0) {
 494             t = single->createInstance();
 495             if (t == NULL) {
 496                 ec = U_INVALID_ID;
 497                 goto RETURN;
 498             }
 499             tlist.addElement(t, ec);
 500             if (U_FAILURE(ec)) {
 501                 delete t;
 502                 goto RETURN;
 503             }
 504         }
 505     }
 506
 507     // An empty list is equivalent to a NULL transliterator.
 508     if (tlist.size() == 0) {
 509         t = createBasicInstance(UnicodeString(TRUE, ANY_NULL, 8), NULL);
 510         if (t == NULL) {
 511             // Should never happen
 512             ec = U_INTERNAL_TRANSLITERATOR_ERROR;
 513         }
 514         tlist.addElement(t, ec);
 515         if (U_FAILURE(ec)) {
 516             delete t;
 517         }
 518     }
 519
 520  RETURN:
 521
 522     UObjectDeleter *save = list.setDeleter(_deleteSingleID);
 523     list.removeAllElements();
 524
 525     if (U_SUCCESS(ec)) {
 526         list.setDeleter(_deleteTransliteratorTrIDPars);
 527
 528         while (tlist.size() > 0) {
 529             t = (Transliterator*) tlist.orphanElementAt(0);
 530             list.addElement(t, ec);
 531             if (U_FAILURE(ec)) {
 532                 delete t;
 533                 list.removeAllElements();
 534                 break;
 535             }
 536         }
 537     }
 538
 539     list.setDeleter(save);
 540 }
 541
 542 /**
 543  * Parse an ID into pieces.  Take IDs of the form T, T/V, S-T,
 544  * S-T/V, or S/V-T.  If the source is missing, return a source of
 545  * ANY.
 546  * @param id the id string, in any of several forms
 547  * @return an array of 4 strings: source, target, variant, and
 548  * isSourcePresent.  If the source is not present, ANY will be
 549  * given as the source, and isSourcePresent will be NULL.  Otherwise
 550  * isSourcePresent will be non-NULL.  The target may be empty if the
 551  * id is not well-formed.  The variant may be empty.
 552  */
 553 void TransliteratorIDParser::IDtoSTV(const UnicodeString& id,
 554                                      UnicodeString& source,
 555                                      UnicodeString& target,
 556                                      UnicodeString& variant,
 557                                      UBool& isSourcePresent) {
 558     source.setTo(ANY, 3);
 559     target.truncate(0);
 560     variant.truncate(0);
 561
 562     int32_t sep = id.indexOf(TARGET_SEP);
 563     int32_t var = id.indexOf(VARIANT_SEP);
 564     if (var < 0) {
 565         var = id.length();
 566     }
 567     isSourcePresent = FALSE;
 568
 569     if (sep < 0) {
 570         // Form: T/V or T (or /V)
 571         id.extractBetween(0, var, target);
 572         id.extractBetween(var, id.length(), variant);
 573     } else if (sep < var) {
 574         // Form: S-T/V or S-T (or -T/V or -T)
 575         if (sep > 0) {
 576             id.extractBetween(0, sep, source);
 577             isSourcePresent = TRUE;
 578         }
 579         id.extractBetween(++sep, var, target);
 580         id.extractBetween(var, id.length(), variant);
 581     } else {
 582         // Form: (S/V-T or /V-T)
 583         if (var > 0) {
 584             id.extractBetween(0, var, source);
 585             isSourcePresent = TRUE;
 586         }
 587         id.extractBetween(var, sep++, variant);
 588         id.extractBetween(sep, id.length(), target);
 589     }
 590
 591     if (variant.length() > 0) {
 592         variant.remove(0, 1);
 593     }
 594 }
 595
 596 /**
 597  * Given source, target, and variant strings, concatenate them into a
 598  * full ID.  If the source is empty, then "Any" will be used for the
 599  * source, so the ID will always be of the form s-t/v or s-t.
 600  */
 601 void TransliteratorIDParser::STVtoID(const UnicodeString& source,
 602                                      const UnicodeString& target,
 603                                      const UnicodeString& variant,
 604                                      UnicodeString& id) {
 605     id = source;
 606     if (id.length() == 0) {
 607         id.setTo(ANY, 3);
 608     }
 609     id.append(TARGET_SEP).append(target);
 610     if (variant.length() != 0) {
 611         id.append(VARIANT_SEP).append(variant);
 612     }
 613     // NUL-terminate the ID string for getTerminatedBuffer.
 614     // This prevents valgrind and Purify warnings.
 615     id.append((UChar)0);
 616     id.truncate(id.length()-1);
 617 }
 618
 619 /**
 620  * Register two targets as being inverses of one another.  For
 621  * example, calling registerSpecialInverse("NFC", "NFD", TRUE) causes
 622  * Transliterator to form the following inverse relationships:
 623  *
 624  * <pre>NFC => NFD
 625  * Any-NFC => Any-NFD
 626  * NFD => NFC
 627  * Any-NFD => Any-NFC</pre>
 628  *
 629  * (Without the special inverse registration, the inverse of NFC
 630  * would be NFC-Any.)  Note that NFD is shorthand for Any-NFD, but
 631  * that the presence or absence of "Any-" is preserved.
 632  *
 633  * <p>The relationship is symmetrical; registering (a, b) is
 634  * equivalent to registering (b, a).
 635  *
 636  * <p>The relevant IDs must still be registered separately as
 637  * factories or classes.
 638  *
 639  * <p>Only the targets are specified.  Special inverses always
 640  * have the form Any-Target1 <=> Any-Target2.  The target should
 641  * have canonical casing (the casing desired to be produced when
 642  * an inverse is formed) and should contain no whitespace or other
 643  * extraneous characters.
 644  *
 645  * @param target the target against which to register the inverse
 646  * @param inverseTarget the inverse of target, that is
 647  * Any-target.getInverse() => Any-inverseTarget
 648  * @param bidirectional if TRUE, register the reverse relation
 649  * as well, that is, Any-inverseTarget.getInverse() => Any-target
 650  */
 651 void TransliteratorIDParser::registerSpecialInverse(const UnicodeString& target,
 652                                                     const UnicodeString& inverseTarget,
 653                                                     UBool bidirectional,
 654                                                     UErrorCode &status) {
 655     umtx_initOnce(gSpecialInversesInitOnce, init, status);
 656     if (U_FAILURE(status)) {
 657         return;
 658     }
 659
 660     // If target == inverseTarget then force bidirectional => FALSE
 661     if (bidirectional && 0==target.caseCompare(inverseTarget, U_FOLD_CASE_DEFAULT)) {
 662         bidirectional = FALSE;
 663     }
 664
 665     Mutex lock(LOCK());
 666
 667     UnicodeString *tempus = new UnicodeString(inverseTarget);  // Used for null pointer check before usage.
 668     if (tempus == NULL) {
 669         status = U_MEMORY_ALLOCATION_ERROR;
 670         return;
 671     }
 672     SPECIAL_INVERSES->put(target, tempus, status);
 673     if (bidirectional) {
 674         tempus = new UnicodeString(target);
 675         if (tempus == NULL) {
 676                 status = U_MEMORY_ALLOCATION_ERROR;
 677                 return;
 678         }
 679         SPECIAL_INVERSES->put(inverseTarget, tempus, status);
 680     }
 681 }
 682
 683 //----------------------------------------------------------------
 684 // Private implementation
 685 //----------------------------------------------------------------
 686
 687 /**
 688  * Parse an ID into component pieces.  Take IDs of the form T,
 689  * T/V, S-T, S-T/V, or S/V-T.  If the source is missing, return a
 690  * source of ANY.
 691  * @param id the id string, in any of several forms
 692  * @param pos INPUT-OUTPUT parameter.  On input, pos is the
 693  * offset of the first character to parse in id.  On output,
 694  * pos is the offset after the last parsed character.  If the
 695  * parse failed, pos will be unchanged.
 696  * @param allowFilter2 if TRUE, a UnicodeSet pattern is allowed
 697  * at any location between specs or delimiters, and is returned
 698  * as the fifth string in the array.
 699  * @return a Specs object, or NULL if the parse failed.  If
 700  * neither source nor target was seen in the parsed id, then the
 701  * parse fails.  If allowFilter is TRUE, then the parsed filter
 702  * pattern is returned in the Specs object, otherwise the returned
 703  * filter reference is NULL.  If the parse fails for any reason
 704  * NULL is returned.
 705  */
 706 TransliteratorIDParser::Specs*
 707 TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos,
 708                                       UBool allowFilter) {
 709     UnicodeString first;
 710     UnicodeString source;
 711     UnicodeString target;
 712     UnicodeString variant;
 713     UnicodeString filter;
 714     UChar delimiter = 0;
 715     int32_t specCount = 0;
 716     int32_t start = pos;
 717
 718     // This loop parses one of the following things with each
 719     // pass: a filter, a delimiter character (either '-' or '/'),
 720     // or a spec (source, target, or variant).
 721     for (;;) {
 722         ICU_Utility::skipWhitespace(id, pos, TRUE);
 723         if (pos == id.length()) {
 724             break;
 725         }
 726
 727         // Parse filters
 728         if (allowFilter && filter.length() == 0 &&
 729             UnicodeSet::resemblesPattern(id, pos)) {
 730
 731             ParsePosition ppos(pos);
 732             UErrorCode ec = U_ZERO_ERROR;
 733             UnicodeSet set(id, ppos, USET_IGNORE_SPACE, NULL, ec);
 734             if (U_FAILURE(ec)) {
 735                 pos = start;
 736                 return NULL;
 737             }
 738             id.extractBetween(pos, ppos.getIndex(), filter);
 739             pos = ppos.getIndex();
 740             continue;
 741         }
 742
 743         if (delimiter == 0) {
 744             UChar c = id.charAt(pos);
 745             if ((c == TARGET_SEP && target.length() == 0) ||
 746                 (c == VARIANT_SEP && variant.length() == 0)) {
 747                 delimiter = c;
 748                 ++pos;
 749                 continue;
 750             }
 751         }
 752
 753         // We are about to try to parse a spec with no delimiter
 754         // when we can no longer do so (we can only do so at the
 755         // start); break.
 756         if (delimiter == 0 && specCount > 0) {
 757             break;
 758         }
 759
 760         UnicodeString spec = ICU_Utility::parseUnicodeIdentifier(id, pos);
 761         if (spec.length() == 0) {
 762             // Note that if there was a trailing delimiter, we
 763             // consume it.  So Foo-, Foo/, Foo-Bar/, and Foo/Bar-
 764             // are legal.
 765             break;
 766         }
 767
 768         switch (delimiter) {
 769         case 0:
 770             first = spec;
 771             break;
 772         case TARGET_SEP:
 773             target = spec;
 774             break;
 775         case VARIANT_SEP:
 776             variant = spec;
 777             break;
 778         }
 779         ++specCount;
 780         delimiter = 0;
 781     }
 782
 783     // A spec with no prior character is either source or target,
 784     // depending on whether an explicit "-target" was seen.
 785     if (first.length() != 0) {
 786         if (target.length() == 0) {
 787             target = first;
 788         } else {
 789             source = first;
 790         }
 791     }
 792
 793     // Must have either source or target
 794     if (source.length() == 0 && target.length() == 0) {
 795         pos = start;
 796         return NULL;
 797     }
 798
 799     // Empty source or target defaults to ANY
 800     UBool sawSource = TRUE;
 801     if (source.length() == 0) {
 802         source.setTo(ANY, 3);
 803         sawSource = FALSE;
 804     }
 805     if (target.length() == 0) {
 806         target.setTo(ANY, 3);
 807     }
 808
 809     return new Specs(source, target, variant, sawSource, filter);
 810 }
 811
 812 /**
 813  * Givens a Spec object, convert it to a SingleID object.  The
 814  * Spec object is a more unprocessed parse result.  The SingleID
 815  * object contains information about canonical and basic IDs.
 816  * @return a SingleID; never returns NULL.  Returned object always
 817  * has 'filter' field of NULL.
 818  */
 819 TransliteratorIDParser::SingleID*
 820 TransliteratorIDParser::specsToID(const Specs* specs, int32_t dir) {
 821     UnicodeString canonID;
 822     UnicodeString basicID;
 823     UnicodeString basicPrefix;
 824     if (specs != NULL) {
 825         UnicodeString buf;
 826         if (dir == FORWARD) {
 827             if (specs->sawSource) {
 828                 buf.append(specs->source).append(TARGET_SEP);
 829             } else {
 830                 basicPrefix = specs->source;
 831                 basicPrefix.append(TARGET_SEP);
 832             }
 833             buf.append(specs->target);
 834         } else {
 835             buf.append(specs->target).append(TARGET_SEP).append(specs->source);
 836         }
 837         if (specs->variant.length() != 0) {
 838             buf.append(VARIANT_SEP).append(specs->variant);
 839         }
 840         basicID = basicPrefix;
 841         basicID.append(buf);
 842         if (specs->filter.length() != 0) {
 843             buf.insert(0, specs->filter);
 844         }
 845         canonID = buf;
 846     }
 847     return new SingleID(canonID, basicID);
 848 }
 849
 850 /**
 851  * Given a Specs object, return a SingleID representing the
 852  * special inverse of that ID.  If there is no special inverse
 853  * then return NULL.
 854  * @return a SingleID or NULL.  Returned object always has
 855  * 'filter' field of NULL.
 856  */
 857 TransliteratorIDParser::SingleID*
 858 TransliteratorIDParser::specsToSpecialInverse(const Specs& specs, UErrorCode &status) {
 859     if (0!=specs.source.caseCompare(ANY, 3, U_FOLD_CASE_DEFAULT)) {
 860         return NULL;
 861     }
 862     umtx_initOnce(gSpecialInversesInitOnce, init, status);
 863     if (U_FAILURE(status)) {
 864         return NULL;
 865     }
 866
 867     UnicodeString* inverseTarget;
 868
 869     umtx_lock(LOCK());
 870     inverseTarget = (UnicodeString*) SPECIAL_INVERSES->get(specs.target);
 871     umtx_unlock(LOCK());
 872
 873     if (inverseTarget != NULL) {
 874         // If the original ID contained "Any-" then make the
 875         // special inverse "Any-Foo"; otherwise make it "Foo".
 876         // So "Any-NFC" => "Any-NFD" but "NFC" => "NFD".
 877         UnicodeString buf;
 878         if (specs.filter.length() != 0) {
 879             buf.append(specs.filter);
 880         }
 881         if (specs.sawSource) {
 882             buf.append(ANY, 3).append(TARGET_SEP);
 883         }
 884         buf.append(*inverseTarget);
 885
 886         UnicodeString basicID(TRUE, ANY, 3);
 887         basicID.append(TARGET_SEP).append(*inverseTarget);
 888
 889         if (specs.variant.length() != 0) {
 890             buf.append(VARIANT_SEP).append(specs.variant);
 891             basicID.append(VARIANT_SEP).append(specs.variant);
 892         }
 893         return new SingleID(buf, basicID);
 894     }
 895     return NULL;
 896 }
 897
 898 /**
 899  * Glue method to get around access problems in C++.  This would
 900  * ideally be inline but we want to avoid a circular header
 901  * dependency.
 902  */
 903 Transliterator* TransliteratorIDParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) {
 904     return Transliterator::createBasicInstance(id, canonID);
 905 }
 906
 907 /**
 908  * Initialize static memory. Called through umtx_initOnce only.
 909  */
 910 void U_CALLCONV TransliteratorIDParser::init(UErrorCode &status) {
 911     U_ASSERT(SPECIAL_INVERSES == NULL);
 912     ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, utrans_transliterator_cleanup);
 913
 914     SPECIAL_INVERSES = new Hashtable(TRUE, status);
 915     if (SPECIAL_INVERSES == NULL) {
 916         status = U_MEMORY_ALLOCATION_ERROR;
 917         return;
 918     }
 919     SPECIAL_INVERSES->setValueDeleter(uprv_deleteUObject);
 920 }
 921
 922 /**
 923  * Free static memory.
 924  */
 925 void TransliteratorIDParser::cleanup() {
 926     if (SPECIAL_INVERSES) {
 927         delete SPECIAL_INVERSES;
 928         SPECIAL_INVERSES = NULL;
 929     }
 930     gSpecialInversesInitOnce.reset();
 931 }
 932
 933 U_NAMESPACE_END
 934
 935 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 936
 937 //eof