icuSources/common/util.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (c) 2001-2004, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   11/19/2001  aliu        Creation.
   8 **********************************************************************
   9 */
  10
  11 #include "util.h"
  12 #include "unicode/uchar.h"
  13 #include "unicode/unimatch.h"
  14 #include "uprops.h"
  15
  16 // Define UChar constants using hex for EBCDIC compatibility
  17
  18 static const UChar BACKSLASH  = 0x005C; /*\*/
  19 static const UChar UPPER_U    = 0x0055; /*U*/
  20 static const UChar LOWER_U    = 0x0075; /*u*/
  21 static const UChar APOSTROPHE = 0x0027; // '\''
  22 static const UChar SPACE      = 0x0020; // ' '
  23
  24 // "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  25 static const UChar DIGITS[] = {
  26     48,49,50,51,52,53,54,55,56,57,
  27     65,66,67,68,69,70,71,72,73,74,
  28     75,76,77,78,79,80,81,82,83,84,
  29     85,86,87,88,89,90
  30 };
  31
  32 UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n,
  33                                      int32_t radix, int32_t minDigits) {
  34     if (radix < 2 || radix > 36) {
  35         // Bogus radix
  36         return result.append((UChar)63/*?*/);
  37     }
  38     // Handle negatives
  39     if (n < 0) {
  40         n = -n;
  41         result.append((UChar)45/*-*/);
  42     }
  43     // First determine the number of digits
  44     int32_t nn = n;
  45     int32_t r = 1;
  46     while (nn >= radix) {
  47         nn /= radix;
  48         r *= radix;
  49         --minDigits;
  50     }
  51     // Now generate the digits
  52     while (--minDigits > 0) {
  53         result.append(DIGITS[0]);
  54     }
  55     while (r > 0) {
  56         int32_t digit = n / r;
  57         result.append(DIGITS[digit]);
  58         n -= digit * r;
  59         r /= radix;
  60     }
  61     return result;
  62 }
  63
  64 static const UChar HEX[16] = {48,49,50,51,52,53,54,55,  // 0-7
  65                               56,57,65,66,67,68,69,70}; // 8-9 A-F
  66
  67 /**
  68  * Return true if the character is NOT printable ASCII.
  69  */
  70 UBool ICU_Utility::isUnprintable(UChar32 c) {
  71     return !(c >= 0x20 && c <= 0x7E);
  72 }
  73
  74 /**
  75  * Escape unprintable characters using \uxxxx notation for U+0000 to
  76  * U+FFFF and \Uxxxxxxxx for U+10000 and above.  If the character is
  77  * printable ASCII, then do nothing and return FALSE.  Otherwise,
  78  * append the escaped notation and return TRUE.
  79  */
  80 UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {
  81     if (isUnprintable(c)) {
  82         result.append(BACKSLASH);
  83         if (c & ~0xFFFF) {
  84             result.append(UPPER_U);
  85             result.append(HEX[0xF&(c>>28)]);
  86             result.append(HEX[0xF&(c>>24)]);
  87             result.append(HEX[0xF&(c>>20)]);
  88             result.append(HEX[0xF&(c>>16)]);
  89         } else {
  90             result.append(LOWER_U);
  91         }
  92         result.append(HEX[0xF&(c>>12)]);
  93         result.append(HEX[0xF&(c>>8)]);
  94         result.append(HEX[0xF&(c>>4)]);
  95         result.append(HEX[0xF&c]);
  96         return TRUE;
  97     }
  98     return FALSE;
  99 }
 100
 101 /**
 102  * Returns the index of a character, ignoring quoted text.
 103  * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
 104  * found by a search for 'h'.
 105  */
 106 // FOR FUTURE USE.  DISABLE FOR NOW for coverage reasons.
 107 /*
 108 int32_t ICU_Utility::quotedIndexOf(const UnicodeString& text,
 109                                int32_t start, int32_t limit,
 110                                UChar charToFind) {
 111     for (int32_t i=start; i<limit; ++i) {
 112         UChar c = text.charAt(i);
 113         if (c == BACKSLASH) {
 114             ++i;
 115         } else if (c == APOSTROPHE) {
 116             while (++i < limit
 117                    && text.charAt(i) != APOSTROPHE) {}
 118         } else if (c == charToFind) {
 119             return i;
 120         }
 121     }
 122     return -1;
 123 }
 124 */
 125
 126 /**
 127  * Skip over a sequence of zero or more white space characters at pos.
 128  * @param advance if true, advance pos to the first non-white-space
 129  * character at or after pos, or str.length(), if there is none.
 130  * Otherwise leave pos unchanged.
 131  * @return the index of the first non-white-space character at or
 132  * after pos, or str.length(), if there is none.
 133  */
 134 int32_t ICU_Utility::skipWhitespace(const UnicodeString& str, int32_t& pos,
 135                                     UBool advance) {
 136     int32_t p = pos;
 137     while (p < str.length()) {
 138         UChar32 c = str.char32At(p);
 139         if (!uprv_isRuleWhiteSpace(c)) {
 140             break;
 141         }
 142         p += UTF_CHAR_LENGTH(c);
 143     }
 144     if (advance) {
 145         pos = p;
 146     }
 147     return p;
 148 }
 149
 150 /**
 151  * Skip over whitespace in a Replaceable.  Whitespace is defined by
 152  * uprv_isRuleWhiteSpace().  Skipping may be done in the forward or
 153  * reverse direction.  In either case, the leftmost index will be
 154  * inclusive, and the rightmost index will be exclusive.  That is,
 155  * given a range defined as [start, limit), the call
 156  * skipWhitespace(text, start, limit) will advance start past leading
 157  * whitespace, whereas the call skipWhitespace(text, limit, start),
 158  * will back up limit past trailing whitespace.
 159  * @param text the text to be analyzed
 160  * @param pos either the start or limit of a range of 'text', to skip
 161  * leading or trailing whitespace, respectively
 162  * @param stop either the limit or start of a range of 'text', to skip
 163  * leading or trailing whitespace, respectively
 164  * @return the new start or limit, depending on what was passed in to
 165  * 'pos'
 166  */
 167 //?FOR FUTURE USE.  DISABLE FOR NOW for coverage reasons.
 168 //?int32_t ICU_Utility::skipWhitespace(const Replaceable& text,
 169 //?                                    int32_t pos, int32_t stop) {
 170 //?    UChar32 c;
 171 //?    UBool isForward = (stop >= pos);
 172 //?
 173 //?    if (!isForward) {
 174 //?        --pos; // pos is a limit, so back up by one
 175 //?    }
 176 //?
 177 //?    while (pos != stop &&
 178 //?           uprv_isRuleWhiteSpace(c = text.char32At(pos))) {
 179 //?        if (isForward) {
 180 //?            pos += UTF_CHAR_LENGTH(c);
 181 //?        } else {
 182 //?            pos -= UTF_CHAR_LENGTH(c);
 183 //?        }
 184 //?    }
 185 //?
 186 //?    if (!isForward) {
 187 //?        ++pos; // make pos back into a limit
 188 //?    }
 189 //?
 190 //?    return pos;
 191 //?}
 192
 193 /**
 194  * Parse a single non-whitespace character 'ch', optionally
 195  * preceded by whitespace.
 196  * @param id the string to be parsed
 197  * @param pos INPUT-OUTPUT parameter.  On input, pos[0] is the
 198  * offset of the first character to be parsed.  On output, pos[0]
 199  * is the index after the last parsed character.  If the parse
 200  * fails, pos[0] will be unchanged.
 201  * @param ch the non-whitespace character to be parsed.
 202  * @return true if 'ch' is seen preceded by zero or more
 203  * whitespace characters.
 204  */
 205 UBool ICU_Utility::parseChar(const UnicodeString& id, int32_t& pos, UChar ch) {
 206     int32_t start = pos;
 207     skipWhitespace(id, pos, TRUE);
 208     if (pos == id.length() ||
 209         id.charAt(pos) != ch) {
 210         pos = start;
 211         return FALSE;
 212     }
 213     ++pos;
 214     return TRUE;
 215 }
 216
 217 /**
 218  * Parse a pattern string starting at offset pos.  Keywords are
 219  * matched case-insensitively.  Spaces may be skipped and may be
 220  * optional or required.  Integer values may be parsed, and if
 221  * they are, they will be returned in the given array.  If
 222  * successful, the offset of the next non-space character is
 223  * returned.  On failure, -1 is returned.
 224  * @param pattern must only contain lowercase characters, which
 225  * will match their uppercase equivalents as well.  A space
 226  * character matches one or more required spaces.  A '~' character
 227  * matches zero or more optional spaces.  A '#' character matches
 228  * an integer and stores it in parsedInts, which the caller must
 229  * ensure has enough capacity.
 230  * @param parsedInts array to receive parsed integers.  Caller
 231  * must ensure that parsedInts.length is >= the number of '#'
 232  * signs in 'pattern'.
 233  * @return the position after the last character parsed, or -1 if
 234  * the parse failed
 235  */
 236 int32_t ICU_Utility::parsePattern(const UnicodeString& rule, int32_t pos, int32_t limit,
 237                               const UnicodeString& pattern, int32_t* parsedInts) {
 238     // TODO Update this to handle surrogates
 239     int32_t p;
 240     int32_t intCount = 0; // number of integers parsed
 241     for (int32_t i=0; i<pattern.length(); ++i) {
 242         UChar cpat = pattern.charAt(i);
 243         UChar c;
 244         switch (cpat) {
 245         case 32 /*' '*/:
 246             if (pos >= limit) {
 247                 return -1;
 248             }
 249             c = rule.charAt(pos++);
 250             if (!uprv_isRuleWhiteSpace(c)) {
 251                 return -1;
 252             }
 253             // FALL THROUGH to skipWhitespace
 254         case 126 /*'~'*/:
 255             pos = skipWhitespace(rule, pos);
 256             break;
 257         case 35 /*'#'*/:
 258             p = pos;
 259             parsedInts[intCount++] = parseInteger(rule, p, limit);
 260             if (p == pos) {
 261                 // Syntax error; failed to parse integer
 262                 return -1;
 263             }
 264             pos = p;
 265             break;
 266         default:
 267             if (pos >= limit) {
 268                 return -1;
 269             }
 270             c = (UChar) u_tolower(rule.charAt(pos++));
 271             if (c != cpat) {
 272                 return -1;
 273             }
 274             break;
 275         }
 276     }
 277     return pos;
 278 }
 279
 280 /**
 281  * Parse a pattern string within the given Replaceable and a parsing
 282  * pattern.  Characters are matched literally and case-sensitively
 283  * except for the following special characters:
 284  *
 285  * ~  zero or more uprv_isRuleWhiteSpace chars
 286  *
 287  * If end of pattern is reached with all matches along the way,
 288  * pos is advanced to the first unparsed index and returned.
 289  * Otherwise -1 is returned.
 290  * @param pat pattern that controls parsing
 291  * @param text text to be parsed, starting at index
 292  * @param index offset to first character to parse
 293  * @param limit offset after last character to parse
 294  * @return index after last parsed character, or -1 on parse failure.
 295  */
 296 int32_t ICU_Utility::parsePattern(const UnicodeString& pat,
 297                                   const Replaceable& text,
 298                                   int32_t index,
 299                                   int32_t limit) {
 300     int32_t ipat = 0;
 301
 302     // empty pattern matches immediately
 303     if (ipat == pat.length()) {
 304         return index;
 305     }
 306
 307     UChar32 cpat = pat.char32At(ipat);
 308
 309     while (index < limit) {
 310         UChar32 c = text.char32At(index);
 311
 312         // parse \s*
 313         if (cpat == 126 /*~*/) {
 314             if (uprv_isRuleWhiteSpace(c)) {
 315                 index += UTF_CHAR_LENGTH(c);
 316                 continue;
 317             } else {
 318                 if (++ipat == pat.length()) {
 319                     return index; // success; c unparsed
 320                 }
 321                 // fall thru; process c again with next cpat
 322             }
 323         }
 324
 325         // parse literal
 326         else if (c == cpat) {
 327             index += UTF_CHAR_LENGTH(c);
 328             ipat += UTF_CHAR_LENGTH(cpat);
 329             if (ipat == pat.length()) {
 330                 return index; // success; c parsed
 331             }
 332             // fall thru; get next cpat
 333         }
 334
 335         // match failure of literal
 336         else {
 337             return -1;
 338         }
 339
 340         cpat = pat.char32At(ipat);
 341     }
 342
 343     return -1; // text ended before end of pat
 344 }
 345
 346 /**
 347  * Parse an integer at pos, either of the form \d+ or of the form
 348  * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
 349  * or octal format.
 350  * @param pos INPUT-OUTPUT parameter.  On input, the first
 351  * character to parse.  On output, the character after the last
 352  * parsed character.
 353  */
 354 int32_t ICU_Utility::parseInteger(const UnicodeString& rule, int32_t& pos, int32_t limit) {
 355     int32_t count = 0;
 356     int32_t value = 0;
 357     int32_t p = pos;
 358     int8_t radix = 10;
 359
 360     if (p < limit && rule.charAt(p) == 48 /*0*/) {
 361         if (p+1 < limit && (rule.charAt(p+1) == 0x78 /*x*/ || rule.charAt(p+1) == 0x58 /*X*/)) {
 362             p += 2;
 363             radix = 16;
 364         }
 365         else {
 366             p++;
 367             count = 1;
 368             radix = 8;
 369         }
 370     }
 371
 372     while (p < limit) {
 373         int32_t d = u_digit(rule.charAt(p++), radix);
 374         if (d < 0) {
 375             --p;
 376             break;
 377         }
 378         ++count;
 379         int32_t v = (value * radix) + d;
 380         if (v <= value) {
 381             // If there are too many input digits, at some point
 382             // the value will go negative, e.g., if we have seen
 383             // "0x8000000" already and there is another '0', when
 384             // we parse the next 0 the value will go negative.
 385             return 0;
 386         }
 387         value = v;
 388     }
 389     if (count > 0) {
 390         pos = p;
 391     }
 392     return value;
 393 }
 394
 395 /**
 396  * Parse a Unicode identifier from the given string at the given
 397  * position.  Return the identifier, or an empty string if there
 398  * is no identifier.
 399  * @param str the string to parse
 400  * @param pos INPUT-OUPUT parameter.  On INPUT, pos is the
 401  * first character to examine.  It must be less than str.length(),
 402  * and it must not point to a whitespace character.  That is, must
 403  * have pos < str.length() and
 404  * !uprv_isRuleWhiteSpace(str.char32At(pos)).  On
 405  * OUTPUT, the position after the last parsed character.
 406  * @return the Unicode identifier, or an empty string if there is
 407  * no valid identifier at pos.
 408  */
 409 UnicodeString ICU_Utility::parseUnicodeIdentifier(const UnicodeString& str, int32_t& pos) {
 410     // assert(pos < str.length());
 411     // assert(!uprv_isRuleWhiteSpace(str.char32At(pos)));
 412     UnicodeString buf;
 413     int p = pos;
 414     while (p < str.length()) {
 415         UChar32 ch = str.char32At(p);
 416         if (buf.length() == 0) {
 417             if (u_isIDStart(ch)) {
 418                 buf.append(ch);
 419             } else {
 420                 buf.truncate(0);
 421                 return buf;
 422             }
 423         } else {
 424             if (u_isIDPart(ch)) {
 425                 buf.append(ch);
 426             } else {
 427                 break;
 428             }
 429         }
 430         p += UTF_CHAR_LENGTH(ch);
 431     }
 432     pos = p;
 433     return buf;
 434 }
 435
 436 /**
 437  * Parse an unsigned 31-bit integer at the given offset.  Use
 438  * UCharacter.digit() to parse individual characters into digits.
 439  * @param text the text to be parsed
 440  * @param pos INPUT-OUTPUT parameter.  On entry, pos[0] is the
 441  * offset within text at which to start parsing; it should point
 442  * to a valid digit.  On exit, pos[0] is the offset after the last
 443  * parsed character.  If the parse failed, it will be unchanged on
 444  * exit.  Must be >= 0 on entry.
 445  * @param radix the radix in which to parse; must be >= 2 and <=
 446  * 36.
 447  * @return a non-negative parsed number, or -1 upon parse failure.
 448  * Parse fails if there are no digits, that is, if pos[0] does not
 449  * point to a valid digit on entry, or if the number to be parsed
 450  * does not fit into a 31-bit unsigned integer.
 451  */
 452 int32_t ICU_Utility::parseNumber(const UnicodeString& text,
 453                                  int32_t& pos, int8_t radix) {
 454     // assert(pos[0] >= 0);
 455     // assert(radix >= 2);
 456     // assert(radix <= 36);
 457     int32_t n = 0;
 458     int32_t p = pos;
 459     while (p < text.length()) {
 460         UChar32 ch = text.char32At(p);
 461         int32_t d = u_digit(ch, radix);
 462         if (d < 0) {
 463             break;
 464         }
 465         n = radix*n + d;
 466         // ASSUME that when a 32-bit integer overflows it becomes
 467         // negative.  E.g., 214748364 * 10 + 8 => negative value.
 468         if (n < 0) {
 469             return -1;
 470         }
 471         ++p;
 472     }
 473     if (p == pos) {
 474         return -1;
 475     }
 476     pos = p;
 477     return n;
 478 }
 479
 480 /**
 481  * Append a character to a rule that is being built up.  To flush
 482  * the quoteBuf to rule, make one final call with isLiteral == TRUE.
 483  * If there is no final character, pass in (UChar32)-1 as c.
 484  * @param rule the string to append the character to
 485  * @param c the character to append, or (UChar32)-1 if none.
 486  * @param isLiteral if true, then the given character should not be
 487  * quoted or escaped.  Usually this means it is a syntactic element
 488  * such as > or $
 489  * @param escapeUnprintable if true, then unprintable characters
 490  * should be escaped using \uxxxx or \Uxxxxxxxx.  These escapes will
 491  * appear outside of quotes.
 492  * @param quoteBuf a buffer which is used to build up quoted
 493  * substrings.  The caller should initially supply an empty buffer,
 494  * and thereafter should not modify the buffer.  The buffer should be
 495  * cleared out by, at the end, calling this method with a literal
 496  * character.
 497  */
 498 void ICU_Utility::appendToRule(UnicodeString& rule,
 499                                UChar32 c,
 500                                UBool isLiteral,
 501                                UBool escapeUnprintable,
 502                                UnicodeString& quoteBuf) {
 503     // If we are escaping unprintables, then escape them outside
 504     // quotes.  \u and \U are not recognized within quotes.  The same
 505     // logic applies to literals, but literals are never escaped.
 506     if (isLiteral ||
 507         (escapeUnprintable && ICU_Utility::isUnprintable(c))) {
 508         if (quoteBuf.length() > 0) {
 509             // We prefer backslash APOSTROPHE to double APOSTROPHE
 510             // (more readable, less similar to ") so if there are
 511             // double APOSTROPHEs at the ends, we pull them outside
 512             // of the quote.
 513
 514             // If the first thing in the quoteBuf is APOSTROPHE
 515             // (doubled) then pull it out.
 516             while (quoteBuf.length() >= 2 &&
 517                    quoteBuf.charAt(0) == APOSTROPHE &&
 518                    quoteBuf.charAt(1) == APOSTROPHE) {
 519                 rule.append(BACKSLASH).append(APOSTROPHE);
 520                 quoteBuf.remove(0, 2);
 521             }
 522             // If the last thing in the quoteBuf is APOSTROPHE
 523             // (doubled) then remove and count it and add it after.
 524             int32_t trailingCount = 0;
 525             while (quoteBuf.length() >= 2 &&
 526                    quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
 527                    quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
 528                 quoteBuf.truncate(quoteBuf.length()-2);
 529                 ++trailingCount;
 530             }
 531             if (quoteBuf.length() > 0) {
 532                 rule.append(APOSTROPHE);
 533                 rule.append(quoteBuf);
 534                 rule.append(APOSTROPHE);
 535                 quoteBuf.truncate(0);
 536             }
 537             while (trailingCount-- > 0) {
 538                 rule.append(BACKSLASH).append(APOSTROPHE);
 539             }
 540         }
 541         if (c != (UChar32)-1) {
 542             /* Since spaces are ignored during parsing, they are
 543              * emitted only for readability.  We emit one here
 544              * only if there isn't already one at the end of the
 545              * rule.
 546              */
 547             if (c == SPACE) {
 548                 int32_t len = rule.length();
 549                 if (len > 0 && rule.charAt(len-1) != c) {
 550                     rule.append(c);
 551                 }
 552             } else if (!escapeUnprintable || !ICU_Utility::escapeUnprintable(rule, c)) {
 553                 rule.append(c);
 554             }
 555         }
 556     }
 557
 558     // Escape ' and '\' and don't begin a quote just for them
 559     else if (quoteBuf.length() == 0 &&
 560              (c == APOSTROPHE || c == BACKSLASH)) {
 561         rule.append(BACKSLASH);
 562         rule.append(c);
 563     }
 564
 565     // Specials (printable ascii that isn't [0-9a-zA-Z]) and
 566     // whitespace need quoting.  Also append stuff to quotes if we are
 567     // building up a quoted substring already.
 568     else if (quoteBuf.length() > 0 ||
 569              (c >= 0x0021 && c <= 0x007E &&
 570               !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
 571                 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
 572                 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
 573              uprv_isRuleWhiteSpace(c)) {
 574         quoteBuf.append(c);
 575         // Double ' within a quote
 576         if (c == APOSTROPHE) {
 577             quoteBuf.append(c);
 578         }
 579     }
 580
 581     // Otherwise just append
 582     else {
 583         rule.append(c);
 584     }
 585 }
 586
 587 void ICU_Utility::appendToRule(UnicodeString& rule,
 588                                const UnicodeString& text,
 589                                UBool isLiteral,
 590                                UBool escapeUnprintable,
 591                                UnicodeString& quoteBuf) {
 592     for (int32_t i=0; i<text.length(); ++i) {
 593         appendToRule(rule, text[i], isLiteral, escapeUnprintable, quoteBuf);
 594     }
 595 }
 596
 597 /**
 598  * Given a matcher reference, which may be null, append its
 599  * pattern as a literal to the given rule.
 600  */
 601 void ICU_Utility::appendToRule(UnicodeString& rule,
 602                                const UnicodeMatcher* matcher,
 603                                UBool escapeUnprintable,
 604                                UnicodeString& quoteBuf) {
 605     if (matcher != NULL) {
 606         UnicodeString pat;
 607         appendToRule(rule, matcher->toPattern(pat, escapeUnprintable),
 608                      TRUE, escapeUnprintable, quoteBuf);
 609     }
 610 }
 611
 612 U_CAPI UBool U_EXPORT2
 613 uprv_isRuleWhiteSpace(UChar32 c) {
 614     /* "white space" in the sense of ICU rule parsers
 615        This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
 616        See UTR #31: http://www.unicode.org/reports/tr31/.
 617        U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
 618     */
 619     return (c >= 0x0009 && c <= 0x2029 &&
 620             (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
 621              c == 0x200E || c == 0x200F || c >= 0x2028));
 622 }
 623
 624 //eof