icuSources/common/util.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (c) 2001-2006, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   11/19/2001  aliu        Creation.
   8 **********************************************************************
   9 */
  10
  11 #include "util.h"
  12 #include "unicode/unimatch.h"
  13
  14 // Define UChar constants using hex for EBCDIC compatibility
  15
  16 static const UChar BACKSLASH  = 0x005C; /*\*/
  17 static const UChar UPPER_U    = 0x0055; /*U*/
  18 static const UChar LOWER_U    = 0x0075; /*u*/
  19 static const UChar APOSTROPHE = 0x0027; // '\''
  20 static const UChar SPACE      = 0x0020; // ' '
  21
  22 // "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  23 static const UChar DIGITS[] = {
  24     48,49,50,51,52,53,54,55,56,57,
  25     65,66,67,68,69,70,71,72,73,74,
  26     75,76,77,78,79,80,81,82,83,84,
  27     85,86,87,88,89,90
  28 };
  29
  30 U_NAMESPACE_BEGIN
  31
  32 UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n,
  33                                      int32_t radix, int32_t minDigits) {
  34     if (radix < 2 || radix > 36) {
  35         // Bogus radix
  36         return result.append((UChar)63/*?*/);
  37     }
  38     // Handle negatives
  39     if (n < 0) {
  40         n = -n;
  41         result.append((UChar)45/*-*/);
  42     }
  43     // First determine the number of digits
  44     int32_t nn = n;
  45     int32_t r = 1;
  46     while (nn >= radix) {
  47         nn /= radix;
  48         r *= radix;
  49         --minDigits;
  50     }
  51     // Now generate the digits
  52     while (--minDigits > 0) {
  53         result.append(DIGITS[0]);
  54     }
  55     while (r > 0) {
  56         int32_t digit = n / r;
  57         result.append(DIGITS[digit]);
  58         n -= digit * r;
  59         r /= radix;
  60     }
  61     return result;
  62 }
  63
  64 static const UChar HEX[16] = {48,49,50,51,52,53,54,55,  // 0-7
  65                               56,57,65,66,67,68,69,70}; // 8-9 A-F
  66
  67 /**
  68  * Return true if the character is NOT printable ASCII.
  69  */
  70 UBool ICU_Utility::isUnprintable(UChar32 c) {
  71     return !(c >= 0x20 && c <= 0x7E);
  72 }
  73
  74 /**
  75  * Escape unprintable characters using \uxxxx notation for U+0000 to
  76  * U+FFFF and \Uxxxxxxxx for U+10000 and above.  If the character is
  77  * printable ASCII, then do nothing and return FALSE.  Otherwise,
  78  * append the escaped notation and return TRUE.
  79  */
  80 UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {
  81     if (isUnprintable(c)) {
  82         result.append(BACKSLASH);
  83         if (c & ~0xFFFF) {
  84             result.append(UPPER_U);
  85             result.append(HEX[0xF&(c>>28)]);
  86             result.append(HEX[0xF&(c>>24)]);
  87             result.append(HEX[0xF&(c>>20)]);
  88             result.append(HEX[0xF&(c>>16)]);
  89         } else {
  90             result.append(LOWER_U);
  91         }
  92         result.append(HEX[0xF&(c>>12)]);
  93         result.append(HEX[0xF&(c>>8)]);
  94         result.append(HEX[0xF&(c>>4)]);
  95         result.append(HEX[0xF&c]);
  96         return TRUE;
  97     }
  98     return FALSE;
  99 }
 100
 101 /**
 102  * Returns the index of a character, ignoring quoted text.
 103  * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
 104  * found by a search for 'h'.
 105  */
 106 // FOR FUTURE USE.  DISABLE FOR NOW for coverage reasons.
 107 /*
 108 int32_t ICU_Utility::quotedIndexOf(const UnicodeString& text,
 109                                int32_t start, int32_t limit,
 110                                UChar charToFind) {
 111     for (int32_t i=start; i<limit; ++i) {
 112         UChar c = text.charAt(i);
 113         if (c == BACKSLASH) {
 114             ++i;
 115         } else if (c == APOSTROPHE) {
 116             while (++i < limit
 117                    && text.charAt(i) != APOSTROPHE) {}
 118         } else if (c == charToFind) {
 119             return i;
 120         }
 121     }
 122     return -1;
 123 }
 124 */
 125
 126 /**
 127  * Skip over a sequence of zero or more white space characters at pos.
 128  * @param advance if true, advance pos to the first non-white-space
 129  * character at or after pos, or str.length(), if there is none.
 130  * Otherwise leave pos unchanged.
 131  * @return the index of the first non-white-space character at or
 132  * after pos, or str.length(), if there is none.
 133  */
 134 int32_t ICU_Utility::skipWhitespace(const UnicodeString& str, int32_t& pos,
 135                                     UBool advance) {
 136     int32_t p = pos;
 137     while (p < str.length()) {
 138         UChar32 c = str.char32At(p);
 139         if (!uprv_isRuleWhiteSpace(c)) {
 140             break;
 141         }
 142         p += UTF_CHAR_LENGTH(c);
 143     }
 144     if (advance) {
 145         pos = p;
 146     }
 147     return p;
 148 }
 149
 150 /**
 151  * Skip over whitespace in a Replaceable.  Whitespace is defined by
 152  * uprv_isRuleWhiteSpace().  Skipping may be done in the forward or
 153  * reverse direction.  In either case, the leftmost index will be
 154  * inclusive, and the rightmost index will be exclusive.  That is,
 155  * given a range defined as [start, limit), the call
 156  * skipWhitespace(text, start, limit) will advance start past leading
 157  * whitespace, whereas the call skipWhitespace(text, limit, start),
 158  * will back up limit past trailing whitespace.
 159  * @param text the text to be analyzed
 160  * @param pos either the start or limit of a range of 'text', to skip
 161  * leading or trailing whitespace, respectively
 162  * @param stop either the limit or start of a range of 'text', to skip
 163  * leading or trailing whitespace, respectively
 164  * @return the new start or limit, depending on what was passed in to
 165  * 'pos'
 166  */
 167 //?FOR FUTURE USE.  DISABLE FOR NOW for coverage reasons.
 168 //?int32_t ICU_Utility::skipWhitespace(const Replaceable& text,
 169 //?                                    int32_t pos, int32_t stop) {
 170 //?    UChar32 c;
 171 //?    UBool isForward = (stop >= pos);
 172 //?
 173 //?    if (!isForward) {
 174 //?        --pos; // pos is a limit, so back up by one
 175 //?    }
 176 //?
 177 //?    while (pos != stop &&
 178 //?           uprv_isRuleWhiteSpace(c = text.char32At(pos))) {
 179 //?        if (isForward) {
 180 //?            pos += UTF_CHAR_LENGTH(c);
 181 //?        } else {
 182 //?            pos -= UTF_CHAR_LENGTH(c);
 183 //?        }
 184 //?    }
 185 //?
 186 //?    if (!isForward) {
 187 //?        ++pos; // make pos back into a limit
 188 //?    }
 189 //?
 190 //?    return pos;
 191 //?}
 192
 193 /**
 194  * Parse a single non-whitespace character 'ch', optionally
 195  * preceded by whitespace.
 196  * @param id the string to be parsed
 197  * @param pos INPUT-OUTPUT parameter.  On input, pos[0] is the
 198  * offset of the first character to be parsed.  On output, pos[0]
 199  * is the index after the last parsed character.  If the parse
 200  * fails, pos[0] will be unchanged.
 201  * @param ch the non-whitespace character to be parsed.
 202  * @return true if 'ch' is seen preceded by zero or more
 203  * whitespace characters.
 204  */
 205 UBool ICU_Utility::parseChar(const UnicodeString& id, int32_t& pos, UChar ch) {
 206     int32_t start = pos;
 207     skipWhitespace(id, pos, TRUE);
 208     if (pos == id.length() ||
 209         id.charAt(pos) != ch) {
 210         pos = start;
 211         return FALSE;
 212     }
 213     ++pos;
 214     return TRUE;
 215 }
 216
 217 /**
 218  * Parse a pattern string within the given Replaceable and a parsing
 219  * pattern.  Characters are matched literally and case-sensitively
 220  * except for the following special characters:
 221  *
 222  * ~  zero or more uprv_isRuleWhiteSpace chars
 223  *
 224  * If end of pattern is reached with all matches along the way,
 225  * pos is advanced to the first unparsed index and returned.
 226  * Otherwise -1 is returned.
 227  * @param pat pattern that controls parsing
 228  * @param text text to be parsed, starting at index
 229  * @param index offset to first character to parse
 230  * @param limit offset after last character to parse
 231  * @return index after last parsed character, or -1 on parse failure.
 232  */
 233 int32_t ICU_Utility::parsePattern(const UnicodeString& pat,
 234                                   const Replaceable& text,
 235                                   int32_t index,
 236                                   int32_t limit) {
 237     int32_t ipat = 0;
 238
 239     // empty pattern matches immediately
 240     if (ipat == pat.length()) {
 241         return index;
 242     }
 243
 244     UChar32 cpat = pat.char32At(ipat);
 245
 246     while (index < limit) {
 247         UChar32 c = text.char32At(index);
 248
 249         // parse \s*
 250         if (cpat == 126 /*~*/) {
 251             if (uprv_isRuleWhiteSpace(c)) {
 252                 index += UTF_CHAR_LENGTH(c);
 253                 continue;
 254             } else {
 255                 if (++ipat == pat.length()) {
 256                     return index; // success; c unparsed
 257                 }
 258                 // fall thru; process c again with next cpat
 259             }
 260         }
 261
 262         // parse literal
 263         else if (c == cpat) {
 264             index += UTF_CHAR_LENGTH(c);
 265             ipat += UTF_CHAR_LENGTH(cpat);
 266             if (ipat == pat.length()) {
 267                 return index; // success; c parsed
 268             }
 269             // fall thru; get next cpat
 270         }
 271
 272         // match failure of literal
 273         else {
 274             return -1;
 275         }
 276
 277         cpat = pat.char32At(ipat);
 278     }
 279
 280     return -1; // text ended before end of pat
 281 }
 282
 283 /**
 284  * Append a character to a rule that is being built up.  To flush
 285  * the quoteBuf to rule, make one final call with isLiteral == TRUE.
 286  * If there is no final character, pass in (UChar32)-1 as c.
 287  * @param rule the string to append the character to
 288  * @param c the character to append, or (UChar32)-1 if none.
 289  * @param isLiteral if true, then the given character should not be
 290  * quoted or escaped.  Usually this means it is a syntactic element
 291  * such as > or $
 292  * @param escapeUnprintable if true, then unprintable characters
 293  * should be escaped using \uxxxx or \Uxxxxxxxx.  These escapes will
 294  * appear outside of quotes.
 295  * @param quoteBuf a buffer which is used to build up quoted
 296  * substrings.  The caller should initially supply an empty buffer,
 297  * and thereafter should not modify the buffer.  The buffer should be
 298  * cleared out by, at the end, calling this method with a literal
 299  * character.
 300  */
 301 void ICU_Utility::appendToRule(UnicodeString& rule,
 302                                UChar32 c,
 303                                UBool isLiteral,
 304                                UBool escapeUnprintable,
 305                                UnicodeString& quoteBuf) {
 306     // If we are escaping unprintables, then escape them outside
 307     // quotes.  \u and \U are not recognized within quotes.  The same
 308     // logic applies to literals, but literals are never escaped.
 309     if (isLiteral ||
 310         (escapeUnprintable && ICU_Utility::isUnprintable(c))) {
 311         if (quoteBuf.length() > 0) {
 312             // We prefer backslash APOSTROPHE to double APOSTROPHE
 313             // (more readable, less similar to ") so if there are
 314             // double APOSTROPHEs at the ends, we pull them outside
 315             // of the quote.
 316
 317             // If the first thing in the quoteBuf is APOSTROPHE
 318             // (doubled) then pull it out.
 319             while (quoteBuf.length() >= 2 &&
 320                    quoteBuf.charAt(0) == APOSTROPHE &&
 321                    quoteBuf.charAt(1) == APOSTROPHE) {
 322                 rule.append(BACKSLASH).append(APOSTROPHE);
 323                 quoteBuf.remove(0, 2);
 324             }
 325             // If the last thing in the quoteBuf is APOSTROPHE
 326             // (doubled) then remove and count it and add it after.
 327             int32_t trailingCount = 0;
 328             while (quoteBuf.length() >= 2 &&
 329                    quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
 330                    quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
 331                 quoteBuf.truncate(quoteBuf.length()-2);
 332                 ++trailingCount;
 333             }
 334             if (quoteBuf.length() > 0) {
 335                 rule.append(APOSTROPHE);
 336                 rule.append(quoteBuf);
 337                 rule.append(APOSTROPHE);
 338                 quoteBuf.truncate(0);
 339             }
 340             while (trailingCount-- > 0) {
 341                 rule.append(BACKSLASH).append(APOSTROPHE);
 342             }
 343         }
 344         if (c != (UChar32)-1) {
 345             /* Since spaces are ignored during parsing, they are
 346              * emitted only for readability.  We emit one here
 347              * only if there isn't already one at the end of the
 348              * rule.
 349              */
 350             if (c == SPACE) {
 351                 int32_t len = rule.length();
 352                 if (len > 0 && rule.charAt(len-1) != c) {
 353                     rule.append(c);
 354                 }
 355             } else if (!escapeUnprintable || !ICU_Utility::escapeUnprintable(rule, c)) {
 356                 rule.append(c);
 357             }
 358         }
 359     }
 360
 361     // Escape ' and '\' and don't begin a quote just for them
 362     else if (quoteBuf.length() == 0 &&
 363              (c == APOSTROPHE || c == BACKSLASH)) {
 364         rule.append(BACKSLASH);
 365         rule.append(c);
 366     }
 367
 368     // Specials (printable ascii that isn't [0-9a-zA-Z]) and
 369     // whitespace need quoting.  Also append stuff to quotes if we are
 370     // building up a quoted substring already.
 371     else if (quoteBuf.length() > 0 ||
 372              (c >= 0x0021 && c <= 0x007E &&
 373               !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
 374                 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
 375                 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
 376              uprv_isRuleWhiteSpace(c)) {
 377         quoteBuf.append(c);
 378         // Double ' within a quote
 379         if (c == APOSTROPHE) {
 380             quoteBuf.append(c);
 381         }
 382     }
 383
 384     // Otherwise just append
 385     else {
 386         rule.append(c);
 387     }
 388 }
 389
 390 void ICU_Utility::appendToRule(UnicodeString& rule,
 391                                const UnicodeString& text,
 392                                UBool isLiteral,
 393                                UBool escapeUnprintable,
 394                                UnicodeString& quoteBuf) {
 395     for (int32_t i=0; i<text.length(); ++i) {
 396         appendToRule(rule, text[i], isLiteral, escapeUnprintable, quoteBuf);
 397     }
 398 }
 399
 400 /**
 401  * Given a matcher reference, which may be null, append its
 402  * pattern as a literal to the given rule.
 403  */
 404 void ICU_Utility::appendToRule(UnicodeString& rule,
 405                                const UnicodeMatcher* matcher,
 406                                UBool escapeUnprintable,
 407                                UnicodeString& quoteBuf) {
 408     if (matcher != NULL) {
 409         UnicodeString pat;
 410         appendToRule(rule, matcher->toPattern(pat, escapeUnprintable),
 411                      TRUE, escapeUnprintable, quoteBuf);
 412     }
 413 }
 414
 415 U_NAMESPACE_END
 416
 417 U_CAPI UBool U_EXPORT2
 418 uprv_isRuleWhiteSpace(UChar32 c) {
 419     /* "white space" in the sense of ICU rule parsers
 420        This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
 421        See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
 422        U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
 423        Equivalent to test for Pattern_White_Space Unicode property.
 424     */
 425     return (c >= 0x0009 && c <= 0x2029 &&
 426             (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
 427              c == 0x200E || c == 0x200F || c >= 0x2028));
 428 }
 429
 430 //eof