icuSources/common/util.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (c) 2001-2008, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   11/19/2001  aliu        Creation.
   8 **********************************************************************
   9 */
  10
  11 #include "util.h"
  12 #include "unicode/unimatch.h"
  13 #include "unicode/uniset.h"
  14
  15 // Define UChar constants using hex for EBCDIC compatibility
  16
  17 static const UChar BACKSLASH  = 0x005C; /*\*/
  18 static const UChar UPPER_U    = 0x0055; /*U*/
  19 static const UChar LOWER_U    = 0x0075; /*u*/
  20 static const UChar APOSTROPHE = 0x0027; // '\''
  21 static const UChar SPACE      = 0x0020; // ' '
  22
  23 // "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  24 static const UChar DIGITS[] = {
  25     48,49,50,51,52,53,54,55,56,57,
  26     65,66,67,68,69,70,71,72,73,74,
  27     75,76,77,78,79,80,81,82,83,84,
  28     85,86,87,88,89,90
  29 };
  30
  31 U_NAMESPACE_BEGIN
  32
  33 UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n,
  34                                      int32_t radix, int32_t minDigits) {
  35     if (radix < 2 || radix > 36) {
  36         // Bogus radix
  37         return result.append((UChar)63/*?*/);
  38     }
  39     // Handle negatives
  40     if (n < 0) {
  41         n = -n;
  42         result.append((UChar)45/*-*/);
  43     }
  44     // First determine the number of digits
  45     int32_t nn = n;
  46     int32_t r = 1;
  47     while (nn >= radix) {
  48         nn /= radix;
  49         r *= radix;
  50         --minDigits;
  51     }
  52     // Now generate the digits
  53     while (--minDigits > 0) {
  54         result.append(DIGITS[0]);
  55     }
  56     while (r > 0) {
  57         int32_t digit = n / r;
  58         result.append(DIGITS[digit]);
  59         n -= digit * r;
  60         r /= radix;
  61     }
  62     return result;
  63 }
  64
  65 /**
  66  * Return true if the character is NOT printable ASCII.
  67  */
  68 UBool ICU_Utility::isUnprintable(UChar32 c) {
  69     return !(c >= 0x20 && c <= 0x7E);
  70 }
  71
  72 /**
  73  * Escape unprintable characters using \uxxxx notation for U+0000 to
  74  * U+FFFF and \Uxxxxxxxx for U+10000 and above.  If the character is
  75  * printable ASCII, then do nothing and return FALSE.  Otherwise,
  76  * append the escaped notation and return TRUE.
  77  */
  78 UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {
  79     if (isUnprintable(c)) {
  80         result.append(BACKSLASH);
  81         if (c & ~0xFFFF) {
  82             result.append(UPPER_U);
  83             result.append(DIGITS[0xF&(c>>28)]);
  84             result.append(DIGITS[0xF&(c>>24)]);
  85             result.append(DIGITS[0xF&(c>>20)]);
  86             result.append(DIGITS[0xF&(c>>16)]);
  87         } else {
  88             result.append(LOWER_U);
  89         }
  90         result.append(DIGITS[0xF&(c>>12)]);
  91         result.append(DIGITS[0xF&(c>>8)]);
  92         result.append(DIGITS[0xF&(c>>4)]);
  93         result.append(DIGITS[0xF&c]);
  94         return TRUE;
  95     }
  96     return FALSE;
  97 }
  98
  99 /**
 100  * Returns the index of a character, ignoring quoted text.
 101  * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
 102  * found by a search for 'h'.
 103  */
 104 // FOR FUTURE USE.  DISABLE FOR NOW for coverage reasons.
 105 /*
 106 int32_t ICU_Utility::quotedIndexOf(const UnicodeString& text,
 107                                int32_t start, int32_t limit,
 108                                UChar charToFind) {
 109     for (int32_t i=start; i<limit; ++i) {
 110         UChar c = text.charAt(i);
 111         if (c == BACKSLASH) {
 112             ++i;
 113         } else if (c == APOSTROPHE) {
 114             while (++i < limit
 115                    && text.charAt(i) != APOSTROPHE) {}
 116         } else if (c == charToFind) {
 117             return i;
 118         }
 119     }
 120     return -1;
 121 }
 122 */
 123
 124 /**
 125  * Skip over a sequence of zero or more white space characters at pos.
 126  * @param advance if true, advance pos to the first non-white-space
 127  * character at or after pos, or str.length(), if there is none.
 128  * Otherwise leave pos unchanged.
 129  * @return the index of the first non-white-space character at or
 130  * after pos, or str.length(), if there is none.
 131  */
 132 int32_t ICU_Utility::skipWhitespace(const UnicodeString& str, int32_t& pos,
 133                                     UBool advance) {
 134     int32_t p = pos;
 135     while (p < str.length()) {
 136         UChar32 c = str.char32At(p);
 137         if (!uprv_isRuleWhiteSpace(c)) {
 138             break;
 139         }
 140         p += UTF_CHAR_LENGTH(c);
 141     }
 142     if (advance) {
 143         pos = p;
 144     }
 145     return p;
 146 }
 147
 148 /**
 149  * Skip over whitespace in a Replaceable.  Whitespace is defined by
 150  * uprv_isRuleWhiteSpace().  Skipping may be done in the forward or
 151  * reverse direction.  In either case, the leftmost index will be
 152  * inclusive, and the rightmost index will be exclusive.  That is,
 153  * given a range defined as [start, limit), the call
 154  * skipWhitespace(text, start, limit) will advance start past leading
 155  * whitespace, whereas the call skipWhitespace(text, limit, start),
 156  * will back up limit past trailing whitespace.
 157  * @param text the text to be analyzed
 158  * @param pos either the start or limit of a range of 'text', to skip
 159  * leading or trailing whitespace, respectively
 160  * @param stop either the limit or start of a range of 'text', to skip
 161  * leading or trailing whitespace, respectively
 162  * @return the new start or limit, depending on what was passed in to
 163  * 'pos'
 164  */
 165 //?FOR FUTURE USE.  DISABLE FOR NOW for coverage reasons.
 166 //?int32_t ICU_Utility::skipWhitespace(const Replaceable& text,
 167 //?                                    int32_t pos, int32_t stop) {
 168 //?    UChar32 c;
 169 //?    UBool isForward = (stop >= pos);
 170 //?
 171 //?    if (!isForward) {
 172 //?        --pos; // pos is a limit, so back up by one
 173 //?    }
 174 //?
 175 //?    while (pos != stop &&
 176 //?           uprv_isRuleWhiteSpace(c = text.char32At(pos))) {
 177 //?        if (isForward) {
 178 //?            pos += UTF_CHAR_LENGTH(c);
 179 //?        } else {
 180 //?            pos -= UTF_CHAR_LENGTH(c);
 181 //?        }
 182 //?    }
 183 //?
 184 //?    if (!isForward) {
 185 //?        ++pos; // make pos back into a limit
 186 //?    }
 187 //?
 188 //?    return pos;
 189 //?}
 190
 191 /**
 192  * Parse a single non-whitespace character 'ch', optionally
 193  * preceded by whitespace.
 194  * @param id the string to be parsed
 195  * @param pos INPUT-OUTPUT parameter.  On input, pos[0] is the
 196  * offset of the first character to be parsed.  On output, pos[0]
 197  * is the index after the last parsed character.  If the parse
 198  * fails, pos[0] will be unchanged.
 199  * @param ch the non-whitespace character to be parsed.
 200  * @return true if 'ch' is seen preceded by zero or more
 201  * whitespace characters.
 202  */
 203 UBool ICU_Utility::parseChar(const UnicodeString& id, int32_t& pos, UChar ch) {
 204     int32_t start = pos;
 205     skipWhitespace(id, pos, TRUE);
 206     if (pos == id.length() ||
 207         id.charAt(pos) != ch) {
 208         pos = start;
 209         return FALSE;
 210     }
 211     ++pos;
 212     return TRUE;
 213 }
 214
 215 /**
 216  * Parse a pattern string within the given Replaceable and a parsing
 217  * pattern.  Characters are matched literally and case-sensitively
 218  * except for the following special characters:
 219  *
 220  * ~  zero or more uprv_isRuleWhiteSpace chars
 221  *
 222  * If end of pattern is reached with all matches along the way,
 223  * pos is advanced to the first unparsed index and returned.
 224  * Otherwise -1 is returned.
 225  * @param pat pattern that controls parsing
 226  * @param text text to be parsed, starting at index
 227  * @param index offset to first character to parse
 228  * @param limit offset after last character to parse
 229  * @return index after last parsed character, or -1 on parse failure.
 230  */
 231 int32_t ICU_Utility::parsePattern(const UnicodeString& pat,
 232                                   const Replaceable& text,
 233                                   int32_t index,
 234                                   int32_t limit) {
 235     int32_t ipat = 0;
 236
 237     // empty pattern matches immediately
 238     if (ipat == pat.length()) {
 239         return index;
 240     }
 241
 242     UChar32 cpat = pat.char32At(ipat);
 243
 244     while (index < limit) {
 245         UChar32 c = text.char32At(index);
 246
 247         // parse \s*
 248         if (cpat == 126 /*~*/) {
 249             if (uprv_isRuleWhiteSpace(c)) {
 250                 index += UTF_CHAR_LENGTH(c);
 251                 continue;
 252             } else {
 253                 if (++ipat == pat.length()) {
 254                     return index; // success; c unparsed
 255                 }
 256                 // fall thru; process c again with next cpat
 257             }
 258         }
 259
 260         // parse literal
 261         else if (c == cpat) {
 262             index += UTF_CHAR_LENGTH(c);
 263             ipat += UTF_CHAR_LENGTH(cpat);
 264             if (ipat == pat.length()) {
 265                 return index; // success; c parsed
 266             }
 267             // fall thru; get next cpat
 268         }
 269
 270         // match failure of literal
 271         else {
 272             return -1;
 273         }
 274
 275         cpat = pat.char32At(ipat);
 276     }
 277
 278     return -1; // text ended before end of pat
 279 }
 280
 281 /**
 282  * Append a character to a rule that is being built up.  To flush
 283  * the quoteBuf to rule, make one final call with isLiteral == TRUE.
 284  * If there is no final character, pass in (UChar32)-1 as c.
 285  * @param rule the string to append the character to
 286  * @param c the character to append, or (UChar32)-1 if none.
 287  * @param isLiteral if true, then the given character should not be
 288  * quoted or escaped.  Usually this means it is a syntactic element
 289  * such as > or $
 290  * @param escapeUnprintable if true, then unprintable characters
 291  * should be escaped using \uxxxx or \Uxxxxxxxx.  These escapes will
 292  * appear outside of quotes.
 293  * @param quoteBuf a buffer which is used to build up quoted
 294  * substrings.  The caller should initially supply an empty buffer,
 295  * and thereafter should not modify the buffer.  The buffer should be
 296  * cleared out by, at the end, calling this method with a literal
 297  * character.
 298  */
 299 void ICU_Utility::appendToRule(UnicodeString& rule,
 300                                UChar32 c,
 301                                UBool isLiteral,
 302                                UBool escapeUnprintable,
 303                                UnicodeString& quoteBuf) {
 304     // If we are escaping unprintables, then escape them outside
 305     // quotes.  \u and \U are not recognized within quotes.  The same
 306     // logic applies to literals, but literals are never escaped.
 307     if (isLiteral ||
 308         (escapeUnprintable && ICU_Utility::isUnprintable(c))) {
 309         if (quoteBuf.length() > 0) {
 310             // We prefer backslash APOSTROPHE to double APOSTROPHE
 311             // (more readable, less similar to ") so if there are
 312             // double APOSTROPHEs at the ends, we pull them outside
 313             // of the quote.
 314
 315             // If the first thing in the quoteBuf is APOSTROPHE
 316             // (doubled) then pull it out.
 317             while (quoteBuf.length() >= 2 &&
 318                    quoteBuf.charAt(0) == APOSTROPHE &&
 319                    quoteBuf.charAt(1) == APOSTROPHE) {
 320                 rule.append(BACKSLASH).append(APOSTROPHE);
 321                 quoteBuf.remove(0, 2);
 322             }
 323             // If the last thing in the quoteBuf is APOSTROPHE
 324             // (doubled) then remove and count it and add it after.
 325             int32_t trailingCount = 0;
 326             while (quoteBuf.length() >= 2 &&
 327                    quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
 328                    quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
 329                 quoteBuf.truncate(quoteBuf.length()-2);
 330                 ++trailingCount;
 331             }
 332             if (quoteBuf.length() > 0) {
 333                 rule.append(APOSTROPHE);
 334                 rule.append(quoteBuf);
 335                 rule.append(APOSTROPHE);
 336                 quoteBuf.truncate(0);
 337             }
 338             while (trailingCount-- > 0) {
 339                 rule.append(BACKSLASH).append(APOSTROPHE);
 340             }
 341         }
 342         if (c != (UChar32)-1) {
 343             /* Since spaces are ignored during parsing, they are
 344              * emitted only for readability.  We emit one here
 345              * only if there isn't already one at the end of the
 346              * rule.
 347              */
 348             if (c == SPACE) {
 349                 int32_t len = rule.length();
 350                 if (len > 0 && rule.charAt(len-1) != c) {
 351                     rule.append(c);
 352                 }
 353             } else if (!escapeUnprintable || !ICU_Utility::escapeUnprintable(rule, c)) {
 354                 rule.append(c);
 355             }
 356         }
 357     }
 358
 359     // Escape ' and '\' and don't begin a quote just for them
 360     else if (quoteBuf.length() == 0 &&
 361              (c == APOSTROPHE || c == BACKSLASH)) {
 362         rule.append(BACKSLASH);
 363         rule.append(c);
 364     }
 365
 366     // Specials (printable ascii that isn't [0-9a-zA-Z]) and
 367     // whitespace need quoting.  Also append stuff to quotes if we are
 368     // building up a quoted substring already.
 369     else if (quoteBuf.length() > 0 ||
 370              (c >= 0x0021 && c <= 0x007E &&
 371               !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
 372                 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
 373                 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
 374              uprv_isRuleWhiteSpace(c)) {
 375         quoteBuf.append(c);
 376         // Double ' within a quote
 377         if (c == APOSTROPHE) {
 378             quoteBuf.append(c);
 379         }
 380     }
 381
 382     // Otherwise just append
 383     else {
 384         rule.append(c);
 385     }
 386 }
 387
 388 void ICU_Utility::appendToRule(UnicodeString& rule,
 389                                const UnicodeString& text,
 390                                UBool isLiteral,
 391                                UBool escapeUnprintable,
 392                                UnicodeString& quoteBuf) {
 393     for (int32_t i=0; i<text.length(); ++i) {
 394         appendToRule(rule, text[i], isLiteral, escapeUnprintable, quoteBuf);
 395     }
 396 }
 397
 398 /**
 399  * Given a matcher reference, which may be null, append its
 400  * pattern as a literal to the given rule.
 401  */
 402 void ICU_Utility::appendToRule(UnicodeString& rule,
 403                                const UnicodeMatcher* matcher,
 404                                UBool escapeUnprintable,
 405                                UnicodeString& quoteBuf) {
 406     if (matcher != NULL) {
 407         UnicodeString pat;
 408         appendToRule(rule, matcher->toPattern(pat, escapeUnprintable),
 409                      TRUE, escapeUnprintable, quoteBuf);
 410     }
 411 }
 412
 413 U_NAMESPACE_END
 414
 415 U_CAPI UBool U_EXPORT2
 416 uprv_isRuleWhiteSpace(UChar32 c) {
 417     /* "white space" in the sense of ICU rule parsers
 418        This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
 419        See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
 420        U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
 421        Equivalent to test for Pattern_White_Space Unicode property.
 422     */
 423     return (c >= 0x0009 && c <= 0x2029 &&
 424             (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
 425              c == 0x200E || c == 0x200F || c >= 0x2028));
 426 }
 427
 428 U_CAPI U_NAMESPACE_QUALIFIER UnicodeSet* U_EXPORT2
 429 uprv_openRuleWhiteSpaceSet(UErrorCode* ec) {
 430     if(U_FAILURE(*ec)) {
 431         return NULL;
 432     }
 433     // create a set with the Pattern_White_Space characters,
 434     // without a pattern for fewer code dependencies
 435     U_NAMESPACE_QUALIFIER UnicodeSet *set=new U_NAMESPACE_QUALIFIER UnicodeSet(9, 0xd);
 436     // Check for new failure.
 437     if (set == NULL) {
 438         *ec = U_MEMORY_ALLOCATION_ERROR;
 439         return NULL;
 440     }
 441     set->UnicodeSet::add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029);
 442     return set;
 443 }
 444
 445 //eof