]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/util.cpp
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / common / util.cpp
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
3* Copyright (c) 2001, International Business Machines
4* Corporation and others. All Rights Reserved.
5**********************************************************************
6* Date Name Description
7* 11/19/2001 aliu Creation.
8**********************************************************************
9*/
10
11#include "util.h"
12#include "unicode/uchar.h"
13#include "unicode/unimatch.h"
14#include "uprops.h"
15
16// Define UChar constants using hex for EBCDIC compatibility
17
18static const UChar BACKSLASH = 0x005C; /*\*/
19static const UChar UPPER_U = 0x0055; /*U*/
20static const UChar LOWER_U = 0x0075; /*u*/
21static const UChar APOSTROPHE = 0x0027; // '\''
22static const UChar SPACE = 0x0020; // ' '
23
24// "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
25static const UChar DIGITS[] = {
26 48,49,50,51,52,53,54,55,56,57,
27 65,66,67,68,69,70,71,72,73,74,
28 75,76,77,78,79,80,81,82,83,84,
29 85,86,87,88,89,90
30};
31
32UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n,
33 int32_t radix, int32_t minDigits) {
34 if (radix < 2 || radix > 36) {
35 // Bogus radix
36 return result.append((UChar)63/*?*/);
37 }
38 // Handle negatives
39 if (n < 0) {
40 n = -n;
41 result.append((UChar)45/*-*/);
42 }
43 // First determine the number of digits
44 int32_t nn = n;
45 int32_t r = 1;
46 while (nn >= radix) {
47 nn /= radix;
48 r *= radix;
49 --minDigits;
50 }
51 // Now generate the digits
52 while (--minDigits > 0) {
53 result.append(DIGITS[0]);
54 }
55 while (r > 0) {
56 int32_t digit = n / r;
57 result.append(DIGITS[digit]);
58 n -= digit * r;
59 r /= radix;
60 }
61 return result;
62}
63
64static const UChar HEX[16] = {48,49,50,51,52,53,54,55, // 0-7
65 56,57,65,66,67,68,69,70}; // 8-9 A-F
66
67/**
68 * Return true if the character is NOT printable ASCII.
69 */
70UBool ICU_Utility::isUnprintable(UChar32 c) {
71 return !(c == 0x0A || (c >= 0x20 && c <= 0x7E));
72}
73
74/**
75 * Escape unprintable characters using \uxxxx notation for U+0000 to
76 * U+FFFF and \Uxxxxxxxx for U+10000 and above. If the character is
77 * printable ASCII, then do nothing and return FALSE. Otherwise,
78 * append the escaped notation and return TRUE.
79 */
80UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {
81 if (isUnprintable(c)) {
82 result.append(BACKSLASH);
83 if (c & ~0xFFFF) {
84 result.append(UPPER_U);
85 result.append(HEX[0xF&(c>>28)]);
86 result.append(HEX[0xF&(c>>24)]);
87 result.append(HEX[0xF&(c>>20)]);
88 result.append(HEX[0xF&(c>>16)]);
89 } else {
90 result.append(LOWER_U);
91 }
92 result.append(HEX[0xF&(c>>12)]);
93 result.append(HEX[0xF&(c>>8)]);
94 result.append(HEX[0xF&(c>>4)]);
95 result.append(HEX[0xF&c]);
96 return TRUE;
97 }
98 return FALSE;
99}
100
101/**
102 * Returns the index of a character, ignoring quoted text.
103 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
104 * found by a search for 'h'.
105 */
106int32_t ICU_Utility::quotedIndexOf(const UnicodeString& text,
107 int32_t start, int32_t limit,
108 UChar charToFind) {
109 for (int32_t i=start; i<limit; ++i) {
110 UChar c = text.charAt(i);
111 if (c == BACKSLASH) {
112 ++i;
113 } else if (c == APOSTROPHE) {
114 while (++i < limit
115 && text.charAt(i) != APOSTROPHE) {}
116 } else if (c == charToFind) {
117 return i;
118 }
119 }
120 return -1;
121}
122
123/**
124 * Skip over a sequence of zero or more white space characters at pos.
125 * @param advance if true, advance pos to the first non-white-space
126 * character at or after pos, or str.length(), if there is none.
127 * Otherwise leave pos unchanged.
128 * @return the index of the first non-white-space character at or
129 * after pos, or str.length(), if there is none.
130 */
131int32_t ICU_Utility::skipWhitespace(const UnicodeString& str, int32_t& pos,
132 UBool advance) {
133 int32_t p = pos;
134 while (p < str.length()) {
135 UChar32 c = str.char32At(p);
136 if (!uprv_isRuleWhiteSpace(c)) {
137 break;
138 }
139 p += UTF_CHAR_LENGTH(c);
140 }
141 if (advance) {
142 pos = p;
143 }
144 return p;
145}
146
147/**
148 * Skip over whitespace in a Replaceable. Whitespace is defined by
149 * uprv_isRuleWhiteSpace(). Skipping may be done in the forward or
150 * reverse direction. In either case, the leftmost index will be
151 * inclusive, and the rightmost index will be exclusive. That is,
152 * given a range defined as [start, limit), the call
153 * skipWhitespace(text, start, limit) will advance start past leading
154 * whitespace, whereas the call skipWhitespace(text, limit, start),
155 * will back up limit past trailing whitespace.
156 * @param text the text to be analyzed
157 * @param pos either the start or limit of a range of 'text', to skip
158 * leading or trailing whitespace, respectively
159 * @param stop either the limit or start of a range of 'text', to skip
160 * leading or trailing whitespace, respectively
161 * @return the new start or limit, depending on what was passed in to
162 * 'pos'
163 */
164//?FOR FUTURE USE. DISABLE FOR NOW for coverage reasons.
165//?int32_t ICU_Utility::skipWhitespace(const Replaceable& text,
166//? int32_t pos, int32_t stop) {
167//? UChar32 c;
168//? UBool isForward = (stop >= pos);
169//?
170//? if (!isForward) {
171//? --pos; // pos is a limit, so back up by one
172//? }
173//?
174//? while (pos != stop &&
175//? uprv_isRuleWhiteSpace(c = text.char32At(pos))) {
176//? if (isForward) {
177//? pos += UTF_CHAR_LENGTH(c);
178//? } else {
179//? pos -= UTF_CHAR_LENGTH(c);
180//? }
181//? }
182//?
183//? if (!isForward) {
184//? ++pos; // make pos back into a limit
185//? }
186//?
187//? return pos;
188//?}
189
190/**
191 * Parse a single non-whitespace character 'ch', optionally
192 * preceded by whitespace.
193 * @param id the string to be parsed
194 * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the
195 * offset of the first character to be parsed. On output, pos[0]
196 * is the index after the last parsed character. If the parse
197 * fails, pos[0] will be unchanged.
198 * @param ch the non-whitespace character to be parsed.
199 * @return true if 'ch' is seen preceded by zero or more
200 * whitespace characters.
201 */
202UBool ICU_Utility::parseChar(const UnicodeString& id, int32_t& pos, UChar ch) {
203 int32_t start = pos;
204 skipWhitespace(id, pos, TRUE);
205 if (pos == id.length() ||
206 id.charAt(pos) != ch) {
207 pos = start;
208 return FALSE;
209 }
210 ++pos;
211 return TRUE;
212}
213
214/**
215 * Parse a pattern string starting at offset pos. Keywords are
216 * matched case-insensitively. Spaces may be skipped and may be
217 * optional or required. Integer values may be parsed, and if
218 * they are, they will be returned in the given array. If
219 * successful, the offset of the next non-space character is
220 * returned. On failure, -1 is returned.
221 * @param pattern must only contain lowercase characters, which
222 * will match their uppercase equivalents as well. A space
223 * character matches one or more required spaces. A '~' character
224 * matches zero or more optional spaces. A '#' character matches
225 * an integer and stores it in parsedInts, which the caller must
226 * ensure has enough capacity.
227 * @param parsedInts array to receive parsed integers. Caller
228 * must ensure that parsedInts.length is >= the number of '#'
229 * signs in 'pattern'.
230 * @return the position after the last character parsed, or -1 if
231 * the parse failed
232 */
233int32_t ICU_Utility::parsePattern(const UnicodeString& rule, int32_t pos, int32_t limit,
234 const UnicodeString& pattern, int32_t* parsedInts) {
235 // TODO Update this to handle surrogates
236 int32_t p;
237 int32_t intCount = 0; // number of integers parsed
238 for (int32_t i=0; i<pattern.length(); ++i) {
239 UChar cpat = pattern.charAt(i);
240 UChar c;
241 switch (cpat) {
242 case 32 /*' '*/:
243 if (pos >= limit) {
244 return -1;
245 }
246 c = rule.charAt(pos++);
247 if (!uprv_isRuleWhiteSpace(c)) {
248 return -1;
249 }
250 // FALL THROUGH to skipWhitespace
251 case 126 /*'~'*/:
252 pos = skipWhitespace(rule, pos);
253 break;
254 case 35 /*'#'*/:
255 p = pos;
256 parsedInts[intCount++] = parseInteger(rule, p, limit);
257 if (p == pos) {
258 // Syntax error; failed to parse integer
259 return -1;
260 }
261 pos = p;
262 break;
263 default:
264 if (pos >= limit) {
265 return -1;
266 }
267 c = (UChar) u_tolower(rule.charAt(pos++));
268 if (c != cpat) {
269 return -1;
270 }
271 break;
272 }
273 }
274 return pos;
275}
276
277/**
278 * Parse a pattern string within the given Replaceable and a parsing
279 * pattern. Characters are matched literally and case-sensitively
280 * except for the following special characters:
281 *
282 * ~ zero or more uprv_isRuleWhiteSpace chars
283 *
284 * If end of pattern is reached with all matches along the way,
285 * pos is advanced to the first unparsed index and returned.
286 * Otherwise -1 is returned.
287 * @param pat pattern that controls parsing
288 * @param text text to be parsed, starting at index
289 * @param index offset to first character to parse
290 * @param limit offset after last character to parse
291 * @return index after last parsed character, or -1 on parse failure.
292 */
293int32_t ICU_Utility::parsePattern(const UnicodeString& pat,
294 const Replaceable& text,
295 int32_t index,
296 int32_t limit) {
297 int32_t ipat = 0;
298
299 // empty pattern matches immediately
300 if (ipat == pat.length()) {
301 return index;
302 }
303
304 UChar32 cpat = pat.char32At(ipat);
305
306 while (index < limit) {
307 UChar32 c = text.char32At(index);
308
309 // parse \s*
310 if (cpat == 126 /*~*/) {
311 if (uprv_isRuleWhiteSpace(c)) {
312 index += UTF_CHAR_LENGTH(c);
313 continue;
314 } else {
315 if (++ipat == pat.length()) {
316 return index; // success; c unparsed
317 }
318 // fall thru; process c again with next cpat
319 }
320 }
321
322 // parse literal
323 else if (c == cpat) {
324 index += UTF_CHAR_LENGTH(c);
325 ipat += UTF_CHAR_LENGTH(cpat);
326 if (ipat == pat.length()) {
327 return index; // success; c parsed
328 }
329 // fall thru; get next cpat
330 }
331
332 // match failure of literal
333 else {
334 return -1;
335 }
336
337 cpat = pat.char32At(ipat);
338 }
339
340 return -1; // text ended before end of pat
341}
342
343static const UChar ZERO_X[] = {48, 120, 0}; // "0x"
344
345/**
346 * Parse an integer at pos, either of the form \d+ or of the form
347 * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
348 * or octal format.
349 * @param pos INPUT-OUTPUT parameter. On input, the first
350 * character to parse. On output, the character after the last
351 * parsed character.
352 */
353int32_t ICU_Utility::parseInteger(const UnicodeString& rule, int32_t& pos, int32_t limit) {
354 int32_t count = 0;
355 int32_t value = 0;
356 int32_t p = pos;
357 int8_t radix = 10;
358
359 if (0 == rule.caseCompare(p, 2, ZERO_X, U_FOLD_CASE_DEFAULT)) {
360 p += 2;
361 radix = 16;
362 } else if (p < limit && rule.charAt(p) == 48 /*0*/) {
363 p++;
364 count = 1;
365 radix = 8;
366 }
367
368 while (p < limit) {
369 int32_t d = u_digit(rule.charAt(p++), radix);
370 if (d < 0) {
371 --p;
372 break;
373 }
374 ++count;
375 int32_t v = (value * radix) + d;
376 if (v <= value) {
377 // If there are too many input digits, at some point
378 // the value will go negative, e.g., if we have seen
379 // "0x8000000" already and there is another '0', when
380 // we parse the next 0 the value will go negative.
381 return 0;
382 }
383 value = v;
384 }
385 if (count > 0) {
386 pos = p;
387 }
388 return value;
389}
390
391/**
392 * Parse a Unicode identifier from the given string at the given
393 * position. Return the identifier, or an empty string if there
394 * is no identifier.
395 * @param str the string to parse
396 * @param pos INPUT-OUPUT parameter. On INPUT, pos is the
397 * first character to examine. It must be less than str.length(),
398 * and it must not point to a whitespace character. That is, must
399 * have pos < str.length() and
400 * !uprv_isRuleWhiteSpace(str.char32At(pos)). On
401 * OUTPUT, the position after the last parsed character.
402 * @return the Unicode identifier, or an empty string if there is
403 * no valid identifier at pos.
404 */
405UnicodeString ICU_Utility::parseUnicodeIdentifier(const UnicodeString& str, int32_t& pos) {
406 // assert(pos < str.length());
407 // assert(!uprv_isRuleWhiteSpace(str.char32At(pos)));
408 UnicodeString buf;
409 int p = pos;
410 while (p < str.length()) {
411 UChar32 ch = str.char32At(p);
412 if (buf.length() == 0) {
413 if (u_isIDStart(ch)) {
414 buf.append(ch);
415 } else {
416 buf.truncate(0);
417 return buf;
418 }
419 } else {
420 if (u_isIDPart(ch)) {
421 buf.append(ch);
422 } else {
423 break;
424 }
425 }
426 p += UTF_CHAR_LENGTH(ch);
427 }
428 pos = p;
429 return buf;
430}
431
432/**
433 * Parse an unsigned 31-bit integer at the given offset. Use
434 * UCharacter.digit() to parse individual characters into digits.
435 * @param text the text to be parsed
436 * @param pos INPUT-OUTPUT parameter. On entry, pos[0] is the
437 * offset within text at which to start parsing; it should point
438 * to a valid digit. On exit, pos[0] is the offset after the last
439 * parsed character. If the parse failed, it will be unchanged on
440 * exit. Must be >= 0 on entry.
441 * @param radix the radix in which to parse; must be >= 2 and <=
442 * 36.
443 * @return a non-negative parsed number, or -1 upon parse failure.
444 * Parse fails if there are no digits, that is, if pos[0] does not
445 * point to a valid digit on entry, or if the number to be parsed
446 * does not fit into a 31-bit unsigned integer.
447 */
448int32_t ICU_Utility::parseNumber(const UnicodeString& text,
449 int32_t& pos, int8_t radix) {
450 // assert(pos[0] >= 0);
451 // assert(radix >= 2);
452 // assert(radix <= 36);
453 int32_t n = 0;
454 int32_t p = pos;
455 while (p < text.length()) {
456 UChar32 ch = text.char32At(p);
457 int32_t d = u_digit(ch, radix);
458 if (d < 0) {
459 break;
460 }
461 n = radix*n + d;
462 // ASSUME that when a 32-bit integer overflows it becomes
463 // negative. E.g., 214748364 * 10 + 8 => negative value.
464 if (n < 0) {
465 return -1;
466 }
467 ++p;
468 }
469 if (p == pos) {
470 return -1;
471 }
472 pos = p;
473 return n;
474}
475
476/**
477 * Append a character to a rule that is being built up. To flush
478 * the quoteBuf to rule, make one final call with isLiteral == TRUE.
479 * If there is no final character, pass in (UChar32)-1 as c.
480 * @param rule the string to append the character to
481 * @param c the character to append, or (UChar32)-1 if none.
482 * @param isLiteral if true, then the given character should not be
483 * quoted or escaped. Usually this means it is a syntactic element
484 * such as > or $
485 * @param escapeUnprintable if true, then unprintable characters
486 * should be escaped using \uxxxx or \Uxxxxxxxx. These escapes will
487 * appear outside of quotes.
488 * @param quoteBuf a buffer which is used to build up quoted
489 * substrings. The caller should initially supply an empty buffer,
490 * and thereafter should not modify the buffer. The buffer should be
491 * cleared out by, at the end, calling this method with a literal
492 * character.
493 */
494void ICU_Utility::appendToRule(UnicodeString& rule,
495 UChar32 c,
496 UBool isLiteral,
497 UBool escapeUnprintable,
498 UnicodeString& quoteBuf) {
499 // If we are escaping unprintables, then escape them outside
500 // quotes. \u and \U are not recognized within quotes. The same
501 // logic applies to literals, but literals are never escaped.
502 if (isLiteral ||
503 (escapeUnprintable && ICU_Utility::isUnprintable(c))) {
504 if (quoteBuf.length() > 0) {
505 // We prefer backslash APOSTROPHE to double APOSTROPHE
506 // (more readable, less similar to ") so if there are
507 // double APOSTROPHEs at the ends, we pull them outside
508 // of the quote.
509
510 // If the first thing in the quoteBuf is APOSTROPHE
511 // (doubled) then pull it out.
512 while (quoteBuf.length() >= 2 &&
513 quoteBuf.charAt(0) == APOSTROPHE &&
514 quoteBuf.charAt(1) == APOSTROPHE) {
515 rule.append(BACKSLASH).append(APOSTROPHE);
516 quoteBuf.remove(0, 2);
517 }
518 // If the last thing in the quoteBuf is APOSTROPHE
519 // (doubled) then remove and count it and add it after.
520 int32_t trailingCount = 0;
521 while (quoteBuf.length() >= 2 &&
522 quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
523 quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
524 quoteBuf.truncate(quoteBuf.length()-2);
525 ++trailingCount;
526 }
527 if (quoteBuf.length() > 0) {
528 rule.append(APOSTROPHE);
529 rule.append(quoteBuf);
530 rule.append(APOSTROPHE);
531 quoteBuf.truncate(0);
532 }
533 while (trailingCount-- > 0) {
534 rule.append(BACKSLASH).append(APOSTROPHE);
535 }
536 }
537 if (c != (UChar32)-1) {
538 /* Since spaces are ignored during parsing, they are
539 * emitted only for readability. We emit one here
540 * only if there isn't already one at the end of the
541 * rule.
542 */
543 if (c == SPACE) {
544 int32_t len = rule.length();
545 if (len > 0 && rule.charAt(len-1) != c) {
546 rule.append(c);
547 }
548 } else if (!escapeUnprintable || !ICU_Utility::escapeUnprintable(rule, c)) {
549 rule.append(c);
550 }
551 }
552 }
553
554 // Escape ' and '\' and don't begin a quote just for them
555 else if (quoteBuf.length() == 0 &&
556 (c == APOSTROPHE || c == BACKSLASH)) {
557 rule.append(BACKSLASH);
558 rule.append(c);
559 }
560
561 // Specials (printable ascii that isn't [0-9a-zA-Z]) and
562 // whitespace need quoting. Also append stuff to quotes if we are
563 // building up a quoted substring already.
564 else if (quoteBuf.length() > 0 ||
565 (c >= 0x0021 && c <= 0x007E &&
566 !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
567 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
568 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
569 uprv_isRuleWhiteSpace(c)) {
570 quoteBuf.append(c);
571 // Double ' within a quote
572 if (c == APOSTROPHE) {
573 quoteBuf.append(c);
574 }
575 }
576
577 // Otherwise just append
578 else {
579 rule.append(c);
580 }
581}
582
583void ICU_Utility::appendToRule(UnicodeString& rule,
584 const UnicodeString& text,
585 UBool isLiteral,
586 UBool escapeUnprintable,
587 UnicodeString& quoteBuf) {
588 for (int32_t i=0; i<text.length(); ++i) {
589 appendToRule(rule, text[i], isLiteral, escapeUnprintable, quoteBuf);
590 }
591}
592
593/**
594 * Given a matcher reference, which may be null, append its
595 * pattern as a literal to the given rule.
596 */
597void ICU_Utility::appendToRule(UnicodeString& rule,
598 const UnicodeMatcher* matcher,
599 UBool escapeUnprintable,
600 UnicodeString& quoteBuf) {
601 if (matcher != NULL) {
602 UnicodeString pat;
603 appendToRule(rule, matcher->toPattern(pat, escapeUnprintable),
604 TRUE, escapeUnprintable, quoteBuf);
605 }
606}
607
608//eof