git.saurik.com Git - apple/icu.git/blame - icuSources/common/util

Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
73c04bcf A	3	/*
73c04bcf A	4	**********************************************************************
2ca993e8	5	* Copyright (c) 2001-2016, International Business Machines
73c04bcf A	6	* Corporation and others. All Rights Reserved.
	7	**********************************************************************
	8	* Date Name Description
	9	* 11/19/2001 aliu Creation.
	10	**********************************************************************
	11	*/
	12
73c04bcf	13	#include "unicode/uchar.h"
4388f060 A	14	#include "unicode/utf16.h"
	15	#include "patternprops.h"
	16	#include "util.h"
73c04bcf A	17
	18	U_NAMESPACE_BEGIN
	19
	20	/**
	21	* Parse an integer at pos, either of the form \d+ or of the form
	22	* 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
	23	* or octal format.
	24	* @param pos INPUT-OUTPUT parameter. On input, the first
	25	* character to parse. On output, the character after the last
	26	* parsed character.
	27	*/
	28	int32_t ICU_Utility::parseInteger(const UnicodeString& rule, int32_t& pos, int32_t limit) {
	29	int32_t count = 0;
	30	int32_t value = 0;
	31	int32_t p = pos;
	32	int8_t radix = 10;
	33
	34	if (p < limit && rule.charAt(p) == 48 /0/) {
	35	if (p+1 < limit && (rule.charAt(p+1) == 0x78 /x/ \|\| rule.charAt(p+1) == 0x58 /X/)) {
	36	p += 2;
	37	radix = 16;
	38	}
	39	else {
	40	p++;
	41	count = 1;
	42	radix = 8;
	43	}
	44	}
	45
	46	while (p < limit) {
	47	int32_t d = u_digit(rule.charAt(p++), radix);
	48	if (d < 0) {
	49	--p;
	50	break;
	51	}
	52	++count;
	53	int32_t v = (value * radix) + d;
	54	if (v <= value) {
	55	// If there are too many input digits, at some point
	56	// the value will go negative, e.g., if we have seen
	57	// "0x8000000" already and there is another '0', when
	58	// we parse the next 0 the value will go negative.
	59	return 0;
	60	}
	61	value = v;
	62	}
	63	if (count > 0) {
	64	pos = p;
	65	}
	66	return value;
	67	}
	68
	69	/**
	70	* Parse a pattern string starting at offset pos. Keywords are
	71	* matched case-insensitively. Spaces may be skipped and may be
	72	* optional or required. Integer values may be parsed, and if
	73	* they are, they will be returned in the given array. If
	74	* successful, the offset of the next non-space character is
	75	* returned. On failure, -1 is returned.
	76	* @param pattern must only contain lowercase characters, which
	77	* will match their uppercase equivalents as well. A space
	78	* character matches one or more required spaces. A '~' character
	79	* matches zero or more optional spaces. A '#' character matches
	80	* an integer and stores it in parsedInts, which the caller must
81	* ensure has enough capacity.
82	* @param parsedInts array to receive parsed integers. Caller
83	* must ensure that parsedInts.length is >= the number of '#'
84	* signs in 'pattern'.
85	* @return the position after the last character parsed, or -1 if
86	* the parse failed
87	*/
88	int32_t ICU_Utility::parsePattern(const UnicodeString& rule, int32_t pos, int32_t limit,
89	const UnicodeString& pattern, int32_t* parsedInts) {
90	// TODO Update this to handle surrogates
91	int32_t p;
92	int32_t intCount = 0; // number of integers parsed
93	for (int32_t i=0; i<pattern.length(); ++i) {
94	UChar cpat = pattern.charAt(i);
95	UChar c;
96	switch (cpat) {
97	case 32 /' '/:
98	if (pos >= limit) {
99	return -1;
100	}
101	c = rule.charAt(pos++);
4388f060	102	if (!PatternProps::isWhiteSpace(c)) {
73c04bcf A	103	return -1;
	104	}
	105	// FALL THROUGH to skipWhitespace
2ca993e8	106	U_FALLTHROUGH;
73c04bcf A	107	case 126 /'~'/:
	108	pos = skipWhitespace(rule, pos);
	109	break;
	110	case 35 /'#'/:
	111	p = pos;
	112	parsedInts[intCount++] = parseInteger(rule, p, limit);
	113	if (p == pos) {
	114	// Syntax error; failed to parse integer
	115	return -1;
	116	}
	117	pos = p;
	118	break;
	119	default:
	120	if (pos >= limit) {
	121	return -1;
	122	}
	123	c = (UChar) u_tolower(rule.charAt(pos++));
	124	if (c != cpat) {
	125	return -1;
	126	}
	127	break;
	128	}
	129	}
	130	return pos;
	131	}
	132
	133	/**
	134	* Parse a Unicode identifier from the given string at the given
	135	* position. Return the identifier, or an empty string if there
	136	* is no identifier.
	137	* @param str the string to parse
	138	* @param pos INPUT-OUPUT parameter. On INPUT, pos is the
	139	* first character to examine. It must be less than str.length(),
	140	* and it must not point to a whitespace character. That is, must
4388f060	141	* have pos < str.length(). On
73c04bcf A	142	* OUTPUT, the position after the last parsed character.
	143	* @return the Unicode identifier, or an empty string if there is
	144	* no valid identifier at pos.
	145	*/
	146	UnicodeString ICU_Utility::parseUnicodeIdentifier(const UnicodeString& str, int32_t& pos) {
	147	// assert(pos < str.length());
73c04bcf A	148	UnicodeString buf;
	149	int p = pos;
	150	while (p < str.length()) {
	151	UChar32 ch = str.char32At(p);
	152	if (buf.length() == 0) {
	153	if (u_isIDStart(ch)) {
	154	buf.append(ch);
	155	} else {
	156	buf.truncate(0);
	157	return buf;
	158	}
	159	} else {
	160	if (u_isIDPart(ch)) {
	161	buf.append(ch);
	162	} else {
	163	break;
	164	}
	165	}
4388f060	166	p += U16_LENGTH(ch);
73c04bcf A	167	}
	168	pos = p;
	169	return buf;
	170	}
	171
	172	/**
	173	* Parse an unsigned 31-bit integer at the given offset. Use
	174	* UCharacter.digit() to parse individual characters into digits.
	175	* @param text the text to be parsed
	176	* @param pos INPUT-OUTPUT parameter. On entry, pos[0] is the
	177	* offset within text at which to start parsing; it should point
	178	* to a valid digit. On exit, pos[0] is the offset after the last
	179	* parsed character. If the parse failed, it will be unchanged on
	180	* exit. Must be >= 0 on entry.
	181	* @param radix the radix in which to parse; must be >= 2 and <=
	182	* 36.
	183	* @return a non-negative parsed number, or -1 upon parse failure.
	184	* Parse fails if there are no digits, that is, if pos[0] does not
	185	* point to a valid digit on entry, or if the number to be parsed
	186	* does not fit into a 31-bit unsigned integer.
	187	*/
	188	int32_t ICU_Utility::parseNumber(const UnicodeString& text,
	189	int32_t& pos, int8_t radix) {
	190	// assert(pos[0] >= 0);
	191	// assert(radix >= 2);
	192	// assert(radix <= 36);
	193	int32_t n = 0;
	194	int32_t p = pos;
	195	while (p < text.length()) {
	196	UChar32 ch = text.char32At(p);
	197	int32_t d = u_digit(ch, radix);
	198	if (d < 0) {
	199	break;
	200	}
	201	n = radix*n + d;
	202	// ASSUME that when a 32-bit integer overflows it becomes
	203	// negative. E.g., 214748364 * 10 + 8 => negative value.
	204	if (n < 0) {
	205	return -1;
	206	}
	207	++p;
	208	}
	209	if (p == pos) {
	210	return -1;
	211	}
	212	pos = p;
	213	return n;
	214	}
	215
	216	U_NAMESPACE_END
	217