git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/common/uniset

... / ...

Commit	Line	Data
	1	/*
	2	*******************************************************************************
	3	*
	4	* Copyright (C) 1999-2006, International Business Machines
	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	* file name: uniset_props.cpp
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 2004aug25
	14	* created by: Markus W. Scherer
	15	*
	16	* Character property dependent functions moved here from uniset.cpp
	17	*/
	18
	19	#include "unicode/utypes.h"
	20	#include "unicode/uniset.h"
	21	#include "unicode/parsepos.h"
	22	#include "unicode/uchar.h"
	23	#include "unicode/uscript.h"
	24	#include "unicode/symtable.h"
	25	#include "unicode/uset.h"
	26	#include "unicode/locid.h"
	27	#include "unicode/brkiter.h"
	28	#include "uset_imp.h"
	29	#include "ruleiter.h"
	30	#include "cmemory.h"
	31	#include "ucln_cmn.h"
	32	#include "util.h"
	33	#include "uvector.h"
	34	#include "uprops.h"
	35	#include "propname.h"
	36	#include "unormimp.h"
	37	#include "ucase.h"
	38	#include "ubidi_props.h"
	39	#include "uinvchar.h"
	40	#include "charstr.h"
	41	#include "cstring.h"
	42	#include "mutex.h"
	43	#include "uassert.h"
	44	#include "hash.h"
	45
	46	#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
	47
	48	// initial storage. Must be >= 0
	49	// * same as in uniset.cpp ! *
	50	#define START_EXTRA 16
	51
	52	// Define UChar constants using hex for EBCDIC compatibility
	53	// Used #define to reduce private static exports and memory access time.
	54	#define SET_OPEN ((UChar)0x005B) /[/
	55	#define SET_CLOSE ((UChar)0x005D) /]/
	56	#define HYPHEN ((UChar)0x002D) /-/
	57	#define COMPLEMENT ((UChar)0x005E) /^/
	58	#define COLON ((UChar)0x003A) /:/
	59	#define BACKSLASH ((UChar)0x005C) /\/
	60	#define INTERSECTION ((UChar)0x0026) /&/
	61	#define UPPER_U ((UChar)0x0055) /U/
	62	#define LOWER_U ((UChar)0x0075) /u/
	63	#define OPEN_BRACE ((UChar)123) /{/
	64	#define CLOSE_BRACE ((UChar)125) /}/
	65	#define UPPER_P ((UChar)0x0050) /P/
	66	#define LOWER_P ((UChar)0x0070) /p/
	67	#define UPPER_N ((UChar)78) /N/
	68	#define EQUALS ((UChar)0x003D) /=/
	69
	70	//static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:"
	71	static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]"
	72	//static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p"
	73	static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}"
	74	//static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N"
	75	static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /-]/
	76
	77	// Special property set IDs
	78	static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF]
	79	static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
	80	static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
	81
	82	// Unicode name property alias
	83	#define NAME_PROP "na"
	84	#define NAME_PROP_LENGTH 2
	85
	86	/**
	87	* Delimiter string used in patterns to close a category reference:
	88	* ":]". Example: "[:Lu:]".
	89	*/
	90	//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
	91
	92	U_NAMESPACE_BEGIN
	93
// Cache of the sets handed out by getInclusions(): one slot per
// UPROPS_SRC_* value, every slot initially NULL.
// NOTE(review): presumably indexed by the `src` argument of
// getInclusions() -- confirm against its definition.
static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions()
	95
	96	// helper functions for matching of pattern syntax pieces ------------------ ***
	97	// these functions are parallel to the PERL_OPEN etc. strings above
	98
	99	// using these functions is not only faster than UnicodeString::compare() and
	100	// caseCompare(), but they also make UnicodeSet work for simple patterns when
	101	// no Unicode properties data is available - when caseCompare() fails
	102
	103	static inline UBool
	104	isPerlOpen(const UnicodeString &pattern, int32_t pos) {
	105	UChar c;
	106	return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P \|\| c==UPPER_P);
	107	}
	108
	109	/*static inline UBool
	110	isPerlClose(const UnicodeString &pattern, int32_t pos) {
	111	return pattern.charAt(pos)==CLOSE_BRACE;
	112	}*/
	113
	114	static inline UBool
	115	isNameOpen(const UnicodeString &pattern, int32_t pos) {
	116	return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;
	117	}
	118
	119	static inline UBool
	120	isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
	121	return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;
	122	}
	123
	124	/*static inline UBool
	125	isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
	126	return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
	127	}*/
	128
// TODO memory debugging provided inside uniset.cpp
// could be made available here but probably obsolete with use of modern
// memory leak checker tools
// In this file _dbgct expands to nothing, so every _dbgct(this) call in
// the constructors below is a no-op.
#define _dbgct(me)
	133
	134	//----------------------------------------------------------------
	135	// Constructors &c
	136	//----------------------------------------------------------------
	137
	138	/**
	139	* Constructs a set from the given pattern, optionally ignoring
	140	* white space. See the class description for the syntax of the
	141	* pattern language.
	142	* @param pattern a string specifying what characters are in the set
	143	*/
	144	UnicodeSet::UnicodeSet(const UnicodeString& pattern,
	145	UErrorCode& status) :
	146	len(0), capacity(START_EXTRA), bufferCapacity(0),
	147	list(0), buffer(0), strings(0)
	148	{
	149	if(U_SUCCESS(status)){
	150	list = (UChar32) uprv_malloc(sizeof(UChar32) capacity);
	151	/* test for NULL */
	152	if(list == NULL) {
	153	status = U_MEMORY_ALLOCATION_ERROR;
	154	}else{
	155	allocateStrings();
	156	applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
	157	}
	158	}
	159	_dbgct(this);
	160	}
	161
	162	/**
	163	* Constructs a set from the given pattern, optionally ignoring
	164	* white space. See the class description for the syntax of the
	165	* pattern language.
	166	* @param pattern a string specifying what characters are in the set
	167	* @param options bitmask for options to apply to the pattern.
	168	* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
	169	*/
	170	UnicodeSet::UnicodeSet(const UnicodeString& pattern,
	171	uint32_t options,
	172	const SymbolTable* symbols,
	173	UErrorCode& status) :
	174	len(0), capacity(START_EXTRA), bufferCapacity(0),
	175	list(0), buffer(0), strings(0)
	176	{
	177	if(U_SUCCESS(status)){
	178	list = (UChar32) uprv_malloc(sizeof(UChar32) capacity);
	179	/* test for NULL */
	180	if(list == NULL) {
	181	status = U_MEMORY_ALLOCATION_ERROR;
	182	}else{
	183	allocateStrings();
	184	applyPattern(pattern, options, symbols, status);
	185	}
	186	}
	187	_dbgct(this);
	188	}
	189
	190	UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
	191	uint32_t options,
	192	const SymbolTable* symbols,
	193	UErrorCode& status) :
	194	len(0), capacity(START_EXTRA), bufferCapacity(0),
	195	list(0), buffer(0), strings(0)
	196	{
	197	if(U_SUCCESS(status)){
	198	list = (UChar32) uprv_malloc(sizeof(UChar32) capacity);
	199	/* test for NULL */
	200	if(list == NULL) {
	201	status = U_MEMORY_ALLOCATION_ERROR;
	202	}else{
	203	allocateStrings();
	204	applyPattern(pattern, pos, options, symbols, status);
	205	}
	206	}
	207	_dbgct(this);
	208	}
	209
	210	//----------------------------------------------------------------
	211	// Public API
	212	//----------------------------------------------------------------
	213
	214	/**
	215	* Modifies this set to represent the set specified by the given
	216	* pattern, optionally ignoring white space. See the class
	217	* description for the syntax of the pattern language.
	218	* @param pattern a string specifying what characters are in the set
	219	* @param ignoreSpaces if <code>true</code>, all spaces in the
	220	* pattern are ignored. Spaces are those characters for which
	221	* <code>uprv_isRuleWhiteSpace()</code> is <code>true</code>.
	222	* Characters preceded by '\\' are escaped, losing any special
	223	* meaning they otherwise have. Spaces may be included by
	224	* escaping them.
	225	* @exception <code>IllegalArgumentException</code> if the pattern
	226	* contains a syntax error.
	227	*/
	228	UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
	229	UErrorCode& status) {
	230	return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
	231	}
	232
	233
	234	/**
	235	* Modifies this set to represent the set specified by the given
	236	* pattern, optionally ignoring white space. See the class
	237	* description for the syntax of the pattern language.
	238	* @param pattern a string specifying what characters are in the set
	239	* @param options bitmask for options to apply to the pattern.
	240	* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
	241	*/
	242	UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
	243	uint32_t options,
	244	const SymbolTable* symbols,
	245	UErrorCode& status) {
	246	if (U_FAILURE(status)) {
	247	return *this;
	248	}
	249
	250	ParsePosition pos(0);
	251	applyPattern(pattern, pos, options, symbols, status);
	252	if (U_FAILURE(status)) return *this;
	253
	254	int32_t i = pos.getIndex();
	255
	256	if (options & USET_IGNORE_SPACE) {
	257	// Skip over trailing whitespace
	258	ICU_Utility::skipWhitespace(pattern, i, TRUE);
	259	}
	260
	261	if (i != pattern.length()) {
	262	status = U_ILLEGAL_ARGUMENT_ERROR;
	263	}
	264	return *this;
	265	}
	266
	267	UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
	268	ParsePosition& pos,
	269	uint32_t options,
	270	const SymbolTable* symbols,
	271	UErrorCode& status) {
	272	if (U_FAILURE(status)) {
	273	return *this;
	274	}
	275	// Need to build the pattern in a temporary string because
	276	// _applyPattern calls add() etc., which set pat to empty.
	277	UnicodeString rebuiltPat;
	278	RuleCharacterIterator chars(pattern, symbols, pos);
	279	applyPattern(chars, symbols, rebuiltPat, options, status);
	280	if (U_FAILURE(status)) return *this;
	281	if (chars.inVariable()) {
	282	// syntaxError(chars, "Extra chars in variable value");
	283	status = U_MALFORMED_SET;
	284	return *this;
	285	}
	286	pat = rebuiltPat;
	287	return *this;
	288	}
	289
	290	/**
	291	* Return true if the given position, in the given pattern, appears
	292	* to be the start of a UnicodeSet pattern.
	293	*/
	294	UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
	295	return ((pos+1) < pattern.length() &&
	296	pattern.charAt(pos) == (UChar)91/[/) \|\|
	297	resemblesPropertyPattern(pattern, pos);
	298	}
	299
	300	//----------------------------------------------------------------
	301	// Implementation: Pattern parsing
	302	//----------------------------------------------------------------
	303
	304	/**
	305	* A small all-inline class to manage a UnicodeSet pointer. Add
	306	* operator->() etc. as needed.
	307	*/
	308	class UnicodeSetPointer {
	309	UnicodeSet* p;
	310	public:
	311	inline UnicodeSetPointer() : p(0) {}
	312	inline ~UnicodeSetPointer() { delete p; }
	313	inline UnicodeSet* pointer() { return p; }
	314	inline UBool allocate() {
	315	if (p == 0) {
	316	p = new UnicodeSet();
	317	}
	318	return p != 0;
	319	}
	320	};
	321
	322	/**
	323	* Parse the pattern from the given RuleCharacterIterator. The
	324	* iterator is advanced over the parsed pattern.
	325	* @param chars iterator over the pattern characters. Upon return
	326	* it will be advanced to the first character after the parsed
	327	* pattern, or the end of the iteration if all characters are
	328	* parsed.
	329	* @param symbols symbol table to use to parse and dereference
	330	* variables, or null if none.
	331	* @param rebuiltPat the pattern that was parsed, rebuilt or
	332	* copied from the input pattern, as appropriate.
	333	* @param options a bit mask of zero or more of the following:
	334	* IGNORE_SPACE, CASE.
	335	*/
	336	void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
	337	const SymbolTable* symbols,
	338	UnicodeString& rebuiltPat,
	339	uint32_t options,
	340	UErrorCode& ec) {
	341	if (U_FAILURE(ec)) return;
	342
	343	// Syntax characters: [ ] ^ - & { }
	344
	345	// Recognized special forms for chars, sets: c-c s-s s&s
	346
	347	int32_t opts = RuleCharacterIterator::PARSE_VARIABLES \|
	348	RuleCharacterIterator::PARSE_ESCAPES;
	349	if ((options & USET_IGNORE_SPACE) != 0) {
	350	opts \|= RuleCharacterIterator::SKIP_WHITESPACE;
	351	}
	352
	353	UnicodeString patLocal, buf;
	354	UBool usePat = FALSE;
	355	UnicodeSetPointer scratch;
	356	RuleCharacterIterator::Pos backup;
	357
	358	// mode: 0=before [, 1=between [...], 2=after ]
	359	// lastItem: 0=none, 1=char, 2=set
	360	int8_t lastItem = 0, mode = 0;
	361	UChar32 lastChar = 0;
	362	UChar op = 0;
	363
	364	UBool invert = FALSE;
	365
	366	clear();
	367
	368	while (mode != 2 && !chars.atEnd()) {
	369	U_ASSERT((lastItem == 0 && op == 0) \|\|
	370	(lastItem == 1 && (op == 0 \|\| op == HYPHEN /'-'/)) \|\|
	371	(lastItem == 2 && (op == 0 \|\| op == HYPHEN /'-'/ \|\|
	372	op == INTERSECTION /'&'/)));
	373
	374	UChar32 c = 0;
	375	UBool literal = FALSE;
	376	UnicodeSet* nested = 0; // alias - do not delete
	377
	378	// -------- Check for property pattern
	379
	380	// setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
	381	int8_t setMode = 0;
	382	if (resemblesPropertyPattern(chars, opts)) {
	383	setMode = 2;
	384	}
	385
	386	// -------- Parse '[' of opening delimiter OR nested set.
	387	// If there is a nested set, use `setMode' to define how
	388	// the set should be parsed. If the '[' is part of the
	389	// opening delimiter for this pattern, parse special
	390	// strings "[", "[^", "[-", and "[^-". Check for stand-in
	391	// characters representing a nested set in the symbol
	392	// table.
	393
	394	else {
	395	// Prepare to backup if necessary
	396	chars.getPos(backup);
	397	c = chars.next(opts, literal, ec);
	398	if (U_FAILURE(ec)) return;
	399
	400	if (c == 0x5B /'['/ && !literal) {
	401	if (mode == 1) {
	402	chars.setPos(backup); // backup
	403	setMode = 1;
	404	} else {
	405	// Handle opening '[' delimiter
	406	mode = 1;
	407	patLocal.append((UChar) 0x5B /'['/);
	408	chars.getPos(backup); // prepare to backup
	409	c = chars.next(opts, literal, ec);
	410	if (U_FAILURE(ec)) return;
	411	if (c == 0x5E /'^'/ && !literal) {
	412	invert = TRUE;
	413	patLocal.append((UChar) 0x5E /'^'/);
	414	chars.getPos(backup); // prepare to backup
	415	c = chars.next(opts, literal, ec);
	416	if (U_FAILURE(ec)) return;
	417	}
	418	// Fall through to handle special leading '-';
	419	// otherwise restart loop for nested [], \p{}, etc.
	420	if (c == HYPHEN /'-'/) {
	421	literal = TRUE;
	422	// Fall through to handle literal '-' below
	423	} else {
	424	chars.setPos(backup); // backup
	425	continue;
	426	}
	427	}
	428	} else if (symbols != 0) {
	429	const UnicodeFunctor *m = symbols->lookupMatcher(c);
	430	if (m != 0) {
	431	if (m->getDynamicClassID() != UnicodeSet::getStaticClassID()) {
	432	ec = U_MALFORMED_SET;
	433	return;
	434	}
	435	// casting away const, but `nested' won't be modified
	436	// (important not to modify stored set)
	437	nested = (UnicodeSet*) m;
	438	setMode = 3;
	439	}
	440	}
	441	}
	442
	443	// -------- Handle a nested set. This either is inline in
	444	// the pattern or represented by a stand-in that has
	445	// previously been parsed and was looked up in the symbol
	446	// table.
	447
	448	if (setMode != 0) {
	449	if (lastItem == 1) {
	450	if (op != 0) {
	451	// syntaxError(chars, "Char expected after operator");
	452	ec = U_MALFORMED_SET;
	453	return;
	454	}
	455	add(lastChar, lastChar);
	456	_appendToPat(patLocal, lastChar, FALSE);
	457	lastItem = 0;
	458	op = 0;
	459	}
	460
	461	if (op == HYPHEN /'-'/ \|\| op == INTERSECTION /'&'/) {
	462	patLocal.append(op);
	463	}
	464
	465	if (nested == 0) {
	466	// lazy allocation
	467	if (!scratch.allocate()) {
	468	ec = U_MEMORY_ALLOCATION_ERROR;
	469	return;
	470	}
	471	nested = scratch.pointer();
	472	}
	473	switch (setMode) {
	474	case 1:
	475	nested->applyPattern(chars, symbols, patLocal, options, ec);
	476	break;
	477	case 2:
	478	chars.skipIgnored(opts);
	479	nested->applyPropertyPattern(chars, patLocal, ec);
	480	if (U_FAILURE(ec)) return;
	481	break;
	482	case 3: // `nested' already parsed
	483	nested->_toPattern(patLocal, FALSE);
	484	break;
	485	}
	486
	487	usePat = TRUE;
	488
	489	if (mode == 0) {
	490	// Entire pattern is a category; leave parse loop
	491	this = nested;
	492	mode = 2;
	493	break;
	494	}
	495
	496	switch (op) {
	497	case HYPHEN: /'-'/
	498	removeAll(*nested);
	499	break;
	500	case INTERSECTION: /'&'/
	501	retainAll(*nested);
	502	break;
	503	case 0:
	504	addAll(*nested);
	505	break;
	506	}
	507
	508	op = 0;
	509	lastItem = 2;
	510
	511	continue;
	512	}
	513
	514	if (mode == 0) {
	515	// syntaxError(chars, "Missing '['");
	516	ec = U_MALFORMED_SET;
	517	return;
	518	}
	519
	520	// -------- Parse special (syntax) characters. If the
	521	// current character is not special, or if it is escaped,
	522	// then fall through and handle it below.
	523
	524	if (!literal) {
	525	switch (c) {
	526	case 0x5D /']'/:
	527	if (lastItem == 1) {
	528	add(lastChar, lastChar);
	529	_appendToPat(patLocal, lastChar, FALSE);
	530	}
	531	// Treat final trailing '-' as a literal
	532	if (op == HYPHEN /'-'/) {
	533	add(op, op);
	534	patLocal.append(op);
	535	} else if (op == INTERSECTION /'&'/) {
	536	// syntaxError(chars, "Trailing '&'");
	537	ec = U_MALFORMED_SET;
	538	return;
	539	}
	540	patLocal.append((UChar) 0x5D /']'/);
	541	mode = 2;
	542	continue;
	543	case HYPHEN /'-'/:
	544	if (op == 0) {
	545	if (lastItem != 0) {
	546	op = (UChar) c;
	547	continue;
	548	} else {
	549	// Treat final trailing '-' as a literal
	550	add(c, c);
	551	c = chars.next(opts, literal, ec);
	552	if (U_FAILURE(ec)) return;
	553	if (c == 0x5D /']'/ && !literal) {
	554	patLocal.append(HYPHEN_RIGHT_BRACE);
	555	mode = 2;
	556	continue;
	557	}
	558	}
	559	}
	560	// syntaxError(chars, "'-' not after char or set");
	561	ec = U_MALFORMED_SET;
	562	return;
	563	case INTERSECTION /'&'/:
	564	if (lastItem == 2 && op == 0) {
	565	op = (UChar) c;
	566	continue;
	567	}
	568	// syntaxError(chars, "'&' not after set");
	569	ec = U_MALFORMED_SET;
	570	return;
	571	case 0x5E /'^'/:
	572	// syntaxError(chars, "'^' not after '['");
	573	ec = U_MALFORMED_SET;
	574	return;
	575	case 0x7B /'{'/:
	576	if (op != 0) {
	577	// syntaxError(chars, "Missing operand after operator");
	578	ec = U_MALFORMED_SET;
	579	return;
	580	}
	581	if (lastItem == 1) {
	582	add(lastChar, lastChar);
	583	_appendToPat(patLocal, lastChar, FALSE);
	584	}
	585	lastItem = 0;
	586	buf.truncate(0);
	587	{
	588	UBool ok = FALSE;
	589	while (!chars.atEnd()) {
	590	c = chars.next(opts, literal, ec);
	591	if (U_FAILURE(ec)) return;
	592	if (c == 0x7D /'}'/ && !literal) {
	593	ok = TRUE;
	594	break;
	595	}
	596	buf.append(c);
	597	}
	598	if (buf.length() < 1 \|\| !ok) {
	599	// syntaxError(chars, "Invalid multicharacter string");
	600	ec = U_MALFORMED_SET;
	601	return;
	602	}
	603	}
	604	// We have new string. Add it to set and continue;
	605	// we don't need to drop through to the further
	606	// processing
	607	add(buf);
	608	patLocal.append((UChar) 0x7B /'{'/);
	609	_appendToPat(patLocal, buf, FALSE);
	610	patLocal.append((UChar) 0x7D /'}'/);
	611	continue;
	612	case SymbolTable::SYMBOL_REF:
	613	// symbols nosymbols
	614	// [a-$] error error (ambiguous)
	615	// [a$] anchor anchor
	616	// [a-$x] var "x"* literal '$'
	617	// [a-$.] error literal '$'
	618	// *We won't get here in the case of var "x"
	619	{
	620	chars.getPos(backup);
	621	c = chars.next(opts, literal, ec);
	622	if (U_FAILURE(ec)) return;
	623	UBool anchor = (c == 0x5D /']'/ && !literal);
	624	if (symbols == 0 && !anchor) {
	625	c = SymbolTable::SYMBOL_REF;
	626	chars.setPos(backup);
	627	break; // literal '$'
	628	}
	629	if (anchor && op == 0) {
	630	if (lastItem == 1) {
	631	add(lastChar, lastChar);
	632	_appendToPat(patLocal, lastChar, FALSE);
	633	}
	634	add(U_ETHER);
	635	usePat = TRUE;
	636	patLocal.append((UChar) SymbolTable::SYMBOL_REF);
	637	patLocal.append((UChar) 0x5D /']'/);
	638	mode = 2;
	639	continue;
	640	}
	641	// syntaxError(chars, "Unquoted '$'");
	642	ec = U_MALFORMED_SET;
	643	return;
	644	}
	645	default:
	646	break;
	647	}
	648	}
	649
	650	// -------- Parse literal characters. This includes both
	651	// escaped chars ("\u4E01") and non-syntax characters
	652	// ("a").
	653
	654	switch (lastItem) {
	655	case 0:
	656	lastItem = 1;
	657	lastChar = c;
	658	break;
	659	case 1:
	660	if (op == HYPHEN /'-'/) {
	661	if (lastChar >= c) {
	662	// Don't allow redundant (a-a) or empty (b-a) ranges;
	663	// these are most likely typos.
	664	// syntaxError(chars, "Invalid range");
	665	ec = U_MALFORMED_SET;
	666	return;
	667	}
	668	add(lastChar, c);
	669	_appendToPat(patLocal, lastChar, FALSE);
	670	patLocal.append(op);
	671	_appendToPat(patLocal, c, FALSE);
	672	lastItem = 0;
	673	op = 0;
	674	} else {
	675	add(lastChar, lastChar);
	676	_appendToPat(patLocal, lastChar, FALSE);
	677	lastChar = c;
	678	}
	679	break;
	680	case 2:
	681	if (op != 0) {
	682	// syntaxError(chars, "Set expected after operator");
	683	ec = U_MALFORMED_SET;
	684	return;
	685	}
	686	lastChar = c;
	687	lastItem = 1;
	688	break;
	689	}
	690	}
	691
	692	if (mode != 2) {
	693	// syntaxError(chars, "Missing ']'");
	694	ec = U_MALFORMED_SET;
	695	return;
	696	}
	697
	698	chars.skipIgnored(opts);
	699
	700	/**
	701	* Handle global flags (invert, case insensitivity). If this
	702	* pattern should be compiled case-insensitive, then we need
	703	* to close over case BEFORE COMPLEMENTING. This makes
	704	* patterns like /[^abc]/i work.
	705	*/
	706	if ((options & USET_CASE_INSENSITIVE) != 0) {
	707	closeOver(USET_CASE_INSENSITIVE);
	708	}
	709	else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
	710	closeOver(USET_ADD_CASE_MAPPINGS);
	711	}
	712	if (invert) {
	713	complement();
	714	}
	715
	716	// Use the rebuilt pattern (patLocal) only if necessary. Prefer the
	717	// generated pattern.
	718	if (usePat) {
	719	rebuiltPat.append(patLocal);
	720	} else {
	721	_generatePattern(rebuiltPat, FALSE);
	722	}
	723	}
	724
	725	//----------------------------------------------------------------
	726	// Property set implementation
	727	//----------------------------------------------------------------
	728
	729	static UBool numericValueFilter(UChar32 ch, void* context) {
	730	return u_getNumericValue(ch) == (double)context;
	731	}
	732
	733	static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
	734	int32_t value = (int32_t)context;
	735	return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
	736	}
	737
	738	static UBool versionFilter(UChar32 ch, void* context) {
	739	UVersionInfo v, none = { 0, 0, 0, 0};
	740	UVersionInfo* version = (UVersionInfo*)context;
	741	u_charAge(ch, v);
	742	return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
	743	}
	744
	745	typedef struct {
	746	UProperty prop;
	747	int32_t value;
	748	} IntPropertyContext;
	749
	750	static UBool intPropertyFilter(UChar32 ch, void* context) {
	751	IntPropertyContext* c = (IntPropertyContext*)context;
	752	return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
	753	}
	754
	755
	756	/**
	757	* Generic filter-based scanning code for UCD property UnicodeSets.
	758	*/
	759	void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
	760	void* context,
	761	int32_t src,
	762	UErrorCode &status) {
	763	// Walk through all Unicode characters, noting the start
	764	// and end of each range for which filter.contain(c) is
	765	// true. Add each range to a set.
	766	//
	767	// To improve performance, use the INCLUSIONS set, which
	768	// encodes information about character ranges that are known
	769	// to have identical properties. INCLUSIONS contains
	770	// only the first characters of such ranges.
	771	//
	772	// TODO Where possible, instead of scanning over code points,
	773	// use internal property data to initialize UnicodeSets for
	774	// those properties. Scanning code points is slow.
	775	if (U_FAILURE(status)) return;
	776
	777	const UnicodeSet* inclusions = getInclusions(src, status);
	778	if (U_FAILURE(status)) {
	779	return;
	780	}
	781
	782	clear();
	783
	784	UChar32 startHasProperty = -1;
	785	int limitRange = inclusions->getRangeCount();
	786
	787	for (int j=0; j<limitRange; ++j) {
	788	// get current range
	789	UChar32 start = inclusions->getRangeStart(j);
	790	UChar32 end = inclusions->getRangeEnd(j);
	791
	792	// for all the code points in the range, process
	793	for (UChar32 ch = start; ch <= end; ++ch) {
	794	// only add to this UnicodeSet on inflection points --
	795	// where the hasProperty value changes to false
	796	if ((*filter)(ch, context)) {
	797	if (startHasProperty < 0) {
	798	startHasProperty = ch;
	799	}
	800	} else if (startHasProperty >= 0) {
	801	add(startHasProperty, ch-1);
	802	startHasProperty = -1;
	803	}
	804	}
	805	}
	806	if (startHasProperty >= 0) {
	807	add((UChar32)startHasProperty, (UChar32)0x10FFFF);
	808	}
	809	}
	810
	811	static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
	812	/* Note: we use ' ' in compiler code page */
	813	int32_t j = 0;
	814	char ch;
	815	--dstCapacity; /* make room for term. zero */
	816	while ((ch = *src++) != 0) {
	817	if (ch == ' ' && (j==0 \|\| (j>0 && dst[j-1]==' '))) {
	818	continue;
	819	}
	820	if (j >= dstCapacity) return FALSE;
	821	dst[j++] = ch;
	822	}
	823	if (j > 0 && dst[j-1] == ' ') --j;
	824	dst[j] = 0;
	825	return TRUE;
	826	}
	827
	828	//----------------------------------------------------------------
	829	// Property set API
	830	//----------------------------------------------------------------
	831
	832	#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;}
	833
	834	UnicodeSet&
	835	UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
	836	if (U_FAILURE(ec)) return *this;
	837
	838	if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
	839	applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
	840	} else {
	841	IntPropertyContext c = {prop, value};
	842	applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
	843	}
	844	return *this;
	845	}
	846
	847	UnicodeSet&
	848	UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
	849	const UnicodeString& value,
	850	UErrorCode& ec) {
	851	if (U_FAILURE(ec)) return *this;
	852
	853	// prop and value used to be converted to char * using the default
	854	// converter instead of the invariant conversion.
	855	// This should not be necessary because all Unicode property and value
	856	// names use only invariant characters.
	857	// If there are any variant characters, then we won't find them anyway.
	858	// Checking first avoids assertion failures in the conversion.
	859	if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) \|\|
	860	!uprv_isInvariantUString(value.getBuffer(), value.length())
	861	) {
	862	FAIL(ec);
	863	}
	864	CharString pname(prop);
	865	CharString vname(value);
	866
	867	UProperty p;
	868	int32_t v;
	869	UBool mustNotBeEmpty = FALSE, invert = FALSE;
	870
	871	if (value.length() > 0) {
	872	p = u_getPropertyEnum(pname);
	873	if (p == UCHAR_INVALID_CODE) FAIL(ec);
	874
	875	// Treat gc as gcm
	876	if (p == UCHAR_GENERAL_CATEGORY) {
	877	p = UCHAR_GENERAL_CATEGORY_MASK;
	878	}
	879
	880	if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) \|\|
	881	(p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) \|\|
	882	(p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
	883	v = u_getPropertyValueEnum(p, vname);
	884	if (v == UCHAR_INVALID_CODE) {
	885	// Handle numeric CCC
	886	if (p == UCHAR_CANONICAL_COMBINING_CLASS \|\|
	887	p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS \|\|
	888	p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
	889	char* end;
	890	double value = uprv_strtod(vname, &end);
	891	v = (int32_t) value;
	892	if (v != value \|\| v < 0 \|\| *end != 0) {
	893	// non-integral or negative value, or trailing junk
	894	FAIL(ec);
	895	}
	896	// If the resultant set is empty then the numeric value
	897	// was invalid.
	898	mustNotBeEmpty = TRUE;
	899	} else {
	900	FAIL(ec);
	901	}
	902	}
	903	}
	904
	905	else {
	906
	907	switch (p) {
	908	case UCHAR_NUMERIC_VALUE:
	909	{
	910	char* end;
	911	double value = uprv_strtod(vname, &end);
	912	if (*end != 0) {
	913	FAIL(ec);
	914	}
	915	applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec);
	916	return *this;
	917	}
	918	break;
	919	case UCHAR_NAME:
	920	case UCHAR_UNICODE_1_NAME:
	921	{
	922	// Must munge name, since u_charFromName() does not do
	923	// 'loose' matching.
	924	char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
	925	if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec);
	926	UCharNameChoice choice = (p == UCHAR_NAME) ?
	927	U_EXTENDED_CHAR_NAME : U_UNICODE_10_CHAR_NAME;
	928	UChar32 ch = u_charFromName(choice, buf, &ec);
	929	if (U_SUCCESS(ec)) {
	930	clear();
	931	add(ch);
	932	return *this;
	933	} else {
	934	FAIL(ec);
	935	}
	936	}
	937	break;
	938	case UCHAR_AGE:
	939	{
	940	// Must munge name, since u_versionFromString() does not do
	941	// 'loose' matching.
	942	char buf[128];
	943	if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec);
	944	UVersionInfo version;
	945	u_versionFromString(version, buf);
	946	applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);
	947	return *this;
	948	}
	949	break;
	950	default:
	951	// p is a non-binary, non-enumerated property that we
	952	// don't support (yet).
	953	FAIL(ec);
	954	}
	955	}
	956	}
	957
	958	else {
	959	// value is empty. Interpret as General Category, Script, or
	960	// Binary property.
	961	p = UCHAR_GENERAL_CATEGORY_MASK;
	962	v = u_getPropertyValueEnum(p, pname);
	963	if (v == UCHAR_INVALID_CODE) {
	964	p = UCHAR_SCRIPT;
	965	v = u_getPropertyValueEnum(p, pname);
	966	if (v == UCHAR_INVALID_CODE) {
	967	p = u_getPropertyEnum(pname);
	968	if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
	969	v = 1;
	970	} else if (0 == uprv_comparePropertyNames(ANY, pname)) {
	971	set(MIN_VALUE, MAX_VALUE);
	972	return *this;
	973	} else if (0 == uprv_comparePropertyNames(ASCII, pname)) {
	974	set(0, 0x7F);
	975	return *this;
	976	} else if (0 == uprv_comparePropertyNames(ASSIGNED, pname)) {
	977	// [:Assigned:]=[:^Cn:]
	978	p = UCHAR_GENERAL_CATEGORY_MASK;
	979	v = U_GC_CN_MASK;
	980	invert = TRUE;
	981	} else {
	982	FAIL(ec);
	983	}
	984	}
	985	}
	986	}
	987
	988	applyIntPropertyValue(p, v, ec);
	989	if(invert) {
	990	complement();
	991	}
	992
	993	if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
	994	// mustNotBeEmpty is set to true if an empty set indicates
	995	// invalid input.
	996	ec = U_ILLEGAL_ARGUMENT_ERROR;
	997	}
	998
	999	return *this;
	1000	}
	1001
	1002	//----------------------------------------------------------------
	1003	// Property set patterns
	1004	//----------------------------------------------------------------
	1005
	1006	/**
	1007	* Return true if the given position, in the given pattern, appears
	1008	* to be the start of a property set pattern.
	1009	*/
	1010	UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
	1011	int32_t pos) {
	1012	// Patterns are at least 5 characters long
	1013	if ((pos+5) > pattern.length()) {
	1014	return FALSE;
	1015	}
	1016
	1017	// Look for an opening [:, [:^, \p, or \P
	1018	return isPOSIXOpen(pattern, pos) \|\| isPerlOpen(pattern, pos) \|\| isNameOpen(pattern, pos);
	1019	}
	1020
	1021	/**
	1022	* Return true if the given iterator appears to point at a
	1023	* property pattern. Regardless of the result, return with the
	1024	* iterator unchanged.
	1025	* @param chars iterator over the pattern characters. Upon return
	1026	* it will be unchanged.
	1027	* @param iterOpts RuleCharacterIterator options
	1028	*/
	1029	UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
	1030	int32_t iterOpts) {
	1031	// NOTE: literal will always be FALSE, because we don't parse escapes.
	1032	UBool result = FALSE, literal;
	1033	UErrorCode ec = U_ZERO_ERROR;
	1034	iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
	1035	RuleCharacterIterator::Pos pos;
	1036	chars.getPos(pos);
	1037	UChar32 c = chars.next(iterOpts, literal, ec);
	1038	if (c == 0x5B /'['/ \|\| c == 0x5C /'\\'/) {
	1039	UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
	1040	literal, ec);
	1041	result = (c == 0x5B /'['/) ? (d == 0x3A /':'/) :
	1042	(d == 0x4E /'N'/ \|\| d == 0x70 /'p'/ \|\| d == 0x50 /'P'/);
	1043	}
	1044	chars.setPos(pos);
	1045	return result && U_SUCCESS(ec);
	1046	}
	1047
	1048	/**
	1049	* Parse the given property pattern at the given parse position.
	1050	*/
	1051	UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
	1052	ParsePosition& ppos,
	1053	UErrorCode &ec) {
	1054	int32_t pos = ppos.getIndex();
	1055
	1056	UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
	1057	UBool isName = FALSE; // true for \N{pat}, o/w false
	1058	UBool invert = FALSE;
	1059
	1060	if (U_FAILURE(ec)) return *this;
	1061
	1062	// Minimum length is 5 characters, e.g. \p{L}
	1063	if ((pos+5) > pattern.length()) {
	1064	FAIL(ec);
	1065	}
	1066
	1067	// On entry, ppos should point to one of the following locations:
	1068	// Look for an opening [:, [:^, \p, or \P
	1069	if (isPOSIXOpen(pattern, pos)) {
	1070	posix = TRUE;
	1071	pos += 2;
	1072	pos = ICU_Utility::skipWhitespace(pattern, pos);
	1073	if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
	1074	++pos;
	1075	invert = TRUE;
	1076	}
	1077	} else if (isPerlOpen(pattern, pos) \|\| isNameOpen(pattern, pos)) {
	1078	UChar c = pattern.charAt(pos+1);
	1079	invert = (c == UPPER_P);
	1080	isName = (c == UPPER_N);
	1081	pos += 2;
	1082	pos = ICU_Utility::skipWhitespace(pattern, pos);
	1083	if (pos == pattern.length() \|\| pattern.charAt(pos++) != OPEN_BRACE) {
	1084	// Syntax error; "\p" or "\P" not followed by "{"
	1085	FAIL(ec);
	1086	}
	1087	} else {
	1088	// Open delimiter not seen
	1089	FAIL(ec);
	1090	}
	1091
	1092	// Look for the matching close delimiter, either :] or }
	1093	int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos);
	1094	if (close < 0) {
	1095	// Syntax error; close delimiter missing
	1096	FAIL(ec);
	1097	}
	1098
	1099	// Look for an '=' sign. If this is present, we will parse a
	1100	// medium \p{gc=Cf} or long \p{GeneralCategory=Format}
	1101	// pattern.
	1102	int32_t equals = pattern.indexOf(EQUALS, pos);
	1103	UnicodeString propName, valueName;
	1104	if (equals >= 0 && equals < close && !isName) {
	1105	// Equals seen; parse medium/long pattern
	1106	pattern.extractBetween(pos, equals, propName);
	1107	pattern.extractBetween(equals+1, close, valueName);
	1108	}
	1109
	1110	else {
	1111	// Handle case where no '=' is seen, and \N{}
	1112	pattern.extractBetween(pos, close, propName);
	1113
	1114	// Handle \N{name}
	1115	if (isName) {
	1116	// This is a little inefficient since it means we have to
	1117	// parse NAME_PROP back to UCHAR_NAME even though we already
	1118	// know it's UCHAR_NAME. If we refactor the API to
	1119	// support args of (UProperty, char*) then we can remove
	1120	// NAME_PROP and make this a little more efficient.
	1121	valueName = propName;
	1122	propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
	1123	}
	1124	}
	1125
	1126	applyPropertyAlias(propName, valueName, ec);
	1127
	1128	if (U_SUCCESS(ec)) {
	1129	if (invert) {
	1130	complement();
	1131	}
	1132
	1133	// Move to the limit position after the close delimiter if the
	1134	// parse succeeded.
	1135	ppos.setIndex(close + (posix ? 2 : 1));
	1136	}
	1137
	1138	return *this;
	1139	}
	1140
	1141	/**
	1142	* Parse a property pattern.
	1143	* @param chars iterator over the pattern characters. Upon return
	1144	* it will be advanced to the first character after the parsed
	1145	* pattern, or the end of the iteration if all characters are
	1146	* parsed.
	1147	* @param rebuiltPat the pattern that was parsed, rebuilt or
	1148	* copied from the input pattern, as appropriate.
	1149	*/
	1150	void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
	1151	UnicodeString& rebuiltPat,
	1152	UErrorCode& ec) {
	1153	if (U_FAILURE(ec)) return;
	1154	UnicodeString pattern;
	1155	chars.lookahead(pattern);
	1156	ParsePosition pos(0);
	1157	applyPropertyPattern(pattern, pos, ec);
	1158	if (U_FAILURE(ec)) return;
	1159	if (pos.getIndex() == 0) {
	1160	// syntaxError(chars, "Invalid property pattern");
	1161	ec = U_MALFORMED_SET;
	1162	return;
	1163	}
	1164	chars.jumpahead(pos.getIndex());
	1165	rebuiltPat.append(pattern, 0, pos.getIndex());
	1166	}
	1167
	1168	//----------------------------------------------------------------
	1169	// Inclusions list
	1170	//----------------------------------------------------------------
	1171
	1172	U_CDECL_BEGIN
	1173
	1174	// USetAdder implementation
	1175	// Does not use uset.h to reduce code dependencies
	1176	static void U_CALLCONV
	1177	_set_add(USet *set, UChar32 c) {
	1178	((UnicodeSet *)set)->add(c);
	1179	}
	1180
	1181	static void U_CALLCONV
	1182	_set_addRange(USet *set, UChar32 start, UChar32 end) {
	1183	((UnicodeSet *)set)->add(start, end);
	1184	}
	1185
	1186	static void U_CALLCONV
	1187	_set_addString(USet set, const UChar str, int32_t length) {
	1188	((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
	1189	}
	1190
	1191	/**
	1192	* Cleanup function for UnicodeSet
	1193	*/
	1194	static UBool U_CALLCONV uset_cleanup(void) {
	1195	int32_t i;
	1196
	1197	for(i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) {
	1198	if (INCLUSIONS[i] != NULL) {
	1199	delete INCLUSIONS[i];
	1200	INCLUSIONS[i] = NULL;
	1201	}
	1202	}
	1203
	1204	return TRUE;
	1205	}
	1206
	1207	U_CDECL_END
	1208
	1209	const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
	1210	umtx_lock(NULL);
	1211	UBool f = (INCLUSIONS[src] == NULL);
	1212	umtx_unlock(NULL);
	1213	if (f) {
	1214	UnicodeSet* incl = new UnicodeSet();
	1215	USetAdder sa = {
	1216	(USet *)incl,
	1217	_set_add,
	1218	_set_addRange,
	1219	_set_addString,
	1220	NULL // don't need remove()
	1221	};
	1222
	1223	if (incl != NULL) {
	1224	switch(src) {
	1225	case UPROPS_SRC_CHAR:
	1226	uchar_addPropertyStarts(&sa, &status);
	1227	break;
	1228	case UPROPS_SRC_PROPSVEC:
	1229	upropsvec_addPropertyStarts(&sa, &status);
	1230	break;
	1231	case UPROPS_SRC_CHAR_AND_PROPSVEC:
	1232	uchar_addPropertyStarts(&sa, &status);
	1233	upropsvec_addPropertyStarts(&sa, &status);
	1234	break;
	1235	case UPROPS_SRC_HST:
	1236	uhst_addPropertyStarts(&sa, &status);
	1237	break;
	1238	#if !UCONFIG_NO_NORMALIZATION
	1239	case UPROPS_SRC_NORM:
	1240	unorm_addPropertyStarts(&sa, &status);
	1241	break;
	1242	#endif
	1243	case UPROPS_SRC_CASE:
	1244	ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status);
	1245	break;
	1246	case UPROPS_SRC_BIDI:
	1247	ubidi_addPropertyStarts(ubidi_getSingleton(&status), &sa, &status);
	1248	break;
	1249	default:
	1250	status = U_INTERNAL_PROGRAM_ERROR;
	1251	break;
	1252	}
	1253	if (U_SUCCESS(status)) {
	1254	umtx_lock(NULL);
	1255	if (INCLUSIONS[src] == NULL) {
	1256	INCLUSIONS[src] = incl;
	1257	incl = NULL;
	1258	ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
	1259	}
	1260	umtx_unlock(NULL);
	1261	}
	1262	delete incl;
	1263	} else {
	1264	status = U_MEMORY_ALLOCATION_ERROR;
	1265	}
	1266	}
	1267	return INCLUSIONS[src];
	1268	}
	1269
	1270	//----------------------------------------------------------------
	1271	// Case folding API
	1272	//----------------------------------------------------------------
	1273
	1274	// add the result of a full case mapping to the set
	1275	// use str as a temporary string to avoid constructing one
	1276	static inline void
	1277	addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) {
	1278	if(result >= 0) {
	1279	if(result > UCASE_MAX_STRING_LENGTH) {
	1280	// add a single-code point case mapping
	1281	set.add(result);
	1282	} else {
	1283	// add a string case mapping from full with length result
	1284	str.setTo((UBool)FALSE, full, result);
	1285	set.add(str);
	1286	}
	1287	}
	1288	// result < 0: the code point mapped to itself, no need to add it
	1289	// see ucase.h
	1290	}
	1291
	1292	UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
	1293	if (attribute & (USET_CASE_INSENSITIVE \| USET_ADD_CASE_MAPPINGS)) {
	1294	UErrorCode status = U_ZERO_ERROR;
	1295	const UCaseProps *csp = ucase_getSingleton(&status);
	1296	if (U_SUCCESS(status)) {
	1297	UnicodeSet foldSet(*this);
	1298	UnicodeString str;
	1299	USetAdder sa = {
	1300	(USet *)&foldSet,
	1301	_set_add,
	1302	_set_addRange,
	1303	_set_addString,
	1304	NULL // don't need remove()
	1305	};
	1306
	1307	// start with input set to guarantee inclusion
	1308	// USET_CASE: remove strings because the strings will actually be reduced (folded);
	1309	// therefore, start with no strings and add only those needed
	1310	if (attribute & USET_CASE_INSENSITIVE) {
	1311	foldSet.strings->removeAllElements();
	1312	}
	1313
	1314	int32_t n = getRangeCount();
	1315	UChar32 result;
	1316	const UChar *full;
	1317	int32_t locCache = 0;
	1318
	1319	for (int32_t i=0; i<n; ++i) {
	1320	UChar32 start = getRangeStart(i);
	1321	UChar32 end = getRangeEnd(i);
	1322
	1323	if (attribute & USET_CASE_INSENSITIVE) {
	1324	// full case closure
	1325	for (UChar32 cp=start; cp<=end; ++cp) {
	1326	ucase_addCaseClosure(csp, cp, &sa);
	1327	}
	1328	} else {
	1329	// add case mappings
	1330	// (does not add long s for regular s, or Kelvin for k, for example)
	1331	for (UChar32 cp=start; cp<=end; ++cp) {
	1332	result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache);
	1333	addCaseMapping(foldSet, result, full, str);
	1334
	1335	result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache);
	1336	addCaseMapping(foldSet, result, full, str);
	1337
	1338	result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache);
	1339	addCaseMapping(foldSet, result, full, str);
	1340
	1341	result = ucase_toFullFolding(csp, cp, &full, 0);
	1342	addCaseMapping(foldSet, result, full, str);
	1343	}
	1344	}
	1345	}
	1346	if (strings != NULL && strings->size() > 0) {
	1347	if (attribute & USET_CASE_INSENSITIVE) {
	1348	for (int32_t j=0; j<strings->size(); ++j) {
	1349	str = (const UnicodeString ) strings->elementAt(j);
	1350	str.foldCase();
	1351	if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) {
	1352	foldSet.add(str); // does not map to code points: add the folded string itself
	1353	}
	1354	}
	1355	} else {
	1356	Locale root("");
	1357	#if !UCONFIG_NO_BREAK_ITERATION
	1358	BreakIterator *bi = BreakIterator::createWordInstance(root, status);
	1359	#endif
	1360	if (U_SUCCESS(status)) {
	1361	const UnicodeString *pStr;
	1362
	1363	for (int32_t j=0; j<strings->size(); ++j) {
	1364	pStr = (const UnicodeString *) strings->elementAt(j);
	1365	(str = *pStr).toLower(root);
	1366	foldSet.add(str);
	1367	#if !UCONFIG_NO_BREAK_ITERATION
	1368	(str = *pStr).toTitle(bi, root);
	1369	foldSet.add(str);
	1370	#endif
	1371	(str = *pStr).toUpper(root);
	1372	foldSet.add(str);
	1373	(str = *pStr).foldCase();
	1374	foldSet.add(str);
	1375	}
	1376	}
	1377	#if !UCONFIG_NO_BREAK_ITERATION
	1378	delete bi;
	1379	#endif
	1380	}
	1381	}
	1382	*this = foldSet;
	1383	}
	1384	}
	1385	return *this;
	1386	}
	1387
	1388	U_NAMESPACE_END