]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/uniset_props.cpp
ICU-400.37.tar.gz
[apple/icu.git] / icuSources / common / uniset_props.cpp
CommitLineData
374ca955
A
1/*
2*******************************************************************************
3*
46f4442e 4* Copyright (C) 1999-2008, International Business Machines
374ca955
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: uniset_props.cpp
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2004aug25
14* created by: Markus W. Scherer
15*
16* Character property dependent functions moved here from uniset.cpp
17*/
18
19#include "unicode/utypes.h"
20#include "unicode/uniset.h"
21#include "unicode/parsepos.h"
22#include "unicode/uchar.h"
23#include "unicode/uscript.h"
24#include "unicode/symtable.h"
25#include "unicode/uset.h"
26#include "unicode/locid.h"
27#include "unicode/brkiter.h"
28#include "uset_imp.h"
29#include "ruleiter.h"
30#include "cmemory.h"
374ca955
A
31#include "ucln_cmn.h"
32#include "util.h"
33#include "uvector.h"
34#include "uprops.h"
35#include "propname.h"
36#include "unormimp.h"
37#include "ucase.h"
73c04bcf 38#include "ubidi_props.h"
374ca955
A
39#include "uinvchar.h"
40#include "charstr.h"
41#include "cstring.h"
46f4442e 42#include "umutex.h"
374ca955
A
43#include "uassert.h"
44#include "hash.h"
73c04bcf 45
46f4442e
A
46U_NAMESPACE_USE
47
73c04bcf 48#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
374ca955
A
49
50// initial storage. Must be >= 0
51// *** same as in uniset.cpp ! ***
52#define START_EXTRA 16
53
54// Define UChar constants using hex for EBCDIC compatibility
55// Used #define to reduce private static exports and memory access time.
56#define SET_OPEN ((UChar)0x005B) /*[*/
57#define SET_CLOSE ((UChar)0x005D) /*]*/
58#define HYPHEN ((UChar)0x002D) /*-*/
59#define COMPLEMENT ((UChar)0x005E) /*^*/
60#define COLON ((UChar)0x003A) /*:*/
61#define BACKSLASH ((UChar)0x005C) /*\*/
62#define INTERSECTION ((UChar)0x0026) /*&*/
63#define UPPER_U ((UChar)0x0055) /*U*/
64#define LOWER_U ((UChar)0x0075) /*u*/
65#define OPEN_BRACE ((UChar)123) /*{*/
66#define CLOSE_BRACE ((UChar)125) /*}*/
67#define UPPER_P ((UChar)0x0050) /*P*/
68#define LOWER_P ((UChar)0x0070) /*p*/
69#define UPPER_N ((UChar)78) /*N*/
70#define EQUALS ((UChar)0x003D) /*=*/
71
73c04bcf 72//static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:"
374ca955 73static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]"
73c04bcf 74//static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p"
374ca955 75static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}"
73c04bcf 76//static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N"
374ca955
A
77static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
78
79// Special property set IDs
80static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF]
81static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
73c04bcf 82static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
374ca955
A
83
84// Unicode name property alias
85#define NAME_PROP "na"
86#define NAME_PROP_LENGTH 2
87
374ca955
A
88/**
89 * Delimiter string used in patterns to close a category reference:
90 * ":]". Example: "[:Lu:]".
91 */
73c04bcf 92//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
374ca955 93
46f4442e 94U_CDECL_BEGIN
374ca955
A
95
96static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions()
97
46f4442e
A
98//----------------------------------------------------------------
99// Inclusions list
100//----------------------------------------------------------------
101
102// USetAdder implementation
103// Does not use uset.h to reduce code dependencies
104static void U_CALLCONV
105_set_add(USet *set, UChar32 c) {
106 ((UnicodeSet *)set)->add(c);
107}
108
109static void U_CALLCONV
110_set_addRange(USet *set, UChar32 start, UChar32 end) {
111 ((UnicodeSet *)set)->add(start, end);
112}
113
114static void U_CALLCONV
115_set_addString(USet *set, const UChar *str, int32_t length) {
116 ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
117}
118
119/**
120 * Cleanup function for UnicodeSet
121 */
122static UBool U_CALLCONV uset_cleanup(void) {
123 int32_t i;
124
125 for(i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) {
126 if (INCLUSIONS[i] != NULL) {
127 delete INCLUSIONS[i];
128 INCLUSIONS[i] = NULL;
129 }
130 }
131
132 return TRUE;
133}
134
135U_CDECL_END
136
137U_NAMESPACE_BEGIN
138
139/*
140Reduce excessive reallocation, and make it easier to detect initialization
141problems.
142Usually you don't see smaller sets than this for Unicode 5.0.
143*/
144#define DEFAULT_INCLUSION_CAPACITY 3072
145
146const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
147 UBool needInit;
148 UMTX_CHECK(NULL, (INCLUSIONS[src] == NULL), needInit);
149 if (needInit) {
150 UnicodeSet* incl = new UnicodeSet();
151 USetAdder sa = {
152 (USet *)incl,
153 _set_add,
154 _set_addRange,
155 _set_addString,
156 NULL, // don't need remove()
157 NULL // don't need removeRange()
158 };
159 incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status);
160 if (incl != NULL) {
161 switch(src) {
162 case UPROPS_SRC_CHAR:
163 uchar_addPropertyStarts(&sa, &status);
164 break;
165 case UPROPS_SRC_PROPSVEC:
166 upropsvec_addPropertyStarts(&sa, &status);
167 break;
168 case UPROPS_SRC_CHAR_AND_PROPSVEC:
169 uchar_addPropertyStarts(&sa, &status);
170 upropsvec_addPropertyStarts(&sa, &status);
171 break;
172 case UPROPS_SRC_HST:
173 uhst_addPropertyStarts(&sa, &status);
174 break;
175#if !UCONFIG_NO_NORMALIZATION
176 case UPROPS_SRC_NORM:
177 unorm_addPropertyStarts(&sa, &status);
178 break;
179#endif
180 case UPROPS_SRC_CASE:
181 ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status);
182 break;
183 case UPROPS_SRC_BIDI:
184 ubidi_addPropertyStarts(ubidi_getSingleton(&status), &sa, &status);
185 break;
186 default:
187 status = U_INTERNAL_PROGRAM_ERROR;
188 break;
189 }
190 if (U_SUCCESS(status)) {
191 // Compact for caching
192 incl->compact();
193 umtx_lock(NULL);
194 if (INCLUSIONS[src] == NULL) {
195 INCLUSIONS[src] = incl;
196 incl = NULL;
197 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
198 }
199 umtx_unlock(NULL);
200 }
201 delete incl;
202 } else {
203 status = U_MEMORY_ALLOCATION_ERROR;
204 }
205 }
206 return INCLUSIONS[src];
207}
208
374ca955
A
209// helper functions for matching of pattern syntax pieces ------------------ ***
210// these functions are parallel to the PERL_OPEN etc. strings above
211
212// using these functions is not only faster than UnicodeString::compare() and
213// caseCompare(), but they also make UnicodeSet work for simple patterns when
214// no Unicode properties data is available - when caseCompare() fails
215
216static inline UBool
217isPerlOpen(const UnicodeString &pattern, int32_t pos) {
218 UChar c;
219 return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P);
220}
221
73c04bcf 222/*static inline UBool
374ca955
A
223isPerlClose(const UnicodeString &pattern, int32_t pos) {
224 return pattern.charAt(pos)==CLOSE_BRACE;
73c04bcf 225}*/
374ca955
A
226
227static inline UBool
228isNameOpen(const UnicodeString &pattern, int32_t pos) {
229 return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;
230}
231
232static inline UBool
233isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
234 return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;
235}
236
73c04bcf 237/*static inline UBool
374ca955
A
238isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
239 return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
73c04bcf 240}*/
374ca955
A
241
242// TODO memory debugging provided inside uniset.cpp
243// could be made available here but probably obsolete with use of modern
244// memory leak checker tools
245#define _dbgct(me)
246
247//----------------------------------------------------------------
248// Constructors &c
249//----------------------------------------------------------------
250
251/**
252 * Constructs a set from the given pattern, optionally ignoring
253 * white space. See the class description for the syntax of the
254 * pattern language.
255 * @param pattern a string specifying what characters are in the set
256 */
257UnicodeSet::UnicodeSet(const UnicodeString& pattern,
258 UErrorCode& status) :
46f4442e
A
259 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
260 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
261 fFlags(0)
374ca955
A
262{
263 if(U_SUCCESS(status)){
264 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
265 /* test for NULL */
266 if(list == NULL) {
267 status = U_MEMORY_ALLOCATION_ERROR;
268 }else{
46f4442e 269 allocateStrings(status);
374ca955
A
270 applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
271 }
272 }
273 _dbgct(this);
274}
275
276/**
277 * Constructs a set from the given pattern, optionally ignoring
278 * white space. See the class description for the syntax of the
279 * pattern language.
280 * @param pattern a string specifying what characters are in the set
281 * @param options bitmask for options to apply to the pattern.
282 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
283 */
284UnicodeSet::UnicodeSet(const UnicodeString& pattern,
285 uint32_t options,
286 const SymbolTable* symbols,
287 UErrorCode& status) :
46f4442e
A
288 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
289 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
290 fFlags(0)
374ca955
A
291{
292 if(U_SUCCESS(status)){
293 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
294 /* test for NULL */
295 if(list == NULL) {
296 status = U_MEMORY_ALLOCATION_ERROR;
297 }else{
46f4442e 298 allocateStrings(status);
374ca955
A
299 applyPattern(pattern, options, symbols, status);
300 }
301 }
302 _dbgct(this);
303}
304
305UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
306 uint32_t options,
307 const SymbolTable* symbols,
308 UErrorCode& status) :
46f4442e
A
309 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
310 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
311 fFlags(0)
374ca955
A
312{
313 if(U_SUCCESS(status)){
314 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
315 /* test for NULL */
316 if(list == NULL) {
317 status = U_MEMORY_ALLOCATION_ERROR;
318 }else{
46f4442e 319 allocateStrings(status);
374ca955
A
320 applyPattern(pattern, pos, options, symbols, status);
321 }
322 }
323 _dbgct(this);
324}
325
374ca955
A
326//----------------------------------------------------------------
327// Public API
328//----------------------------------------------------------------
329
330/**
331 * Modifies this set to represent the set specified by the given
332 * pattern, optionally ignoring white space. See the class
333 * description for the syntax of the pattern language.
334 * @param pattern a string specifying what characters are in the set
335 * @param ignoreSpaces if <code>true</code>, all spaces in the
336 * pattern are ignored. Spaces are those characters for which
337 * <code>uprv_isRuleWhiteSpace()</code> is <code>true</code>.
338 * Characters preceded by '\\' are escaped, losing any special
339 * meaning they otherwise have. Spaces may be included by
340 * escaping them.
341 * @exception <code>IllegalArgumentException</code> if the pattern
342 * contains a syntax error.
343 */
344UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
345 UErrorCode& status) {
346 return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
347}
348
349
350/**
351 * Modifies this set to represent the set specified by the given
352 * pattern, optionally ignoring white space. See the class
353 * description for the syntax of the pattern language.
354 * @param pattern a string specifying what characters are in the set
355 * @param options bitmask for options to apply to the pattern.
356 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
357 */
358UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
359 uint32_t options,
360 const SymbolTable* symbols,
361 UErrorCode& status) {
46f4442e 362 if (U_FAILURE(status) || isFrozen()) {
374ca955
A
363 return *this;
364 }
365
366 ParsePosition pos(0);
367 applyPattern(pattern, pos, options, symbols, status);
368 if (U_FAILURE(status)) return *this;
369
370 int32_t i = pos.getIndex();
371
372 if (options & USET_IGNORE_SPACE) {
373 // Skip over trailing whitespace
374 ICU_Utility::skipWhitespace(pattern, i, TRUE);
375 }
376
377 if (i != pattern.length()) {
378 status = U_ILLEGAL_ARGUMENT_ERROR;
379 }
380 return *this;
381}
382
383UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
384 ParsePosition& pos,
385 uint32_t options,
386 const SymbolTable* symbols,
387 UErrorCode& status) {
46f4442e 388 if (U_FAILURE(status) || isFrozen()) {
374ca955
A
389 return *this;
390 }
391 // Need to build the pattern in a temporary string because
392 // _applyPattern calls add() etc., which set pat to empty.
393 UnicodeString rebuiltPat;
394 RuleCharacterIterator chars(pattern, symbols, pos);
395 applyPattern(chars, symbols, rebuiltPat, options, status);
396 if (U_FAILURE(status)) return *this;
397 if (chars.inVariable()) {
398 // syntaxError(chars, "Extra chars in variable value");
399 status = U_MALFORMED_SET;
400 return *this;
401 }
46f4442e 402 setPattern(rebuiltPat);
374ca955
A
403 return *this;
404}
405
406/**
407 * Return true if the given position, in the given pattern, appears
408 * to be the start of a UnicodeSet pattern.
409 */
410UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
411 return ((pos+1) < pattern.length() &&
412 pattern.charAt(pos) == (UChar)91/*[*/) ||
413 resemblesPropertyPattern(pattern, pos);
414}
415
416//----------------------------------------------------------------
417// Implementation: Pattern parsing
418//----------------------------------------------------------------
419
420/**
421 * A small all-inline class to manage a UnicodeSet pointer. Add
422 * operator->() etc. as needed.
423 */
424class UnicodeSetPointer {
425 UnicodeSet* p;
426public:
427 inline UnicodeSetPointer() : p(0) {}
428 inline ~UnicodeSetPointer() { delete p; }
429 inline UnicodeSet* pointer() { return p; }
430 inline UBool allocate() {
431 if (p == 0) {
432 p = new UnicodeSet();
433 }
434 return p != 0;
435 }
436};
437
438/**
439 * Parse the pattern from the given RuleCharacterIterator. The
440 * iterator is advanced over the parsed pattern.
441 * @param chars iterator over the pattern characters. Upon return
442 * it will be advanced to the first character after the parsed
443 * pattern, or the end of the iteration if all characters are
444 * parsed.
445 * @param symbols symbol table to use to parse and dereference
446 * variables, or null if none.
447 * @param rebuiltPat the pattern that was parsed, rebuilt or
448 * copied from the input pattern, as appropriate.
449 * @param options a bit mask of zero or more of the following:
450 * IGNORE_SPACE, CASE.
451 */
452void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
453 const SymbolTable* symbols,
454 UnicodeString& rebuiltPat,
455 uint32_t options,
456 UErrorCode& ec) {
457 if (U_FAILURE(ec)) return;
458
459 // Syntax characters: [ ] ^ - & { }
460
461 // Recognized special forms for chars, sets: c-c s-s s&s
462
463 int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
464 RuleCharacterIterator::PARSE_ESCAPES;
465 if ((options & USET_IGNORE_SPACE) != 0) {
466 opts |= RuleCharacterIterator::SKIP_WHITESPACE;
467 }
468
469 UnicodeString patLocal, buf;
470 UBool usePat = FALSE;
471 UnicodeSetPointer scratch;
472 RuleCharacterIterator::Pos backup;
473
474 // mode: 0=before [, 1=between [...], 2=after ]
475 // lastItem: 0=none, 1=char, 2=set
476 int8_t lastItem = 0, mode = 0;
477 UChar32 lastChar = 0;
478 UChar op = 0;
479
480 UBool invert = FALSE;
481
// NOTE: the set is emptied before parsing starts; the early error
// returns below do not restore the previous contents.
482 clear();
483
// Loop until the closing ']' has been handled (mode == 2) or the
// iterator runs out of input.
484 while (mode != 2 && !chars.atEnd()) {
485 U_ASSERT((lastItem == 0 && op == 0) ||
486 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
487 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
488 op == INTERSECTION /*'&'*/)));
489
490 UChar32 c = 0;
491 UBool literal = FALSE;
492 UnicodeSet* nested = 0; // alias - do not delete
493
494 // -------- Check for property pattern
495
496 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
497 int8_t setMode = 0;
498 if (resemblesPropertyPattern(chars, opts)) {
499 setMode = 2;
500 }
501
502 // -------- Parse '[' of opening delimiter OR nested set.
503 // If there is a nested set, use `setMode' to define how
504 // the set should be parsed. If the '[' is part of the
505 // opening delimiter for this pattern, parse special
506 // strings "[", "[^", "[-", and "[^-". Check for stand-in
507 // characters representing a nested set in the symbol
508 // table.
509
510 else {
511 // Prepare to backup if necessary
512 chars.getPos(backup);
513 c = chars.next(opts, literal, ec);
514 if (U_FAILURE(ec)) return;
515
516 if (c == 0x5B /*'['*/ && !literal) {
517 if (mode == 1) {
518 chars.setPos(backup); // backup
519 setMode = 1;
520 } else {
521 // Handle opening '[' delimiter
522 mode = 1;
523 patLocal.append((UChar) 0x5B /*'['*/);
524 chars.getPos(backup); // prepare to backup
525 c = chars.next(opts, literal, ec);
526 if (U_FAILURE(ec)) return;
527 if (c == 0x5E /*'^'*/ && !literal) {
528 invert = TRUE;
529 patLocal.append((UChar) 0x5E /*'^'*/);
530 chars.getPos(backup); // prepare to backup
531 c = chars.next(opts, literal, ec);
532 if (U_FAILURE(ec)) return;
533 }
534 // Fall through to handle special leading '-';
535 // otherwise restart loop for nested [], \p{}, etc.
536 if (c == HYPHEN /*'-'*/) {
537 literal = TRUE;
538 // Fall through to handle literal '-' below
539 } else {
540 chars.setPos(backup); // backup
541 continue;
542 }
543 }
544 } else if (symbols != 0) {
545 const UnicodeFunctor *m = symbols->lookupMatcher(c);
546 if (m != 0) {
547 if (m->getDynamicClassID() != UnicodeSet::getStaticClassID()) {
548 ec = U_MALFORMED_SET;
549 return;
550 }
551 // casting away const, but `nested' won't be modified
552 // (important not to modify stored set)
553 nested = (UnicodeSet*) m;
554 setMode = 3;
555 }
556 }
557 }
558
559 // -------- Handle a nested set. This either is inline in
560 // the pattern or represented by a stand-in that has
561 // previously been parsed and was looked up in the symbol
562 // table.
563
564 if (setMode != 0) {
565 if (lastItem == 1) {
566 if (op != 0) {
567 // syntaxError(chars, "Char expected after operator");
568 ec = U_MALFORMED_SET;
569 return;
570 }
571 add(lastChar, lastChar);
572 _appendToPat(patLocal, lastChar, FALSE);
573 lastItem = 0;
574 op = 0;
575 }
576
577 if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
578 patLocal.append(op);
579 }
580
581 if (nested == 0) {
582 // lazy allocation
583 if (!scratch.allocate()) {
584 ec = U_MEMORY_ALLOCATION_ERROR;
585 return;
586 }
587 nested = scratch.pointer();
588 }
589 switch (setMode) {
590 case 1:
// Recursive call for the nested [...] set; patLocal is passed as
// the nested call's rebuiltPat argument.
// NOTE(review): unlike case 2, ec is not checked right after this
// call, and there is no nesting-depth limit here -- confirm both
// are acceptable.
591 nested->applyPattern(chars, symbols, patLocal, options, ec);
592 break;
593 case 2:
594 chars.skipIgnored(opts);
595 nested->applyPropertyPattern(chars, patLocal, ec);
596 if (U_FAILURE(ec)) return;
597 break;
598 case 3: // `nested' already parsed
599 nested->_toPattern(patLocal, FALSE);
600 break;
601 }
602
603 usePat = TRUE;
604
605 if (mode == 0) {
606 // Entire pattern is a category; leave parse loop
607 *this = *nested;
608 mode = 2;
609 break;
610 }
611
// Combine the nested set with this set according to the pending
// operator: '-' removes, '&' intersects, no operator adds.
612 switch (op) {
613 case HYPHEN: /*'-'*/
614 removeAll(*nested);
615 break;
616 case INTERSECTION: /*'&'*/
617 retainAll(*nested);
618 break;
619 case 0:
620 addAll(*nested);
621 break;
622 }
623
624 op = 0;
625 lastItem = 2;
626
627 continue;
628 }
629
630 if (mode == 0) {
631 // syntaxError(chars, "Missing '['");
632 ec = U_MALFORMED_SET;
633 return;
634 }
635
636 // -------- Parse special (syntax) characters. If the
637 // current character is not special, or if it is escaped,
638 // then fall through and handle it below.
639
640 if (!literal) {
641 switch (c) {
642 case 0x5D /*']'*/:
643 if (lastItem == 1) {
644 add(lastChar, lastChar);
645 _appendToPat(patLocal, lastChar, FALSE);
646 }
647 // Treat final trailing '-' as a literal
648 if (op == HYPHEN /*'-'*/) {
649 add(op, op);
650 patLocal.append(op);
651 } else if (op == INTERSECTION /*'&'*/) {
652 // syntaxError(chars, "Trailing '&'");
653 ec = U_MALFORMED_SET;
654 return;
655 }
656 patLocal.append((UChar) 0x5D /*']'*/);
657 mode = 2;
658 continue;
659 case HYPHEN /*'-'*/:
660 if (op == 0) {
661 if (lastItem != 0) {
662 op = (UChar) c;
663 continue;
664 } else {
665 // Treat final trailing '-' as a literal
666 add(c, c);
667 c = chars.next(opts, literal, ec);
668 if (U_FAILURE(ec)) return;
669 if (c == 0x5D /*']'*/ && !literal) {
670 patLocal.append(HYPHEN_RIGHT_BRACE);
671 mode = 2;
672 continue;
673 }
674 }
675 }
676 // syntaxError(chars, "'-' not after char or set");
677 ec = U_MALFORMED_SET;
678 return;
679 case INTERSECTION /*'&'*/:
680 if (lastItem == 2 && op == 0) {
681 op = (UChar) c;
682 continue;
683 }
684 // syntaxError(chars, "'&' not after set");
685 ec = U_MALFORMED_SET;
686 return;
687 case 0x5E /*'^'*/:
688 // syntaxError(chars, "'^' not after '['");
689 ec = U_MALFORMED_SET;
690 return;
691 case 0x7B /*'{'*/:
692 if (op != 0) {
693 // syntaxError(chars, "Missing operand after operator");
694 ec = U_MALFORMED_SET;
695 return;
696 }
697 if (lastItem == 1) {
698 add(lastChar, lastChar);
699 _appendToPat(patLocal, lastChar, FALSE);
700 }
701 lastItem = 0;
702 buf.truncate(0);
703 {
// Collect the characters up to the unescaped '}' into buf.
704 UBool ok = FALSE;
705 while (!chars.atEnd()) {
706 c = chars.next(opts, literal, ec);
707 if (U_FAILURE(ec)) return;
708 if (c == 0x7D /*'}'*/ && !literal) {
709 ok = TRUE;
710 break;
711 }
712 buf.append(c);
713 }
714 if (buf.length() < 1 || !ok) {
715 // syntaxError(chars, "Invalid multicharacter string");
716 ec = U_MALFORMED_SET;
717 return;
718 }
719 }
720 // We have new string. Add it to set and continue;
721 // we don't need to drop through to the further
722 // processing
723 add(buf);
724 patLocal.append((UChar) 0x7B /*'{'*/);
725 _appendToPat(patLocal, buf, FALSE);
726 patLocal.append((UChar) 0x7D /*'}'*/);
727 continue;
728 case SymbolTable::SYMBOL_REF:
729 // symbols nosymbols
730 // [a-$] error error (ambiguous)
731 // [a$] anchor anchor
732 // [a-$x] var "x"* literal '$'
733 // [a-$.] error literal '$'
734 // *We won't get here in the case of var "x"
735 {
736 chars.getPos(backup);
737 c = chars.next(opts, literal, ec);
738 if (U_FAILURE(ec)) return;
739 UBool anchor = (c == 0x5D /*']'*/ && !literal);
740 if (symbols == 0 && !anchor) {
741 c = SymbolTable::SYMBOL_REF;
742 chars.setPos(backup);
743 break; // literal '$'
744 }
745 if (anchor && op == 0) {
746 if (lastItem == 1) {
747 add(lastChar, lastChar);
748 _appendToPat(patLocal, lastChar, FALSE);
749 }
750 add(U_ETHER);
751 usePat = TRUE;
752 patLocal.append((UChar) SymbolTable::SYMBOL_REF);
753 patLocal.append((UChar) 0x5D /*']'*/);
754 mode = 2;
755 continue;
756 }
757 // syntaxError(chars, "Unquoted '$'");
758 ec = U_MALFORMED_SET;
759 return;
760 }
761 default:
762 break;
763 }
764 }
765
766 // -------- Parse literal characters. This includes both
767 // escaped chars ("\u4E01") and non-syntax characters
768 // ("a").
769
770 switch (lastItem) {
771 case 0:
772 lastItem = 1;
773 lastChar = c;
774 break;
775 case 1:
776 if (op == HYPHEN /*'-'*/) {
777 if (lastChar >= c) {
778 // Don't allow redundant (a-a) or empty (b-a) ranges;
779 // these are most likely typos.
780 // syntaxError(chars, "Invalid range");
781 ec = U_MALFORMED_SET;
782 return;
783 }
784 add(lastChar, c);
785 _appendToPat(patLocal, lastChar, FALSE);
786 patLocal.append(op);
787 _appendToPat(patLocal, c, FALSE);
788 lastItem = 0;
789 op = 0;
790 } else {
791 add(lastChar, lastChar);
792 _appendToPat(patLocal, lastChar, FALSE);
793 lastChar = c;
794 }
795 break;
796 case 2:
797 if (op != 0) {
798 // syntaxError(chars, "Set expected after operator");
799 ec = U_MALFORMED_SET;
800 return;
801 }
802 lastChar = c;
803 lastItem = 1;
804 break;
805 }
806 }
807
808 if (mode != 2) {
809 // syntaxError(chars, "Missing ']'");
810 ec = U_MALFORMED_SET;
811 return;
812 }
813
814 chars.skipIgnored(opts);
815
816 /**
817 * Handle global flags (invert, case insensitivity). If this
818 * pattern should be compiled case-insensitive, then we need
819 * to close over case BEFORE COMPLEMENTING. This makes
820 * patterns like /[^abc]/i work.
821 */
822 if ((options & USET_CASE_INSENSITIVE) != 0) {
73c04bcf 823 closeOver(USET_CASE_INSENSITIVE);
374ca955
A
824 }
825 else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
826 closeOver(USET_ADD_CASE_MAPPINGS);
827 }
828 if (invert) {
829 complement();
830 }
831
832 // Use the rebuilt pattern (patLocal) only if necessary. Prefer the
833 // generated pattern.
834 if (usePat) {
835 rebuiltPat.append(patLocal);
836 } else {
837 _generatePattern(rebuiltPat, FALSE);
838 }
46f4442e
A
839 if (isBogus() && U_SUCCESS(ec)) {
840 // We likely ran out of memory. AHHH!
841 ec = U_MEMORY_ALLOCATION_ERROR;
842 }
374ca955
A
843}
844
845//----------------------------------------------------------------
846// Property set implementation
847//----------------------------------------------------------------
848
849static UBool numericValueFilter(UChar32 ch, void* context) {
850 return u_getNumericValue(ch) == *(double*)context;
851}
852
853static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
854 int32_t value = *(int32_t*)context;
855 return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
856}
857
858static UBool versionFilter(UChar32 ch, void* context) {
859 UVersionInfo v, none = { 0, 0, 0, 0};
860 UVersionInfo* version = (UVersionInfo*)context;
861 u_charAge(ch, v);
862 return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
863}
864
// Context handed to intPropertyFilter() through its void* argument:
// the property to query and the value a code point must have to match.
865typedef struct {
866 UProperty prop;
867 int32_t value;
868} IntPropertyContext;
869
870static UBool intPropertyFilter(UChar32 ch, void* context) {
871 IntPropertyContext* c = (IntPropertyContext*)context;
872 return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
873}
874
875
876/**
877 * Generic filter-based scanning code for UCD property UnicodeSets.
878 */
879void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
880 void* context,
881 int32_t src,
882 UErrorCode &status) {
883 // Walk through all Unicode characters, noting the start
884 // and end of each range for which filter.contain(c) is
885 // true. Add each range to a set.
886 //
887 // To improve performance, use the INCLUSIONS set, which
888 // encodes information about character ranges that are known
889 // to have identical properties. INCLUSIONS contains
890 // only the first characters of such ranges.
891 //
892 // TODO Where possible, instead of scanning over code points,
893 // use internal property data to initialize UnicodeSets for
894 // those properties. Scanning code points is slow.
895 if (U_FAILURE(status)) return;
896
897 const UnicodeSet* inclusions = getInclusions(src, status);
898 if (U_FAILURE(status)) {
899 return;
900 }
901
902 clear();
903
904 UChar32 startHasProperty = -1;
46f4442e 905 int32_t limitRange = inclusions->getRangeCount();
374ca955
A
906
907 for (int j=0; j<limitRange; ++j) {
908 // get current range
909 UChar32 start = inclusions->getRangeStart(j);
910 UChar32 end = inclusions->getRangeEnd(j);
911
912 // for all the code points in the range, process
913 for (UChar32 ch = start; ch <= end; ++ch) {
914 // only add to this UnicodeSet on inflection points --
915 // where the hasProperty value changes to false
916 if ((*filter)(ch, context)) {
917 if (startHasProperty < 0) {
918 startHasProperty = ch;
919 }
920 } else if (startHasProperty >= 0) {
921 add(startHasProperty, ch-1);
922 startHasProperty = -1;
923 }
924 }
925 }
926 if (startHasProperty >= 0) {
927 add((UChar32)startHasProperty, (UChar32)0x10FFFF);
928 }
46f4442e
A
929 if (isBogus() && U_SUCCESS(status)) {
930 // We likely ran out of memory. AHHH!
931 status = U_MEMORY_ALLOCATION_ERROR;
932 }
374ca955
A
933}
934
935static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
936 /* Note: we use ' ' in compiler code page */
937 int32_t j = 0;
938 char ch;
939 --dstCapacity; /* make room for term. zero */
940 while ((ch = *src++) != 0) {
941 if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
942 continue;
943 }
944 if (j >= dstCapacity) return FALSE;
945 dst[j++] = ch;
946 }
947 if (j > 0 && dst[j-1] == ' ') --j;
948 dst[j] = 0;
949 return TRUE;
950}
951
952//----------------------------------------------------------------
953// Property set API
954//----------------------------------------------------------------
955
956#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;}
957
374ca955
A
958UnicodeSet&
959UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
46f4442e 960 if (U_FAILURE(ec) || isFrozen()) return *this;
374ca955
A
961
962 if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
963 applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
964 } else {
965 IntPropertyContext c = {prop, value};
966 applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
967 }
968 return *this;
969}
970
971UnicodeSet&
972UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
973 const UnicodeString& value,
974 UErrorCode& ec) {
46f4442e 975 if (U_FAILURE(ec) || isFrozen()) return *this;
374ca955
A
976
977 // prop and value used to be converted to char * using the default
978 // converter instead of the invariant conversion.
979 // This should not be necessary because all Unicode property and value
980 // names use only invariant characters.
981 // If there are any variant characters, then we won't find them anyway.
982 // Checking first avoids assertion failures in the conversion.
983 if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
984 !uprv_isInvariantUString(value.getBuffer(), value.length())
985 ) {
986 FAIL(ec);
987 }
988 CharString pname(prop);
989 CharString vname(value);
990
991 UProperty p;
992 int32_t v;
73c04bcf 993 UBool mustNotBeEmpty = FALSE, invert = FALSE;
374ca955
A
994
995 if (value.length() > 0) {
996 p = u_getPropertyEnum(pname);
997 if (p == UCHAR_INVALID_CODE) FAIL(ec);
998
999 // Treat gc as gcm
1000 if (p == UCHAR_GENERAL_CATEGORY) {
1001 p = UCHAR_GENERAL_CATEGORY_MASK;
1002 }
1003
1004 if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
1005 (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
1006 (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
1007 v = u_getPropertyValueEnum(p, vname);
1008 if (v == UCHAR_INVALID_CODE) {
1009 // Handle numeric CCC
1010 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
1011 p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
1012 p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
1013 char* end;
1014 double value = uprv_strtod(vname, &end);
1015 v = (int32_t) value;
1016 if (v != value || v < 0 || *end != 0) {
1017 // non-integral or negative value, or trailing junk
1018 FAIL(ec);
1019 }
1020 // If the resultant set is empty then the numeric value
1021 // was invalid.
1022 mustNotBeEmpty = TRUE;
1023 } else {
1024 FAIL(ec);
1025 }
1026 }
1027 }
1028
1029 else {
1030
1031 switch (p) {
1032 case UCHAR_NUMERIC_VALUE:
1033 {
1034 char* end;
1035 double value = uprv_strtod(vname, &end);
1036 if (*end != 0) {
1037 FAIL(ec);
1038 }
1039 applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec);
1040 return *this;
1041 }
1042 break;
1043 case UCHAR_NAME:
1044 case UCHAR_UNICODE_1_NAME:
1045 {
1046 // Must munge name, since u_charFromName() does not do
1047 // 'loose' matching.
1048 char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
1049 if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec);
1050 UCharNameChoice choice = (p == UCHAR_NAME) ?
1051 U_EXTENDED_CHAR_NAME : U_UNICODE_10_CHAR_NAME;
1052 UChar32 ch = u_charFromName(choice, buf, &ec);
1053 if (U_SUCCESS(ec)) {
1054 clear();
1055 add(ch);
1056 return *this;
1057 } else {
1058 FAIL(ec);
1059 }
1060 }
1061 break;
1062 case UCHAR_AGE:
1063 {
1064 // Must munge name, since u_versionFromString() does not do
1065 // 'loose' matching.
1066 char buf[128];
1067 if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec);
1068 UVersionInfo version;
1069 u_versionFromString(version, buf);
73c04bcf 1070 applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);
374ca955
A
1071 return *this;
1072 }
1073 break;
1074 default:
1075 // p is a non-binary, non-enumerated property that we
1076 // don't support (yet).
1077 FAIL(ec);
1078 }
1079 }
1080 }
1081
1082 else {
1083 // value is empty. Interpret as General Category, Script, or
1084 // Binary property.
1085 p = UCHAR_GENERAL_CATEGORY_MASK;
1086 v = u_getPropertyValueEnum(p, pname);
1087 if (v == UCHAR_INVALID_CODE) {
1088 p = UCHAR_SCRIPT;
1089 v = u_getPropertyValueEnum(p, pname);
1090 if (v == UCHAR_INVALID_CODE) {
1091 p = u_getPropertyEnum(pname);
1092 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
1093 v = 1;
1094 } else if (0 == uprv_comparePropertyNames(ANY, pname)) {
1095 set(MIN_VALUE, MAX_VALUE);
1096 return *this;
1097 } else if (0 == uprv_comparePropertyNames(ASCII, pname)) {
1098 set(0, 0x7F);
1099 return *this;
73c04bcf
A
1100 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname)) {
1101 // [:Assigned:]=[:^Cn:]
1102 p = UCHAR_GENERAL_CATEGORY_MASK;
1103 v = U_GC_CN_MASK;
1104 invert = TRUE;
374ca955 1105 } else {
374ca955
A
1106 FAIL(ec);
1107 }
1108 }
1109 }
1110 }
1111
1112 applyIntPropertyValue(p, v, ec);
73c04bcf
A
1113 if(invert) {
1114 complement();
1115 }
374ca955
A
1116
1117 if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
1118 // mustNotBeEmpty is set to true if an empty set indicates
1119 // invalid input.
1120 ec = U_ILLEGAL_ARGUMENT_ERROR;
1121 }
1122
46f4442e
A
1123 if (isBogus() && U_SUCCESS(ec)) {
1124 // We likely ran out of memory. AHHH!
1125 ec = U_MEMORY_ALLOCATION_ERROR;
1126 }
374ca955
A
1127 return *this;
1128}
1129
1130//----------------------------------------------------------------
1131// Property set patterns
1132//----------------------------------------------------------------
1133
1134/**
1135 * Return true if the given position, in the given pattern, appears
1136 * to be the start of a property set pattern.
1137 */
1138UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
1139 int32_t pos) {
1140 // Patterns are at least 5 characters long
1141 if ((pos+5) > pattern.length()) {
1142 return FALSE;
1143 }
1144
1145 // Look for an opening [:, [:^, \p, or \P
1146 return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
1147}
1148
1149/**
1150 * Return true if the given iterator appears to point at a
1151 * property pattern. Regardless of the result, return with the
1152 * iterator unchanged.
1153 * @param chars iterator over the pattern characters. Upon return
1154 * it will be unchanged.
1155 * @param iterOpts RuleCharacterIterator options
1156 */
1157UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
1158 int32_t iterOpts) {
1159 // NOTE: literal will always be FALSE, because we don't parse escapes.
1160 UBool result = FALSE, literal;
1161 UErrorCode ec = U_ZERO_ERROR;
1162 iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1163 RuleCharacterIterator::Pos pos;
1164 chars.getPos(pos);
1165 UChar32 c = chars.next(iterOpts, literal, ec);
1166 if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {
1167 UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1168 literal, ec);
1169 result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :
1170 (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/);
1171 }
1172 chars.setPos(pos);
1173 return result && U_SUCCESS(ec);
1174}
1175
1176/**
1177 * Parse the given property pattern at the given parse position.
1178 */
1179UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1180 ParsePosition& ppos,
1181 UErrorCode &ec) {
1182 int32_t pos = ppos.getIndex();
1183
1184 UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1185 UBool isName = FALSE; // true for \N{pat}, o/w false
1186 UBool invert = FALSE;
1187
1188 if (U_FAILURE(ec)) return *this;
1189
1190 // Minimum length is 5 characters, e.g. \p{L}
1191 if ((pos+5) > pattern.length()) {
1192 FAIL(ec);
1193 }
1194
1195 // On entry, ppos should point to one of the following locations:
1196 // Look for an opening [:, [:^, \p, or \P
1197 if (isPOSIXOpen(pattern, pos)) {
1198 posix = TRUE;
1199 pos += 2;
1200 pos = ICU_Utility::skipWhitespace(pattern, pos);
1201 if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
1202 ++pos;
1203 invert = TRUE;
1204 }
1205 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
1206 UChar c = pattern.charAt(pos+1);
1207 invert = (c == UPPER_P);
1208 isName = (c == UPPER_N);
1209 pos += 2;
1210 pos = ICU_Utility::skipWhitespace(pattern, pos);
1211 if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) {
1212 // Syntax error; "\p" or "\P" not followed by "{"
1213 FAIL(ec);
1214 }
1215 } else {
1216 // Open delimiter not seen
1217 FAIL(ec);
1218 }
1219
1220 // Look for the matching close delimiter, either :] or }
1221 int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos);
1222 if (close < 0) {
1223 // Syntax error; close delimiter missing
1224 FAIL(ec);
1225 }
1226
1227 // Look for an '=' sign. If this is present, we will parse a
1228 // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1229 // pattern.
1230 int32_t equals = pattern.indexOf(EQUALS, pos);
1231 UnicodeString propName, valueName;
1232 if (equals >= 0 && equals < close && !isName) {
1233 // Equals seen; parse medium/long pattern
1234 pattern.extractBetween(pos, equals, propName);
1235 pattern.extractBetween(equals+1, close, valueName);
1236 }
1237
1238 else {
1239 // Handle case where no '=' is seen, and \N{}
1240 pattern.extractBetween(pos, close, propName);
1241
1242 // Handle \N{name}
1243 if (isName) {
1244 // This is a little inefficient since it means we have to
1245 // parse NAME_PROP back to UCHAR_NAME even though we already
1246 // know it's UCHAR_NAME. If we refactor the API to
1247 // support args of (UProperty, char*) then we can remove
1248 // NAME_PROP and make this a little more efficient.
1249 valueName = propName;
1250 propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
1251 }
1252 }
1253
1254 applyPropertyAlias(propName, valueName, ec);
1255
1256 if (U_SUCCESS(ec)) {
1257 if (invert) {
1258 complement();
1259 }
1260
1261 // Move to the limit position after the close delimiter if the
1262 // parse succeeded.
1263 ppos.setIndex(close + (posix ? 2 : 1));
1264 }
1265
1266 return *this;
1267}
1268
1269/**
1270 * Parse a property pattern.
1271 * @param chars iterator over the pattern characters. Upon return
1272 * it will be advanced to the first character after the parsed
1273 * pattern, or the end of the iteration if all characters are
1274 * parsed.
1275 * @param rebuiltPat the pattern that was parsed, rebuilt or
1276 * copied from the input pattern, as appropriate.
1277 */
1278void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1279 UnicodeString& rebuiltPat,
1280 UErrorCode& ec) {
1281 if (U_FAILURE(ec)) return;
1282 UnicodeString pattern;
1283 chars.lookahead(pattern);
1284 ParsePosition pos(0);
1285 applyPropertyPattern(pattern, pos, ec);
1286 if (U_FAILURE(ec)) return;
1287 if (pos.getIndex() == 0) {
1288 // syntaxError(chars, "Invalid property pattern");
1289 ec = U_MALFORMED_SET;
1290 return;
1291 }
1292 chars.jumpahead(pos.getIndex());
1293 rebuiltPat.append(pattern, 0, pos.getIndex());
1294}
1295
374ca955
A
1296//----------------------------------------------------------------
1297// Case folding API
1298//----------------------------------------------------------------
1299
1300// add the result of a full case mapping to the set
1301// use str as a temporary string to avoid constructing one
1302static inline void
1303addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) {
1304 if(result >= 0) {
1305 if(result > UCASE_MAX_STRING_LENGTH) {
1306 // add a single-code point case mapping
1307 set.add(result);
1308 } else {
1309 // add a string case mapping from full with length result
1310 str.setTo((UBool)FALSE, full, result);
1311 set.add(str);
1312 }
1313 }
1314 // result < 0: the code point mapped to itself, no need to add it
1315 // see ucase.h
1316}
1317
1318UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
46f4442e
A
1319 if (isFrozen() || isBogus()) {
1320 return *this;
1321 }
73c04bcf 1322 if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) {
374ca955 1323 UErrorCode status = U_ZERO_ERROR;
73c04bcf 1324 const UCaseProps *csp = ucase_getSingleton(&status);
374ca955 1325 if (U_SUCCESS(status)) {
73c04bcf
A
1326 UnicodeSet foldSet(*this);
1327 UnicodeString str;
1328 USetAdder sa = {
1329 (USet *)&foldSet,
1330 _set_add,
1331 _set_addRange,
1332 _set_addString,
46f4442e
A
1333 NULL, // don't need remove()
1334 NULL // don't need removeRange()
73c04bcf
A
1335 };
1336
1337 // start with input set to guarantee inclusion
1338 // USET_CASE: remove strings because the strings will actually be reduced (folded);
1339 // therefore, start with no strings and add only those needed
1340 if (attribute & USET_CASE_INSENSITIVE) {
1341 foldSet.strings->removeAllElements();
1342 }
1343
374ca955
A
1344 int32_t n = getRangeCount();
1345 UChar32 result;
1346 const UChar *full;
1347 int32_t locCache = 0;
1348
1349 for (int32_t i=0; i<n; ++i) {
1350 UChar32 start = getRangeStart(i);
1351 UChar32 end = getRangeEnd(i);
1352
73c04bcf
A
1353 if (attribute & USET_CASE_INSENSITIVE) {
1354 // full case closure
1355 for (UChar32 cp=start; cp<=end; ++cp) {
1356 ucase_addCaseClosure(csp, cp, &sa);
1357 }
1358 } else {
1359 // add case mappings
1360 // (does not add long s for regular s, or Kelvin for k, for example)
1361 for (UChar32 cp=start; cp<=end; ++cp) {
1362 result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache);
1363 addCaseMapping(foldSet, result, full, str);
374ca955 1364
73c04bcf
A
1365 result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache);
1366 addCaseMapping(foldSet, result, full, str);
374ca955 1367
73c04bcf
A
1368 result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache);
1369 addCaseMapping(foldSet, result, full, str);
374ca955 1370
73c04bcf
A
1371 result = ucase_toFullFolding(csp, cp, &full, 0);
1372 addCaseMapping(foldSet, result, full, str);
1373 }
374ca955
A
1374 }
1375 }
1376 if (strings != NULL && strings->size() > 0) {
73c04bcf
A
1377 if (attribute & USET_CASE_INSENSITIVE) {
1378 for (int32_t j=0; j<strings->size(); ++j) {
1379 str = *(const UnicodeString *) strings->elementAt(j);
1380 str.foldCase();
1381 if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) {
1382 foldSet.add(str); // does not map to code points: add the folded string itself
1383 }
1384 }
1385 } else {
1386 Locale root("");
374ca955 1387#if !UCONFIG_NO_BREAK_ITERATION
73c04bcf 1388 BreakIterator *bi = BreakIterator::createWordInstance(root, status);
374ca955 1389#endif
73c04bcf
A
1390 if (U_SUCCESS(status)) {
1391 const UnicodeString *pStr;
374ca955 1392
73c04bcf
A
1393 for (int32_t j=0; j<strings->size(); ++j) {
1394 pStr = (const UnicodeString *) strings->elementAt(j);
1395 (str = *pStr).toLower(root);
1396 foldSet.add(str);
374ca955 1397#if !UCONFIG_NO_BREAK_ITERATION
73c04bcf
A
1398 (str = *pStr).toTitle(bi, root);
1399 foldSet.add(str);
374ca955 1400#endif
73c04bcf
A
1401 (str = *pStr).toUpper(root);
1402 foldSet.add(str);
1403 (str = *pStr).foldCase();
1404 foldSet.add(str);
1405 }
374ca955 1406 }
374ca955 1407#if !UCONFIG_NO_BREAK_ITERATION
73c04bcf 1408 delete bi;
374ca955 1409#endif
73c04bcf 1410 }
374ca955
A
1411 }
1412 *this = foldSet;
1413 }
1414 }
1415 return *this;
1416}
1417
374ca955 1418U_NAMESPACE_END