]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/uniset_props.cpp
ICU-8.11.4.tar.gz
[apple/icu.git] / icuSources / common / uniset_props.cpp
CommitLineData
374ca955
A
1/*
2*******************************************************************************
3*
73c04bcf 4* Copyright (C) 1999-2006, International Business Machines
374ca955
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: uniset_props.cpp
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2004aug25
14* created by: Markus W. Scherer
15*
16* Character property dependent functions moved here from uniset.cpp
17*/
18
19#include "unicode/utypes.h"
20#include "unicode/uniset.h"
21#include "unicode/parsepos.h"
22#include "unicode/uchar.h"
23#include "unicode/uscript.h"
24#include "unicode/symtable.h"
25#include "unicode/uset.h"
26#include "unicode/locid.h"
27#include "unicode/brkiter.h"
28#include "uset_imp.h"
29#include "ruleiter.h"
30#include "cmemory.h"
374ca955
A
31#include "ucln_cmn.h"
32#include "util.h"
33#include "uvector.h"
34#include "uprops.h"
35#include "propname.h"
36#include "unormimp.h"
37#include "ucase.h"
73c04bcf 38#include "ubidi_props.h"
374ca955
A
39#include "uinvchar.h"
40#include "charstr.h"
41#include "cstring.h"
42#include "mutex.h"
43#include "uassert.h"
44#include "hash.h"
73c04bcf
A
45
// Number of elements in a statically sized array, as an int32_t.
// The whole expansion is parenthesized so that the macro behaves as a single
// operand when it is used inside a larger expression.
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
374ca955
A
47
48// initial storage. Must be >= 0
49// *** same as in uniset.cpp ! ***
50#define START_EXTRA 16
51
52// Define UChar constants using hex for EBCDIC compatibility
53// Used #define to reduce private static exports and memory access time.
54#define SET_OPEN ((UChar)0x005B) /*[*/
55#define SET_CLOSE ((UChar)0x005D) /*]*/
56#define HYPHEN ((UChar)0x002D) /*-*/
57#define COMPLEMENT ((UChar)0x005E) /*^*/
58#define COLON ((UChar)0x003A) /*:*/
59#define BACKSLASH ((UChar)0x005C) /*\*/
60#define INTERSECTION ((UChar)0x0026) /*&*/
61#define UPPER_U ((UChar)0x0055) /*U*/
62#define LOWER_U ((UChar)0x0075) /*u*/
63#define OPEN_BRACE ((UChar)123) /*{*/
64#define CLOSE_BRACE ((UChar)125) /*}*/
65#define UPPER_P ((UChar)0x0050) /*P*/
66#define LOWER_P ((UChar)0x0070) /*p*/
67#define UPPER_N ((UChar)78) /*N*/
68#define EQUALS ((UChar)0x003D) /*=*/
69
73c04bcf 70//static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:"
374ca955 71static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]"
73c04bcf 72//static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p"
374ca955 73static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}"
73c04bcf 74//static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N"
374ca955
A
75static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
76
77// Special property set IDs
78static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF]
79static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
73c04bcf 80static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
374ca955
A
81
82// Unicode name property alias
83#define NAME_PROP "na"
84#define NAME_PROP_LENGTH 2
85
374ca955
A
86/**
87 * Delimiter string used in patterns to close a category reference:
88 * ":]". Example: "[:Lu:]".
89 */
73c04bcf 90//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
374ca955
A
91
92U_NAMESPACE_BEGIN
93
// One cache slot per property data source (the array has UPROPS_SRC_COUNT
// entries); every slot starts out NULL.
// NOTE(review): presumably filled on demand by getInclusions(), which is not
// visible in this part of the file - confirm there.
94static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions()
95
374ca955
A
96// helper functions for matching of pattern syntax pieces ------------------ ***
97// these functions are parallel to the PERL_OPEN etc. strings above
98
99// using these functions is not only faster than UnicodeString::compare() and
100// caseCompare(), but they also make UnicodeSet work for simple patterns when
101// no Unicode properties data is available - when caseCompare() fails
102
103static inline UBool
104isPerlOpen(const UnicodeString &pattern, int32_t pos) {
105 UChar c;
106 return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P);
107}
108
73c04bcf 109/*static inline UBool
374ca955
A
110isPerlClose(const UnicodeString &pattern, int32_t pos) {
111 return pattern.charAt(pos)==CLOSE_BRACE;
73c04bcf 112}*/
374ca955
A
113
114static inline UBool
115isNameOpen(const UnicodeString &pattern, int32_t pos) {
116 return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;
117}
118
119static inline UBool
120isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
121 return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;
122}
123
73c04bcf 124/*static inline UBool
374ca955
A
125isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
126 return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
73c04bcf 127}*/
374ca955
A
128
129// TODO memory debugging provided inside uniset.cpp
130// could be made available here but probably obsolete with use of modern
131// memory leak checker tools
// Expands to nothing here, so each _dbgct(x) use in this file is an empty
// statement.
132#define _dbgct(me)
133
134//----------------------------------------------------------------
135// Constructors &c
136//----------------------------------------------------------------
137
138/**
139 * Constructs a set from the given pattern, optionally ignoring
140 * white space. See the class description for the syntax of the
141 * pattern language.
142 * @param pattern a string specifying what characters are in the set
143 */
144UnicodeSet::UnicodeSet(const UnicodeString& pattern,
145 UErrorCode& status) :
146 len(0), capacity(START_EXTRA), bufferCapacity(0),
147 list(0), buffer(0), strings(0)
148{
149 if(U_SUCCESS(status)){
150 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
151 /* test for NULL */
152 if(list == NULL) {
153 status = U_MEMORY_ALLOCATION_ERROR;
154 }else{
155 allocateStrings();
156 applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
157 }
158 }
159 _dbgct(this);
160}
161
162/**
163 * Constructs a set from the given pattern, optionally ignoring
164 * white space. See the class description for the syntax of the
165 * pattern language.
166 * @param pattern a string specifying what characters are in the set
167 * @param options bitmask for options to apply to the pattern.
168 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
169 */
170UnicodeSet::UnicodeSet(const UnicodeString& pattern,
171 uint32_t options,
172 const SymbolTable* symbols,
173 UErrorCode& status) :
174 len(0), capacity(START_EXTRA), bufferCapacity(0),
175 list(0), buffer(0), strings(0)
176{
177 if(U_SUCCESS(status)){
178 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
179 /* test for NULL */
180 if(list == NULL) {
181 status = U_MEMORY_ALLOCATION_ERROR;
182 }else{
183 allocateStrings();
184 applyPattern(pattern, options, symbols, status);
185 }
186 }
187 _dbgct(this);
188}
189
190UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
191 uint32_t options,
192 const SymbolTable* symbols,
193 UErrorCode& status) :
194 len(0), capacity(START_EXTRA), bufferCapacity(0),
195 list(0), buffer(0), strings(0)
196{
197 if(U_SUCCESS(status)){
198 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
199 /* test for NULL */
200 if(list == NULL) {
201 status = U_MEMORY_ALLOCATION_ERROR;
202 }else{
203 allocateStrings();
204 applyPattern(pattern, pos, options, symbols, status);
205 }
206 }
207 _dbgct(this);
208}
209
374ca955
A
210//----------------------------------------------------------------
211// Public API
212//----------------------------------------------------------------
213
214/**
215 * Modifies this set to represent the set specified by the given
216 * pattern, optionally ignoring white space. See the class
217 * description for the syntax of the pattern language.
218 * @param pattern a string specifying what characters are in the set
219 * @param ignoreSpaces if <code>true</code>, all spaces in the
220 * pattern are ignored. Spaces are those characters for which
221 * <code>uprv_isRuleWhiteSpace()</code> is <code>true</code>.
222 * Characters preceded by '\\' are escaped, losing any special
223 * meaning they otherwise have. Spaces may be included by
224 * escaping them.
225 * @exception <code>IllegalArgumentException</code> if the pattern
226 * contains a syntax error.
227 */
228UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
229 UErrorCode& status) {
230 return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
231}
232
233
234/**
235 * Modifies this set to represent the set specified by the given
236 * pattern, optionally ignoring white space. See the class
237 * description for the syntax of the pattern language.
238 * @param pattern a string specifying what characters are in the set
239 * @param options bitmask for options to apply to the pattern.
240 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
241 */
242UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
243 uint32_t options,
244 const SymbolTable* symbols,
245 UErrorCode& status) {
246 if (U_FAILURE(status)) {
247 return *this;
248 }
249
250 ParsePosition pos(0);
251 applyPattern(pattern, pos, options, symbols, status);
252 if (U_FAILURE(status)) return *this;
253
254 int32_t i = pos.getIndex();
255
256 if (options & USET_IGNORE_SPACE) {
257 // Skip over trailing whitespace
258 ICU_Utility::skipWhitespace(pattern, i, TRUE);
259 }
260
261 if (i != pattern.length()) {
262 status = U_ILLEGAL_ARGUMENT_ERROR;
263 }
264 return *this;
265}
266
267UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
268 ParsePosition& pos,
269 uint32_t options,
270 const SymbolTable* symbols,
271 UErrorCode& status) {
272 if (U_FAILURE(status)) {
273 return *this;
274 }
275 // Need to build the pattern in a temporary string because
276 // _applyPattern calls add() etc., which set pat to empty.
277 UnicodeString rebuiltPat;
278 RuleCharacterIterator chars(pattern, symbols, pos);
279 applyPattern(chars, symbols, rebuiltPat, options, status);
280 if (U_FAILURE(status)) return *this;
281 if (chars.inVariable()) {
282 // syntaxError(chars, "Extra chars in variable value");
283 status = U_MALFORMED_SET;
284 return *this;
285 }
286 pat = rebuiltPat;
287 return *this;
288}
289
290/**
291 * Return true if the given position, in the given pattern, appears
292 * to be the start of a UnicodeSet pattern.
293 */
294UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
295 return ((pos+1) < pattern.length() &&
296 pattern.charAt(pos) == (UChar)91/*[*/) ||
297 resemblesPropertyPattern(pattern, pos);
298}
299
300//----------------------------------------------------------------
301// Implementation: Pattern parsing
302//----------------------------------------------------------------
303
304/**
305 * A small all-inline class to manage a UnicodeSet pointer. Add
306 * operator->() etc. as needed.
307 */
308class UnicodeSetPointer {
309 UnicodeSet* p;
310public:
311 inline UnicodeSetPointer() : p(0) {}
312 inline ~UnicodeSetPointer() { delete p; }
313 inline UnicodeSet* pointer() { return p; }
314 inline UBool allocate() {
315 if (p == 0) {
316 p = new UnicodeSet();
317 }
318 return p != 0;
319 }
320};
321
322/**
323 * Parse the pattern from the given RuleCharacterIterator. The
324 * iterator is advanced over the parsed pattern.
325 * @param chars iterator over the pattern characters. Upon return
326 * it will be advanced to the first character after the parsed
327 * pattern, or the end of the iteration if all characters are
328 * parsed.
329 * @param symbols symbol table to use to parse and dereference
330 * variables, or null if none.
331 * @param rebuiltPat the pattern that was parsed, rebuilt or
332 * copied from the input pattern, as appropriate.
333 * @param options a bit mask of zero or more of the following:
334 * IGNORE_SPACE, CASE.
335 */
336void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
337 const SymbolTable* symbols,
338 UnicodeString& rebuiltPat,
339 uint32_t options,
340 UErrorCode& ec) {
341 if (U_FAILURE(ec)) return;
342
343 // Syntax characters: [ ] ^ - & { }
344
345 // Recognized special forms for chars, sets: c-c s-s s&s
346
347 int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
348 RuleCharacterIterator::PARSE_ESCAPES;
349 if ((options & USET_IGNORE_SPACE) != 0) {
350 opts |= RuleCharacterIterator::SKIP_WHITESPACE;
351 }
352
353 UnicodeString patLocal, buf;
354 UBool usePat = FALSE;
355 UnicodeSetPointer scratch;
356 RuleCharacterIterator::Pos backup;
357
358 // mode: 0=before [, 1=between [...], 2=after ]
359 // lastItem: 0=none, 1=char, 2=set
360 int8_t lastItem = 0, mode = 0;
361 UChar32 lastChar = 0;
362 UChar op = 0;
363
364 UBool invert = FALSE;
365
366 clear();
367
368 while (mode != 2 && !chars.atEnd()) {
369 U_ASSERT((lastItem == 0 && op == 0) ||
370 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
371 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
372 op == INTERSECTION /*'&'*/)));
373
374 UChar32 c = 0;
375 UBool literal = FALSE;
376 UnicodeSet* nested = 0; // alias - do not delete
377
378 // -------- Check for property pattern
379
380 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
381 int8_t setMode = 0;
382 if (resemblesPropertyPattern(chars, opts)) {
383 setMode = 2;
384 }
385
386 // -------- Parse '[' of opening delimiter OR nested set.
387 // If there is a nested set, use `setMode' to define how
388 // the set should be parsed. If the '[' is part of the
389 // opening delimiter for this pattern, parse special
390 // strings "[", "[^", "[-", and "[^-". Check for stand-in
391 // characters representing a nested set in the symbol
392 // table.
393
394 else {
395 // Prepare to backup if necessary
396 chars.getPos(backup);
397 c = chars.next(opts, literal, ec);
398 if (U_FAILURE(ec)) return;
399
400 if (c == 0x5B /*'['*/ && !literal) {
401 if (mode == 1) {
402 chars.setPos(backup); // backup
403 setMode = 1;
404 } else {
405 // Handle opening '[' delimiter
406 mode = 1;
407 patLocal.append((UChar) 0x5B /*'['*/);
408 chars.getPos(backup); // prepare to backup
409 c = chars.next(opts, literal, ec);
410 if (U_FAILURE(ec)) return;
411 if (c == 0x5E /*'^'*/ && !literal) {
412 invert = TRUE;
413 patLocal.append((UChar) 0x5E /*'^'*/);
414 chars.getPos(backup); // prepare to backup
415 c = chars.next(opts, literal, ec);
416 if (U_FAILURE(ec)) return;
417 }
418 // Fall through to handle special leading '-';
419 // otherwise restart loop for nested [], \p{}, etc.
420 if (c == HYPHEN /*'-'*/) {
421 literal = TRUE;
422 // Fall through to handle literal '-' below
423 } else {
424 chars.setPos(backup); // backup
425 continue;
426 }
427 }
428 } else if (symbols != 0) {
429 const UnicodeFunctor *m = symbols->lookupMatcher(c);
430 if (m != 0) {
431 if (m->getDynamicClassID() != UnicodeSet::getStaticClassID()) {
432 ec = U_MALFORMED_SET;
433 return;
434 }
435 // casting away const, but `nested' won't be modified
436 // (important not to modify stored set)
437 nested = (UnicodeSet*) m;
438 setMode = 3;
439 }
440 }
441 }
442
443 // -------- Handle a nested set. This either is inline in
444 // the pattern or represented by a stand-in that has
445 // previously been parsed and was looked up in the symbol
446 // table.
447
448 if (setMode != 0) {
449 if (lastItem == 1) {
450 if (op != 0) {
451 // syntaxError(chars, "Char expected after operator");
452 ec = U_MALFORMED_SET;
453 return;
454 }
455 add(lastChar, lastChar);
456 _appendToPat(patLocal, lastChar, FALSE);
457 lastItem = 0;
458 op = 0;
459 }
460
461 if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
462 patLocal.append(op);
463 }
464
465 if (nested == 0) {
466 // lazy allocation
467 if (!scratch.allocate()) {
468 ec = U_MEMORY_ALLOCATION_ERROR;
469 return;
470 }
471 nested = scratch.pointer();
472 }
473 switch (setMode) {
474 case 1:
475 nested->applyPattern(chars, symbols, patLocal, options, ec);
476 break;
477 case 2:
478 chars.skipIgnored(opts);
479 nested->applyPropertyPattern(chars, patLocal, ec);
480 if (U_FAILURE(ec)) return;
481 break;
482 case 3: // `nested' already parsed
483 nested->_toPattern(patLocal, FALSE);
484 break;
485 }
486
487 usePat = TRUE;
488
489 if (mode == 0) {
490 // Entire pattern is a category; leave parse loop
491 *this = *nested;
492 mode = 2;
493 break;
494 }
495
496 switch (op) {
497 case HYPHEN: /*'-'*/
498 removeAll(*nested);
499 break;
500 case INTERSECTION: /*'&'*/
501 retainAll(*nested);
502 break;
503 case 0:
504 addAll(*nested);
505 break;
506 }
507
508 op = 0;
509 lastItem = 2;
510
511 continue;
512 }
513
514 if (mode == 0) {
515 // syntaxError(chars, "Missing '['");
516 ec = U_MALFORMED_SET;
517 return;
518 }
519
520 // -------- Parse special (syntax) characters. If the
521 // current character is not special, or if it is escaped,
522 // then fall through and handle it below.
523
524 if (!literal) {
525 switch (c) {
526 case 0x5D /*']'*/:
527 if (lastItem == 1) {
528 add(lastChar, lastChar);
529 _appendToPat(patLocal, lastChar, FALSE);
530 }
531 // Treat final trailing '-' as a literal
532 if (op == HYPHEN /*'-'*/) {
533 add(op, op);
534 patLocal.append(op);
535 } else if (op == INTERSECTION /*'&'*/) {
536 // syntaxError(chars, "Trailing '&'");
537 ec = U_MALFORMED_SET;
538 return;
539 }
540 patLocal.append((UChar) 0x5D /*']'*/);
541 mode = 2;
542 continue;
543 case HYPHEN /*'-'*/:
544 if (op == 0) {
545 if (lastItem != 0) {
546 op = (UChar) c;
547 continue;
548 } else {
549 // Treat final trailing '-' as a literal
550 add(c, c);
551 c = chars.next(opts, literal, ec);
552 if (U_FAILURE(ec)) return;
553 if (c == 0x5D /*']'*/ && !literal) {
554 patLocal.append(HYPHEN_RIGHT_BRACE);
555 mode = 2;
556 continue;
557 }
558 }
559 }
560 // syntaxError(chars, "'-' not after char or set");
561 ec = U_MALFORMED_SET;
562 return;
563 case INTERSECTION /*'&'*/:
564 if (lastItem == 2 && op == 0) {
565 op = (UChar) c;
566 continue;
567 }
568 // syntaxError(chars, "'&' not after set");
569 ec = U_MALFORMED_SET;
570 return;
571 case 0x5E /*'^'*/:
572 // syntaxError(chars, "'^' not after '['");
573 ec = U_MALFORMED_SET;
574 return;
575 case 0x7B /*'{'*/:
576 if (op != 0) {
577 // syntaxError(chars, "Missing operand after operator");
578 ec = U_MALFORMED_SET;
579 return;
580 }
581 if (lastItem == 1) {
582 add(lastChar, lastChar);
583 _appendToPat(patLocal, lastChar, FALSE);
584 }
585 lastItem = 0;
586 buf.truncate(0);
587 {
588 UBool ok = FALSE;
589 while (!chars.atEnd()) {
590 c = chars.next(opts, literal, ec);
591 if (U_FAILURE(ec)) return;
592 if (c == 0x7D /*'}'*/ && !literal) {
593 ok = TRUE;
594 break;
595 }
596 buf.append(c);
597 }
598 if (buf.length() < 1 || !ok) {
599 // syntaxError(chars, "Invalid multicharacter string");
600 ec = U_MALFORMED_SET;
601 return;
602 }
603 }
604 // We have new string. Add it to set and continue;
605 // we don't need to drop through to the further
606 // processing
607 add(buf);
608 patLocal.append((UChar) 0x7B /*'{'*/);
609 _appendToPat(patLocal, buf, FALSE);
610 patLocal.append((UChar) 0x7D /*'}'*/);
611 continue;
612 case SymbolTable::SYMBOL_REF:
613 // symbols nosymbols
614 // [a-$] error error (ambiguous)
615 // [a$] anchor anchor
616 // [a-$x] var "x"* literal '$'
617 // [a-$.] error literal '$'
618 // *We won't get here in the case of var "x"
619 {
620 chars.getPos(backup);
621 c = chars.next(opts, literal, ec);
622 if (U_FAILURE(ec)) return;
623 UBool anchor = (c == 0x5D /*']'*/ && !literal);
624 if (symbols == 0 && !anchor) {
625 c = SymbolTable::SYMBOL_REF;
626 chars.setPos(backup);
627 break; // literal '$'
628 }
629 if (anchor && op == 0) {
630 if (lastItem == 1) {
631 add(lastChar, lastChar);
632 _appendToPat(patLocal, lastChar, FALSE);
633 }
634 add(U_ETHER);
635 usePat = TRUE;
636 patLocal.append((UChar) SymbolTable::SYMBOL_REF);
637 patLocal.append((UChar) 0x5D /*']'*/);
638 mode = 2;
639 continue;
640 }
641 // syntaxError(chars, "Unquoted '$'");
642 ec = U_MALFORMED_SET;
643 return;
644 }
645 default:
646 break;
647 }
648 }
649
650 // -------- Parse literal characters. This includes both
651 // escaped chars ("\u4E01") and non-syntax characters
652 // ("a").
653
654 switch (lastItem) {
655 case 0:
656 lastItem = 1;
657 lastChar = c;
658 break;
659 case 1:
660 if (op == HYPHEN /*'-'*/) {
661 if (lastChar >= c) {
662 // Don't allow redundant (a-a) or empty (b-a) ranges;
663 // these are most likely typos.
664 // syntaxError(chars, "Invalid range");
665 ec = U_MALFORMED_SET;
666 return;
667 }
668 add(lastChar, c);
669 _appendToPat(patLocal, lastChar, FALSE);
670 patLocal.append(op);
671 _appendToPat(patLocal, c, FALSE);
672 lastItem = 0;
673 op = 0;
674 } else {
675 add(lastChar, lastChar);
676 _appendToPat(patLocal, lastChar, FALSE);
677 lastChar = c;
678 }
679 break;
680 case 2:
681 if (op != 0) {
682 // syntaxError(chars, "Set expected after operator");
683 ec = U_MALFORMED_SET;
684 return;
685 }
686 lastChar = c;
687 lastItem = 1;
688 break;
689 }
690 }
691
692 if (mode != 2) {
693 // syntaxError(chars, "Missing ']'");
694 ec = U_MALFORMED_SET;
695 return;
696 }
697
698 chars.skipIgnored(opts);
699
700 /**
701 * Handle global flags (invert, case insensitivity). If this
702 * pattern should be compiled case-insensitive, then we need
703 * to close over case BEFORE COMPLEMENTING. This makes
704 * patterns like /[^abc]/i work.
705 */
706 if ((options & USET_CASE_INSENSITIVE) != 0) {
73c04bcf 707 closeOver(USET_CASE_INSENSITIVE);
374ca955
A
708 }
709 else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
710 closeOver(USET_ADD_CASE_MAPPINGS);
711 }
712 if (invert) {
713 complement();
714 }
715
716 // Use the rebuilt pattern (patLocal) only if necessary. Prefer the
717 // generated pattern.
718 if (usePat) {
719 rebuiltPat.append(patLocal);
720 } else {
721 _generatePattern(rebuiltPat, FALSE);
722 }
723}
724
725//----------------------------------------------------------------
726// Property set implementation
727//----------------------------------------------------------------
728
729static UBool numericValueFilter(UChar32 ch, void* context) {
730 return u_getNumericValue(ch) == *(double*)context;
731}
732
733static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
734 int32_t value = *(int32_t*)context;
735 return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
736}
737
738static UBool versionFilter(UChar32 ch, void* context) {
739 UVersionInfo v, none = { 0, 0, 0, 0};
740 UVersionInfo* version = (UVersionInfo*)context;
741 u_charAge(ch, v);
742 return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
743}
744
745typedef struct {
746 UProperty prop;
747 int32_t value;
748} IntPropertyContext;
749
750static UBool intPropertyFilter(UChar32 ch, void* context) {
751 IntPropertyContext* c = (IntPropertyContext*)context;
752 return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
753}
754
755
756/**
757 * Generic filter-based scanning code for UCD property UnicodeSets.
758 */
759void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
760 void* context,
761 int32_t src,
762 UErrorCode &status) {
763 // Walk through all Unicode characters, noting the start
764 // and end of each range for which filter.contain(c) is
765 // true. Add each range to a set.
766 //
767 // To improve performance, use the INCLUSIONS set, which
768 // encodes information about character ranges that are known
769 // to have identical properties. INCLUSIONS contains
770 // only the first characters of such ranges.
771 //
772 // TODO Where possible, instead of scanning over code points,
773 // use internal property data to initialize UnicodeSets for
774 // those properties. Scanning code points is slow.
775 if (U_FAILURE(status)) return;
776
777 const UnicodeSet* inclusions = getInclusions(src, status);
778 if (U_FAILURE(status)) {
779 return;
780 }
781
782 clear();
783
784 UChar32 startHasProperty = -1;
785 int limitRange = inclusions->getRangeCount();
786
787 for (int j=0; j<limitRange; ++j) {
788 // get current range
789 UChar32 start = inclusions->getRangeStart(j);
790 UChar32 end = inclusions->getRangeEnd(j);
791
792 // for all the code points in the range, process
793 for (UChar32 ch = start; ch <= end; ++ch) {
794 // only add to this UnicodeSet on inflection points --
795 // where the hasProperty value changes to false
796 if ((*filter)(ch, context)) {
797 if (startHasProperty < 0) {
798 startHasProperty = ch;
799 }
800 } else if (startHasProperty >= 0) {
801 add(startHasProperty, ch-1);
802 startHasProperty = -1;
803 }
804 }
805 }
806 if (startHasProperty >= 0) {
807 add((UChar32)startHasProperty, (UChar32)0x10FFFF);
808 }
809}
810
811static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
812 /* Note: we use ' ' in compiler code page */
813 int32_t j = 0;
814 char ch;
815 --dstCapacity; /* make room for term. zero */
816 while ((ch = *src++) != 0) {
817 if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
818 continue;
819 }
820 if (j >= dstCapacity) return FALSE;
821 dst[j++] = ch;
822 }
823 if (j > 0 && dst[j-1] == ' ') --j;
824 dst[j] = 0;
825 return TRUE;
826}
827
828//----------------------------------------------------------------
829// Property set API
830//----------------------------------------------------------------
831
// Sets ec to U_ILLEGAL_ARGUMENT_ERROR and returns *this from the enclosing
// function; usable only inside member functions where `return *this' is valid.
832#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;}
833
374ca955
A
834UnicodeSet&
835UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
836 if (U_FAILURE(ec)) return *this;
837
838 if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
839 applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
840 } else {
841 IntPropertyContext c = {prop, value};
842 applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
843 }
844 return *this;
845}
846
847UnicodeSet&
848UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
849 const UnicodeString& value,
850 UErrorCode& ec) {
851 if (U_FAILURE(ec)) return *this;
852
853 // prop and value used to be converted to char * using the default
854 // converter instead of the invariant conversion.
855 // This should not be necessary because all Unicode property and value
856 // names use only invariant characters.
857 // If there are any variant characters, then we won't find them anyway.
858 // Checking first avoids assertion failures in the conversion.
859 if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
860 !uprv_isInvariantUString(value.getBuffer(), value.length())
861 ) {
862 FAIL(ec);
863 }
864 CharString pname(prop);
865 CharString vname(value);
866
867 UProperty p;
868 int32_t v;
73c04bcf 869 UBool mustNotBeEmpty = FALSE, invert = FALSE;
374ca955
A
870
871 if (value.length() > 0) {
872 p = u_getPropertyEnum(pname);
873 if (p == UCHAR_INVALID_CODE) FAIL(ec);
874
875 // Treat gc as gcm
876 if (p == UCHAR_GENERAL_CATEGORY) {
877 p = UCHAR_GENERAL_CATEGORY_MASK;
878 }
879
880 if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
881 (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
882 (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
883 v = u_getPropertyValueEnum(p, vname);
884 if (v == UCHAR_INVALID_CODE) {
885 // Handle numeric CCC
886 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
887 p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
888 p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
889 char* end;
890 double value = uprv_strtod(vname, &end);
891 v = (int32_t) value;
892 if (v != value || v < 0 || *end != 0) {
893 // non-integral or negative value, or trailing junk
894 FAIL(ec);
895 }
896 // If the resultant set is empty then the numeric value
897 // was invalid.
898 mustNotBeEmpty = TRUE;
899 } else {
900 FAIL(ec);
901 }
902 }
903 }
904
905 else {
906
907 switch (p) {
908 case UCHAR_NUMERIC_VALUE:
909 {
910 char* end;
911 double value = uprv_strtod(vname, &end);
912 if (*end != 0) {
913 FAIL(ec);
914 }
915 applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec);
916 return *this;
917 }
918 break;
919 case UCHAR_NAME:
920 case UCHAR_UNICODE_1_NAME:
921 {
922 // Must munge name, since u_charFromName() does not do
923 // 'loose' matching.
924 char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
925 if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec);
926 UCharNameChoice choice = (p == UCHAR_NAME) ?
927 U_EXTENDED_CHAR_NAME : U_UNICODE_10_CHAR_NAME;
928 UChar32 ch = u_charFromName(choice, buf, &ec);
929 if (U_SUCCESS(ec)) {
930 clear();
931 add(ch);
932 return *this;
933 } else {
934 FAIL(ec);
935 }
936 }
937 break;
938 case UCHAR_AGE:
939 {
940 // Must munge name, since u_versionFromString() does not do
941 // 'loose' matching.
942 char buf[128];
943 if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec);
944 UVersionInfo version;
945 u_versionFromString(version, buf);
73c04bcf 946 applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);
374ca955
A
947 return *this;
948 }
949 break;
950 default:
951 // p is a non-binary, non-enumerated property that we
952 // don't support (yet).
953 FAIL(ec);
954 }
955 }
956 }
957
958 else {
959 // value is empty. Interpret as General Category, Script, or
960 // Binary property.
961 p = UCHAR_GENERAL_CATEGORY_MASK;
962 v = u_getPropertyValueEnum(p, pname);
963 if (v == UCHAR_INVALID_CODE) {
964 p = UCHAR_SCRIPT;
965 v = u_getPropertyValueEnum(p, pname);
966 if (v == UCHAR_INVALID_CODE) {
967 p = u_getPropertyEnum(pname);
968 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
969 v = 1;
970 } else if (0 == uprv_comparePropertyNames(ANY, pname)) {
971 set(MIN_VALUE, MAX_VALUE);
972 return *this;
973 } else if (0 == uprv_comparePropertyNames(ASCII, pname)) {
974 set(0, 0x7F);
975 return *this;
73c04bcf
A
976 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname)) {
977 // [:Assigned:]=[:^Cn:]
978 p = UCHAR_GENERAL_CATEGORY_MASK;
979 v = U_GC_CN_MASK;
980 invert = TRUE;
374ca955 981 } else {
374ca955
A
982 FAIL(ec);
983 }
984 }
985 }
986 }
987
988 applyIntPropertyValue(p, v, ec);
73c04bcf
A
989 if(invert) {
990 complement();
991 }
374ca955
A
992
993 if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
994 // mustNotBeEmpty is set to true if an empty set indicates
995 // invalid input.
996 ec = U_ILLEGAL_ARGUMENT_ERROR;
997 }
998
999 return *this;
1000}
1001
1002//----------------------------------------------------------------
1003// Property set patterns
1004//----------------------------------------------------------------
1005
1006/**
1007 * Return true if the given position, in the given pattern, appears
1008 * to be the start of a property set pattern.
1009 */
1010UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
1011 int32_t pos) {
1012 // Patterns are at least 5 characters long
1013 if ((pos+5) > pattern.length()) {
1014 return FALSE;
1015 }
1016
1017 // Look for an opening [:, [:^, \p, or \P
1018 return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
1019}
1020
1021/**
1022 * Return true if the given iterator appears to point at a
1023 * property pattern. Regardless of the result, return with the
1024 * iterator unchanged.
1025 * @param chars iterator over the pattern characters. Upon return
1026 * it will be unchanged.
1027 * @param iterOpts RuleCharacterIterator options
1028 */
1029UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
1030 int32_t iterOpts) {
1031 // NOTE: literal will always be FALSE, because we don't parse escapes.
1032 UBool result = FALSE, literal;
1033 UErrorCode ec = U_ZERO_ERROR;
1034 iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1035 RuleCharacterIterator::Pos pos;
1036 chars.getPos(pos);
1037 UChar32 c = chars.next(iterOpts, literal, ec);
1038 if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {
1039 UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1040 literal, ec);
1041 result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :
1042 (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/);
1043 }
1044 chars.setPos(pos);
1045 return result && U_SUCCESS(ec);
1046}
1047
1048/**
1049 * Parse the given property pattern at the given parse position.
1050 */
1051UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1052 ParsePosition& ppos,
1053 UErrorCode &ec) {
1054 int32_t pos = ppos.getIndex();
1055
1056 UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1057 UBool isName = FALSE; // true for \N{pat}, o/w false
1058 UBool invert = FALSE;
1059
1060 if (U_FAILURE(ec)) return *this;
1061
1062 // Minimum length is 5 characters, e.g. \p{L}
1063 if ((pos+5) > pattern.length()) {
1064 FAIL(ec);
1065 }
1066
1067 // On entry, ppos should point to one of the following locations:
1068 // Look for an opening [:, [:^, \p, or \P
1069 if (isPOSIXOpen(pattern, pos)) {
1070 posix = TRUE;
1071 pos += 2;
1072 pos = ICU_Utility::skipWhitespace(pattern, pos);
1073 if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
1074 ++pos;
1075 invert = TRUE;
1076 }
1077 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
1078 UChar c = pattern.charAt(pos+1);
1079 invert = (c == UPPER_P);
1080 isName = (c == UPPER_N);
1081 pos += 2;
1082 pos = ICU_Utility::skipWhitespace(pattern, pos);
1083 if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) {
1084 // Syntax error; "\p" or "\P" not followed by "{"
1085 FAIL(ec);
1086 }
1087 } else {
1088 // Open delimiter not seen
1089 FAIL(ec);
1090 }
1091
1092 // Look for the matching close delimiter, either :] or }
1093 int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos);
1094 if (close < 0) {
1095 // Syntax error; close delimiter missing
1096 FAIL(ec);
1097 }
1098
1099 // Look for an '=' sign. If this is present, we will parse a
1100 // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1101 // pattern.
1102 int32_t equals = pattern.indexOf(EQUALS, pos);
1103 UnicodeString propName, valueName;
1104 if (equals >= 0 && equals < close && !isName) {
1105 // Equals seen; parse medium/long pattern
1106 pattern.extractBetween(pos, equals, propName);
1107 pattern.extractBetween(equals+1, close, valueName);
1108 }
1109
1110 else {
1111 // Handle case where no '=' is seen, and \N{}
1112 pattern.extractBetween(pos, close, propName);
1113
1114 // Handle \N{name}
1115 if (isName) {
1116 // This is a little inefficient since it means we have to
1117 // parse NAME_PROP back to UCHAR_NAME even though we already
1118 // know it's UCHAR_NAME. If we refactor the API to
1119 // support args of (UProperty, char*) then we can remove
1120 // NAME_PROP and make this a little more efficient.
1121 valueName = propName;
1122 propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
1123 }
1124 }
1125
1126 applyPropertyAlias(propName, valueName, ec);
1127
1128 if (U_SUCCESS(ec)) {
1129 if (invert) {
1130 complement();
1131 }
1132
1133 // Move to the limit position after the close delimiter if the
1134 // parse succeeded.
1135 ppos.setIndex(close + (posix ? 2 : 1));
1136 }
1137
1138 return *this;
1139}
1140
1141/**
1142 * Parse a property pattern.
1143 * @param chars iterator over the pattern characters. Upon return
1144 * it will be advanced to the first character after the parsed
1145 * pattern, or the end of the iteration if all characters are
1146 * parsed.
1147 * @param rebuiltPat the pattern that was parsed, rebuilt or
1148 * copied from the input pattern, as appropriate.
1149 */
1150void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1151 UnicodeString& rebuiltPat,
1152 UErrorCode& ec) {
1153 if (U_FAILURE(ec)) return;
1154 UnicodeString pattern;
1155 chars.lookahead(pattern);
1156 ParsePosition pos(0);
1157 applyPropertyPattern(pattern, pos, ec);
1158 if (U_FAILURE(ec)) return;
1159 if (pos.getIndex() == 0) {
1160 // syntaxError(chars, "Invalid property pattern");
1161 ec = U_MALFORMED_SET;
1162 return;
1163 }
1164 chars.jumpahead(pos.getIndex());
1165 rebuiltPat.append(pattern, 0, pos.getIndex());
1166}
1167
1168//----------------------------------------------------------------
1169// Inclusions list
1170//----------------------------------------------------------------
1171
1172U_CDECL_BEGIN
1173
1174// USetAdder implementation
1175// Does not use uset.h to reduce code dependencies
1176static void U_CALLCONV
1177_set_add(USet *set, UChar32 c) {
1178 ((UnicodeSet *)set)->add(c);
1179}
1180
1181static void U_CALLCONV
1182_set_addRange(USet *set, UChar32 start, UChar32 end) {
1183 ((UnicodeSet *)set)->add(start, end);
1184}
1185
1186static void U_CALLCONV
1187_set_addString(USet *set, const UChar *str, int32_t length) {
1188 ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
1189}
1190
1191/**
1192 * Cleanup function for UnicodeSet
1193 */
1194static UBool U_CALLCONV uset_cleanup(void) {
1195 int32_t i;
1196
1197 for(i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) {
1198 if (INCLUSIONS[i] != NULL) {
1199 delete INCLUSIONS[i];
1200 INCLUSIONS[i] = NULL;
1201 }
1202 }
1203
374ca955
A
1204 return TRUE;
1205}
1206
1207U_CDECL_END
1208
1209const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
1210 umtx_lock(NULL);
1211 UBool f = (INCLUSIONS[src] == NULL);
1212 umtx_unlock(NULL);
1213 if (f) {
1214 UnicodeSet* incl = new UnicodeSet();
1215 USetAdder sa = {
1216 (USet *)incl,
1217 _set_add,
1218 _set_addRange,
73c04bcf
A
1219 _set_addString,
1220 NULL // don't need remove()
374ca955
A
1221 };
1222
1223 if (incl != NULL) {
1224 switch(src) {
1225 case UPROPS_SRC_CHAR:
1226 uchar_addPropertyStarts(&sa, &status);
1227 break;
73c04bcf
A
1228 case UPROPS_SRC_PROPSVEC:
1229 upropsvec_addPropertyStarts(&sa, &status);
1230 break;
1231 case UPROPS_SRC_CHAR_AND_PROPSVEC:
1232 uchar_addPropertyStarts(&sa, &status);
1233 upropsvec_addPropertyStarts(&sa, &status);
1234 break;
374ca955
A
1235 case UPROPS_SRC_HST:
1236 uhst_addPropertyStarts(&sa, &status);
1237 break;
1238#if !UCONFIG_NO_NORMALIZATION
1239 case UPROPS_SRC_NORM:
1240 unorm_addPropertyStarts(&sa, &status);
1241 break;
1242#endif
1243 case UPROPS_SRC_CASE:
1244 ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status);
1245 break;
73c04bcf
A
1246 case UPROPS_SRC_BIDI:
1247 ubidi_addPropertyStarts(ubidi_getSingleton(&status), &sa, &status);
1248 break;
374ca955
A
1249 default:
1250 status = U_INTERNAL_PROGRAM_ERROR;
1251 break;
1252 }
1253 if (U_SUCCESS(status)) {
1254 umtx_lock(NULL);
1255 if (INCLUSIONS[src] == NULL) {
1256 INCLUSIONS[src] = incl;
1257 incl = NULL;
1258 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
1259 }
1260 umtx_unlock(NULL);
1261 }
1262 delete incl;
1263 } else {
1264 status = U_MEMORY_ALLOCATION_ERROR;
1265 }
1266 }
1267 return INCLUSIONS[src];
1268}
1269
1270//----------------------------------------------------------------
1271// Case folding API
1272//----------------------------------------------------------------
1273
1274// add the result of a full case mapping to the set
1275// use str as a temporary string to avoid constructing one
1276static inline void
1277addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) {
1278 if(result >= 0) {
1279 if(result > UCASE_MAX_STRING_LENGTH) {
1280 // add a single-code point case mapping
1281 set.add(result);
1282 } else {
1283 // add a string case mapping from full with length result
1284 str.setTo((UBool)FALSE, full, result);
1285 set.add(str);
1286 }
1287 }
1288 // result < 0: the code point mapped to itself, no need to add it
1289 // see ucase.h
1290}
1291
1292UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
73c04bcf 1293 if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) {
374ca955 1294 UErrorCode status = U_ZERO_ERROR;
73c04bcf 1295 const UCaseProps *csp = ucase_getSingleton(&status);
374ca955 1296 if (U_SUCCESS(status)) {
73c04bcf
A
1297 UnicodeSet foldSet(*this);
1298 UnicodeString str;
1299 USetAdder sa = {
1300 (USet *)&foldSet,
1301 _set_add,
1302 _set_addRange,
1303 _set_addString,
1304 NULL // don't need remove()
1305 };
1306
1307 // start with input set to guarantee inclusion
1308 // USET_CASE: remove strings because the strings will actually be reduced (folded);
1309 // therefore, start with no strings and add only those needed
1310 if (attribute & USET_CASE_INSENSITIVE) {
1311 foldSet.strings->removeAllElements();
1312 }
1313
374ca955
A
1314 int32_t n = getRangeCount();
1315 UChar32 result;
1316 const UChar *full;
1317 int32_t locCache = 0;
1318
1319 for (int32_t i=0; i<n; ++i) {
1320 UChar32 start = getRangeStart(i);
1321 UChar32 end = getRangeEnd(i);
1322
73c04bcf
A
1323 if (attribute & USET_CASE_INSENSITIVE) {
1324 // full case closure
1325 for (UChar32 cp=start; cp<=end; ++cp) {
1326 ucase_addCaseClosure(csp, cp, &sa);
1327 }
1328 } else {
1329 // add case mappings
1330 // (does not add long s for regular s, or Kelvin for k, for example)
1331 for (UChar32 cp=start; cp<=end; ++cp) {
1332 result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache);
1333 addCaseMapping(foldSet, result, full, str);
374ca955 1334
73c04bcf
A
1335 result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache);
1336 addCaseMapping(foldSet, result, full, str);
374ca955 1337
73c04bcf
A
1338 result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache);
1339 addCaseMapping(foldSet, result, full, str);
374ca955 1340
73c04bcf
A
1341 result = ucase_toFullFolding(csp, cp, &full, 0);
1342 addCaseMapping(foldSet, result, full, str);
1343 }
374ca955
A
1344 }
1345 }
1346 if (strings != NULL && strings->size() > 0) {
73c04bcf
A
1347 if (attribute & USET_CASE_INSENSITIVE) {
1348 for (int32_t j=0; j<strings->size(); ++j) {
1349 str = *(const UnicodeString *) strings->elementAt(j);
1350 str.foldCase();
1351 if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) {
1352 foldSet.add(str); // does not map to code points: add the folded string itself
1353 }
1354 }
1355 } else {
1356 Locale root("");
374ca955 1357#if !UCONFIG_NO_BREAK_ITERATION
73c04bcf 1358 BreakIterator *bi = BreakIterator::createWordInstance(root, status);
374ca955 1359#endif
73c04bcf
A
1360 if (U_SUCCESS(status)) {
1361 const UnicodeString *pStr;
374ca955 1362
73c04bcf
A
1363 for (int32_t j=0; j<strings->size(); ++j) {
1364 pStr = (const UnicodeString *) strings->elementAt(j);
1365 (str = *pStr).toLower(root);
1366 foldSet.add(str);
374ca955 1367#if !UCONFIG_NO_BREAK_ITERATION
73c04bcf
A
1368 (str = *pStr).toTitle(bi, root);
1369 foldSet.add(str);
374ca955 1370#endif
73c04bcf
A
1371 (str = *pStr).toUpper(root);
1372 foldSet.add(str);
1373 (str = *pStr).foldCase();
1374 foldSet.add(str);
1375 }
374ca955 1376 }
374ca955 1377#if !UCONFIG_NO_BREAK_ITERATION
73c04bcf 1378 delete bi;
374ca955 1379#endif
73c04bcf 1380 }
374ca955
A
1381 }
1382 *this = foldSet;
1383 }
1384 }
1385 return *this;
1386}
1387
374ca955 1388U_NAMESPACE_END