1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2011, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: uniset_closure.cpp
12 * tab size: 8 (not used)
15 * created on: 2011may30
16 * created by: Markus W. Scherer
18 * UnicodeSet::closeOver() and related methods moved here from uniset_props.cpp
19 * to simplify dependencies.
20 * In particular, this depends on the BreakIterator, but the BreakIterator
21 * code also builds UnicodeSets from patterns and needs uniset_props.
24 #include "unicode/brkiter.h"
25 #include "unicode/locid.h"
26 #include "unicode/parsepos.h"
27 #include "unicode/uniset.h"
36 // TODO memory debugging provided inside uniset.cpp
37 // could be made available here but probably obsolete with use of modern
38 // memory leak checker tools
41 //----------------------------------------------------------------
43 //----------------------------------------------------------------
// Constructs a UnicodeSet from a pattern string.
// The visible body forwards pattern, options, symbols and status to
// applyPattern(); the outcome of parsing is reported through status.
// NOTE(review): the declarations of `options` and `status` are not visible
// in this extract -- confirm their types against the full source.
45 UnicodeSet::UnicodeSet(const UnicodeString
& pattern
,
47 const SymbolTable
* symbols
,
49 applyPattern(pattern
, options
, symbols
, status
);
// Constructs a UnicodeSet from a pattern string, with a caller-supplied
// ParsePosition.
// The visible body forwards pattern, pos, options, symbols and status to the
// ParsePosition overload of applyPattern(); pos is taken by non-const
// reference.
// NOTE(review): the declarations of `options` and `status` are not visible
// in this extract -- confirm their types against the full source.
53 UnicodeSet::UnicodeSet(const UnicodeString
& pattern
, ParsePosition
& pos
,
55 const SymbolTable
* symbols
,
57 applyPattern(pattern
, pos
, options
, symbols
, status
);
61 //----------------------------------------------------------------
63 //----------------------------------------------------------------
// Applies a pattern that must span the entire input string.
// Delegates to the ParsePosition overload, then checks that the parse
// consumed all of `pattern`; if USET_IGNORE_SPACE is set in `options`,
// trailing whitespace is skipped before that check. Any leftover text sets
// status to U_ILLEGAL_ARGUMENT_ERROR.
// Returns *this on the failure path shown below.
// NOTE(review): the declaration of `pos` and the final return are not
// visible in this extract -- presumably a local ParsePosition; confirm.
65 UnicodeSet
& UnicodeSet::applyPattern(const UnicodeString
& pattern
,
67 const SymbolTable
* symbols
,
70 applyPattern(pattern
, pos
, options
, symbols
, status
);
// Parsing failed: leave status as set by the delegate and stop here.
71 if (U_FAILURE(status
)) return *this;
// Index reached by the parse; compared against pattern.length() below.
73 int32_t i
= pos
.getIndex();
75 if (options
& USET_IGNORE_SPACE
) {
76 // Skip over trailing whitespace
77 ICU_Utility::skipWhitespace(pattern
, i
, TRUE
);
// Anything left after the parsed set (and optional whitespace) is an error.
80 if (i
!= pattern
.length()) {
81 status
= U_ILLEGAL_ARGUMENT_ERROR
;
// Applies a pattern starting at the position described by `pos`.
// Wraps pattern, symbols and pos in a RuleCharacterIterator and runs the
// iterator-based applyPattern(), collecting the canonical pattern text in a
// temporary string that is stored with setPattern() afterwards.
// Error outcomes visible here:
//   - a failing incoming status is tested first;
//   - U_NO_WRITE_PERMISSION is assigned on a path whose guarding condition
//     is not visible in this extract (NOTE(review): presumably a frozen-set
//     check -- confirm against the full source);
//   - U_MALFORMED_SET if the iterator is still inside a variable value
//     after parsing.
// Returns *this on the failure path shown below.
86 UnicodeSet
& UnicodeSet::applyPattern(const UnicodeString
& pattern
,
89 const SymbolTable
* symbols
,
91 if (U_FAILURE(status
)) {
95 status
= U_NO_WRITE_PERMISSION
;
98 // Need to build the pattern in a temporary string because
99 // _applyPattern calls add() etc., which set pat to empty.
100 UnicodeString rebuiltPat
;
101 RuleCharacterIterator
chars(pattern
, symbols
, pos
);
// &UnicodeSet::closeOver is passed as a member-function pointer to the
// iterator-based overload.
102 applyPattern(chars
, symbols
, rebuiltPat
, options
, &UnicodeSet::closeOver
, 0, status
);
103 if (U_FAILURE(status
)) return *this;
104 if (chars
.inVariable()) {
105 // syntaxError(chars, "Extra chars in variable value");
106 status
= U_MALFORMED_SET
;
// Store the rebuilt pattern text only after parsing has finished.
109 setPattern(rebuiltPat
);
113 // USetAdder implementation
114 // Does not use uset.h to reduce code dependencies
// Callback that adds the single code point c to a set.
// The opaque USet pointer is cast back to UnicodeSet* before calling add(c).
115 static void U_CALLCONV
116 _set_add(USet
*set
, UChar32 c
) {
117 ((UnicodeSet
*)set
)->add(c
);
// Callback that adds a code point range to a set.
// The opaque USet pointer is cast back to UnicodeSet* before calling
// add(start, end).
120 static void U_CALLCONV
121 _set_addRange(USet
*set
, UChar32 start
, UChar32 end
) {
122 ((UnicodeSet
*)set
)->add(start
, end
);
// Callback that adds a string to a set.
// The opaque USet pointer is cast back to UnicodeSet*, and a temporary
// UnicodeString built from (length<0, str, length) is passed to add().
// NOTE(review): the first constructor argument is (length<0); this looks
// like ICU's aliasing constructor, where it marks a NUL-terminated buffer
// -- confirm against the UnicodeString API.
125 static void U_CALLCONV
126 _set_addString(USet
*set
, const UChar
*str
, int32_t length
) {
127 ((UnicodeSet
*)set
)->add(UnicodeString((UBool
)(length
<0), str
, length
));
130 //----------------------------------------------------------------
132 //----------------------------------------------------------------
134 // add the result of a full case mapping to the set
135 // use str as a temporary string to avoid constructing one
// Adds the outcome of one full case mapping to `set`.
//   set    - set that receives the mapping
//   result - value returned by a ucase_toFull*() call; it selects one of
//            the three cases described by the comments in the body
//   full   - string output of that call, used when result is a length
//   str    - caller-provided scratch string, reused to avoid constructing
//            a new UnicodeString per call
// NOTE(review): the return type line and the statements that actually add
// to `set` are not visible in this extract -- confirm against the full
// source.
137 addCaseMapping(UnicodeSet
&set
, int32_t result
, const UChar
*full
, UnicodeString
&str
) {
139 if(result
> UCASE_MAX_STRING_LENGTH
) {
140 // add a single-code point case mapping
143 // add a string case mapping from full with length result
// setTo() is given the buffer `full` and length `result` directly, with
// FALSE as its first argument.
144 str
.setTo((UBool
)FALSE
, full
, result
);
148 // result < 0: the code point mapped to itself, no need to add it
152 UnicodeSet
& UnicodeSet::closeOver(int32_t attribute
) {
153 if (isFrozen() || isBogus()) {
156 if (attribute
& (USET_CASE_INSENSITIVE
| USET_ADD_CASE_MAPPINGS
)) {
158 UnicodeSet
foldSet(*this);
165 NULL
, // don't need remove()
166 NULL
// don't need removeRange()
169 // start with input set to guarantee inclusion
170 // USET_CASE: remove strings because the strings will actually be reduced (folded);
171 // therefore, start with no strings and add only those needed
172 if ((attribute
& USET_CASE_INSENSITIVE
) && foldSet
.hasStrings()) {
173 foldSet
.strings
->removeAllElements();
176 int32_t n
= getRangeCount();
180 for (int32_t i
=0; i
<n
; ++i
) {
181 UChar32 start
= getRangeStart(i
);
182 UChar32 end
= getRangeEnd(i
);
184 if (attribute
& USET_CASE_INSENSITIVE
) {
186 for (UChar32 cp
=start
; cp
<=end
; ++cp
) {
187 ucase_addCaseClosure(cp
, &sa
);
191 // (does not add long s for regular s, or Kelvin for k, for example)
192 for (UChar32 cp
=start
; cp
<=end
; ++cp
) {
193 result
= ucase_toFullLower(cp
, NULL
, NULL
, &full
, UCASE_LOC_ROOT
);
194 addCaseMapping(foldSet
, result
, full
, str
);
196 result
= ucase_toFullTitle(cp
, NULL
, NULL
, &full
, UCASE_LOC_ROOT
);
197 addCaseMapping(foldSet
, result
, full
, str
);
199 result
= ucase_toFullUpper(cp
, NULL
, NULL
, &full
, UCASE_LOC_ROOT
);
200 addCaseMapping(foldSet
, result
, full
, str
);
202 result
= ucase_toFullFolding(cp
, &full
, 0);
203 addCaseMapping(foldSet
, result
, full
, str
);
208 if (attribute
& USET_CASE_INSENSITIVE
) {
209 for (int32_t j
=0; j
<strings
->size(); ++j
) {
210 str
= *(const UnicodeString
*) strings
->elementAt(j
);
212 if(!ucase_addStringCaseClosure(str
.getBuffer(), str
.length(), &sa
)) {
213 foldSet
.add(str
); // does not map to code points: add the folded string itself
218 #if !UCONFIG_NO_BREAK_ITERATION
219 UErrorCode status
= U_ZERO_ERROR
;
220 BreakIterator
*bi
= BreakIterator::createWordInstance(root
, status
);
221 if (U_SUCCESS(status
)) {
223 const UnicodeString
*pStr
;
225 for (int32_t j
=0; j
<strings
->size(); ++j
) {
226 pStr
= (const UnicodeString
*) strings
->elementAt(j
);
227 (str
= *pStr
).toLower(root
);
229 #if !UCONFIG_NO_BREAK_ITERATION
230 (str
= *pStr
).toTitle(bi
, root
);
233 (str
= *pStr
).toUpper(root
);
235 (str
= *pStr
).foldCase();
238 #if !UCONFIG_NO_BREAK_ITERATION