1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2011, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: uniset_closure.cpp
12 * tab size: 8 (not used)
15 * created on: 2011may30
16 * created by: Markus W. Scherer
18 * UnicodeSet::closeOver() and related methods moved here from uniset_props.cpp
19 * to simplify dependencies.
20 * In particular, this depends on the BreakIterator, but the BreakIterator
21 * code also builds UnicodeSets from patterns and needs uniset_props.
24 #include "unicode/brkiter.h"
25 #include "unicode/locid.h"
26 #include "unicode/parsepos.h"
27 #include "unicode/uniset.h"
34 // initial storage. Must be >= 0
35 // *** same as in uniset.cpp ! ***
// In this file the value seeds the `capacity` member in the pattern
// constructors, which then request sizeof(UChar32) * capacity bytes for `list`.
36 #define START_EXTRA 16
40 // TODO memory debugging provided inside uniset.cpp
41 // could be made available here but probably obsolete with use of modern
42 // memory leak checker tools
45 //----------------------------------------------------------------
47 //----------------------------------------------------------------
// Pattern constructor (overload without a ParsePosition).
//
// Visible behavior:
//   - the members in the initializer list start out as 0/NULL, except
//     `capacity`, which starts at START_EXTRA;
//   - when `status` reports success on entry, `list` is allocated with
//     uprv_malloc for `capacity` UChar32 values;
//   - allocateStrings(status) is called, then `pattern`, `options`, `symbols`
//     and `status` are forwarded to applyPattern().
//
// NOTE(review): this listing omits several original lines: the declarations
// of `options` and `status`, the end of the initializer list, the branch that
// guards the U_MEMORY_ALLOCATION_ERROR assignment, and the closing braces.
// Presumably the error is set only when uprv_malloc returns NULL and the two
// calls below run only otherwise — confirm against the upstream file.
49 UnicodeSet::UnicodeSet(const UnicodeString
& pattern
,
51 const SymbolTable
* symbols
,
53 len(0), capacity(START_EXTRA
), list(0), bmpSet(0), buffer(0),
54 bufferCapacity(0), patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
57 if(U_SUCCESS(status
)){
58 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
61 status
= U_MEMORY_ALLOCATION_ERROR
;
63 allocateStrings(status
);
64 applyPattern(pattern
, options
, symbols
, status
);
// Pattern constructor (overload taking a ParsePosition).
//
// Visible behavior matches the other pattern constructor in this file, except
// that `pos` is also forwarded, so the applyPattern() overload that accepts a
// ParsePosition is the one invoked:
//   - members start out as 0/NULL, with `capacity` at START_EXTRA;
//   - when `status` reports success on entry, `list` is allocated with
//     uprv_malloc for `capacity` UChar32 values;
//   - allocateStrings(status) is called, then
//     applyPattern(pattern, pos, options, symbols, status).
//
// NOTE(review): the declarations of `options` and `status`, the guard around
// the U_MEMORY_ALLOCATION_ERROR assignment, and the closing braces are not
// present in this listing — confirm the exact control flow upstream.
70 UnicodeSet::UnicodeSet(const UnicodeString
& pattern
, ParsePosition
& pos
,
72 const SymbolTable
* symbols
,
74 len(0), capacity(START_EXTRA
), list(0), bmpSet(0), buffer(0),
75 bufferCapacity(0), patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
78 if(U_SUCCESS(status
)){
79 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
82 status
= U_MEMORY_ALLOCATION_ERROR
;
84 allocateStrings(status
);
85 applyPattern(pattern
, pos
, options
, symbols
, status
);
91 //----------------------------------------------------------------
93 //----------------------------------------------------------------
// applyPattern, whole-string overload.
//
// Delegates to the overload that takes a parse position, then checks that the
// parse consumed the entire `pattern`:
//   - if the delegated call left `status` in a failure state, returns *this
//     immediately;
//   - otherwise reads the index reached by `pos`; when `options` has
//     USET_IGNORE_SPACE set, trailing whitespace is skipped first;
//   - if the resulting index differs from pattern.length(), `status` is set to
//     U_ILLEGAL_ARGUMENT_ERROR.
//
// NOTE(review): the declarations of `options`, `status` and the local `pos`,
// plus the closing braces and the final return, are missing from this listing.
// Presumably `pos` starts at index 0 and the function ends by returning *this
// — confirm upstream.
95 UnicodeSet
& UnicodeSet::applyPattern(const UnicodeString
& pattern
,
97 const SymbolTable
* symbols
,
100 applyPattern(pattern
, pos
, options
, symbols
, status
);
101 if (U_FAILURE(status
)) return *this;
103 int32_t i
= pos
.getIndex();
105 if (options
& USET_IGNORE_SPACE
) {
106 // Skip over trailing whitespace
107 ICU_Utility::skipWhitespace(pattern
, i
, TRUE
);
110 if (i
!= pattern
.length()) {
111 status
= U_ILLEGAL_ARGUMENT_ERROR
;
// applyPattern, parse-position overload.
//
// Visible steps:
//   1. an early check on U_FAILURE(status) (its body is not shown here);
//   2. an assignment of U_NO_WRITE_PERMISSION to `status` (its guard is not
//      shown; presumably a frozen-set check like the isFrozen() test used in
//      closeOver() — confirm upstream);
//   3. the pattern is parsed through a RuleCharacterIterator over
//      (pattern, symbols, pos) by the iterator-based applyPattern overload,
//      which writes the canonical pattern text into the local `rebuiltPat` and
//      receives &UnicodeSet::closeOver as the case-closure callback;
//   4. on failure, returns *this without touching the stored pattern;
//   5. if the iterator is still inside a variable expansion after parsing,
//      `status` becomes U_MALFORMED_SET;
//   6. setPattern(rebuiltPat) stores the rebuilt pattern.
//
// NOTE(review): the `pos` and `options` parameter declarations, the bodies and
// closing braces of the guards, and the trailing return are absent from this
// listing, so whether step 6 still runs after step 5 cannot be determined here.
116 UnicodeSet
& UnicodeSet::applyPattern(const UnicodeString
& pattern
,
119 const SymbolTable
* symbols
,
120 UErrorCode
& status
) {
121 if (U_FAILURE(status
)) {
125 status
= U_NO_WRITE_PERMISSION
;
128 // Need to build the pattern in a temporary string because
129 // _applyPattern calls add() etc., which set pat to empty.
130 UnicodeString rebuiltPat
;
131 RuleCharacterIterator
chars(pattern
, symbols
, pos
);
132 applyPattern(chars
, symbols
, rebuiltPat
, options
, &UnicodeSet::closeOver
, 0, status
);
133 if (U_FAILURE(status
)) return *this;
134 if (chars
.inVariable()) {
135 // syntaxError(chars, "Extra chars in variable value");
136 status
= U_MALFORMED_SET
;
139 setPattern(rebuiltPat
);
143 // USetAdder implementation
144 // Does not use uset.h to reduce code dependencies
// Single-code-point callback: treats the opaque USet* as the UnicodeSet it
// really points to and adds code point `c` to it.
// NOTE(review): the closing brace of this function is missing from the listing.
145 static void U_CALLCONV
146 _set_add(USet
*set
, UChar32 c
) {
147 ((UnicodeSet
*)set
)->add(c
);
// Range callback: casts the opaque USet* back to UnicodeSet* and forwards
// `start` and `end` to the two-argument UnicodeSet::add().
// NOTE(review): the closing brace of this function is missing from the listing.
150 static void U_CALLCONV
151 _set_addRange(USet
*set
, UChar32 start
, UChar32 end
) {
152 ((UnicodeSet
*)set
)->add(start
, end
);
// String callback: wraps (`str`, `length`) in a temporary UnicodeString and
// adds that string to the UnicodeSet behind the opaque USet*.
// The first constructor argument is TRUE exactly when `length` is negative —
// presumably this marks `str` as NUL-terminated; confirm against the
// UnicodeString(UBool, const UChar *, int32_t) constructor documentation.
// NOTE(review): the closing brace of this function is missing from the listing.
155 static void U_CALLCONV
156 _set_addString(USet
*set
, const UChar
*str
, int32_t length
) {
157 ((UnicodeSet
*)set
)->add(UnicodeString((UBool
)(length
<0), str
, length
));
160 //----------------------------------------------------------------
162 //----------------------------------------------------------------
164 // add the result of a full case mapping to the set
165 // use str as a temporary string to avoid constructing one
//
// `result` selects one of three cases, per the comments in the body:
//   - result > UCASE_MAX_STRING_LENGTH: a single-code-point mapping;
//   - otherwise (and not negative): a string mapping, with `str` pointed at
//     `full` with length `result` via setTo();
//   - result < 0: the code point maps to itself and nothing is added.
//
// NOTE(review): the return-type line, the statements that add to `set`, the
// branch separating the string case from the negative case, and the closing
// braces are not present in this listing. Presumably the first case adds
// `result` itself as a code point and the second adds `str` — confirm upstream.
167 addCaseMapping(UnicodeSet
&set
, int32_t result
, const UChar
*full
, UnicodeString
&str
) {
169 if(result
> UCASE_MAX_STRING_LENGTH
) {
170 // add a single-code point case mapping
173 // add a string case mapping from full with length result
174 str
.setTo((UBool
)FALSE
, full
, result
);
178 // result < 0: the code point mapped to itself, no need to add it
182 UnicodeSet
& UnicodeSet::closeOver(int32_t attribute
) {
183 if (isFrozen() || isBogus()) {
186 if (attribute
& (USET_CASE_INSENSITIVE
| USET_ADD_CASE_MAPPINGS
)) {
188 UnicodeSet
foldSet(*this);
195 NULL
, // don't need remove()
196 NULL
// don't need removeRange()
199 // start with input set to guarantee inclusion
200 // USET_CASE: remove strings because the strings will actually be reduced (folded);
201 // therefore, start with no strings and add only those needed
202 if (attribute
& USET_CASE_INSENSITIVE
) {
203 foldSet
.strings
->removeAllElements();
206 int32_t n
= getRangeCount();
210 for (int32_t i
=0; i
<n
; ++i
) {
211 UChar32 start
= getRangeStart(i
);
212 UChar32 end
= getRangeEnd(i
);
214 if (attribute
& USET_CASE_INSENSITIVE
) {
216 for (UChar32 cp
=start
; cp
<=end
; ++cp
) {
217 ucase_addCaseClosure(cp
, &sa
);
221 // (does not add long s for regular s, or Kelvin for k, for example)
222 for (UChar32 cp
=start
; cp
<=end
; ++cp
) {
223 result
= ucase_toFullLower(cp
, NULL
, NULL
, &full
, UCASE_LOC_ROOT
);
224 addCaseMapping(foldSet
, result
, full
, str
);
226 result
= ucase_toFullTitle(cp
, NULL
, NULL
, &full
, UCASE_LOC_ROOT
);
227 addCaseMapping(foldSet
, result
, full
, str
);
229 result
= ucase_toFullUpper(cp
, NULL
, NULL
, &full
, UCASE_LOC_ROOT
);
230 addCaseMapping(foldSet
, result
, full
, str
);
232 result
= ucase_toFullFolding(cp
, &full
, 0);
233 addCaseMapping(foldSet
, result
, full
, str
);
237 if (strings
!= NULL
&& strings
->size() > 0) {
238 if (attribute
& USET_CASE_INSENSITIVE
) {
239 for (int32_t j
=0; j
<strings
->size(); ++j
) {
240 str
= *(const UnicodeString
*) strings
->elementAt(j
);
242 if(!ucase_addStringCaseClosure(str
.getBuffer(), str
.length(), &sa
)) {
243 foldSet
.add(str
); // does not map to code points: add the folded string itself
248 #if !UCONFIG_NO_BREAK_ITERATION
249 UErrorCode status
= U_ZERO_ERROR
;
250 BreakIterator
*bi
= BreakIterator::createWordInstance(root
, status
);
251 if (U_SUCCESS(status
)) {
253 const UnicodeString
*pStr
;
255 for (int32_t j
=0; j
<strings
->size(); ++j
) {
256 pStr
= (const UnicodeString
*) strings
->elementAt(j
);
257 (str
= *pStr
).toLower(root
);
259 #if !UCONFIG_NO_BREAK_ITERATION
260 (str
= *pStr
).toTitle(bi
, root
);
263 (str
= *pStr
).toUpper(root
);
265 (str
= *pStr
).foldCase();
268 #if !UCONFIG_NO_BREAK_ITERATION