2 *******************************************************************************
4 * Copyright (C) 2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: uniset_closure.cpp
10 * tab size: 8 (not used)
13 * created on: 2011may30
14 * created by: Markus W. Scherer
16 * UnicodeSet::closeOver() and related methods moved here from uniset_props.cpp
17 * to simplify dependencies.
18 * In particular, this depends on the BreakIterator, but the BreakIterator
19 * code also builds UnicodeSets from patterns and needs uniset_props.
22 #include "unicode/brkiter.h"
23 #include "unicode/locid.h"
24 #include "unicode/parsepos.h"
25 #include "unicode/uniset.h"
32 // initial storage. Must be >= 0
33 // *** same as in uniset.cpp ! ***
// In this file START_EXTRA seeds `capacity` in the constructors' initializer
// lists, and `capacity` in turn sizes the initial uprv_malloc() of `list`.
34 #define START_EXTRA 16
38 // TODO memory debugging provided inside uniset.cpp
39 // could be made available here but probably obsolete with use of modern
40 // memory leak checker tools
43 //----------------------------------------------------------------
45 //----------------------------------------------------------------
// Constructs a set by parsing `pattern` (with optional `symbols`).
// The visible initializers start the object with len 0, capacity START_EXTRA,
// and null/zero list, bmpSet, buffer, pat, strings and stringSpan.
// NOTE(review): this listing is missing lines -- the declarations of `options`
// and `status`, the opening/closing braces and the condition guarding the
// U_MEMORY_ALLOCATION_ERROR assignment are not visible here. Confirm against
// the complete file before changing anything.
47 UnicodeSet::UnicodeSet(const UnicodeString
& pattern
,
49 const SymbolTable
* symbols
,
51 len(0), capacity(START_EXTRA
), list(0), bmpSet(0), buffer(0),
52 bufferCapacity(0), patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
// Nothing is allocated or parsed when the incoming status already signals failure.
55 if(U_SUCCESS(status
)){
// Allocate the initial list: room for `capacity` UChar32 values.
56 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
// Allocation failure is reported through status.
// NOTE(review): presumably guarded by a NULL test on `list` -- guard not visible here.
59 status
= U_MEMORY_ALLOCATION_ERROR
;
// presumably sets up the `strings` container (helper defined elsewhere) -- TODO confirm
61 allocateStrings(status
);
// Parse the pattern into this set via the applyPattern overload without a ParsePosition.
62 applyPattern(pattern
, options
, symbols
, status
);
// Constructs a set by parsing `pattern` starting at the caller-supplied
// ParsePosition `pos` (with optional `symbols`).
// The visible initializers start the object with len 0, capacity START_EXTRA,
// and null/zero list, bmpSet, buffer, pat, strings and stringSpan.
// NOTE(review): this listing is missing lines -- the declarations of `options`
// and `status`, the braces and the condition guarding the
// U_MEMORY_ALLOCATION_ERROR assignment are not visible here. Confirm against
// the complete file before changing anything.
68 UnicodeSet::UnicodeSet(const UnicodeString
& pattern
, ParsePosition
& pos
,
70 const SymbolTable
* symbols
,
72 len(0), capacity(START_EXTRA
), list(0), bmpSet(0), buffer(0),
73 bufferCapacity(0), patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
// Nothing is allocated or parsed when the incoming status already signals failure.
76 if(U_SUCCESS(status
)){
// Allocate the initial list: room for `capacity` UChar32 values.
77 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
// Allocation failure is reported through status.
// NOTE(review): presumably guarded by a NULL test on `list` -- guard not visible here.
80 status
= U_MEMORY_ALLOCATION_ERROR
;
// presumably sets up the `strings` container (helper defined elsewhere) -- TODO confirm
82 allocateStrings(status
);
// Parse the pattern into this set via the applyPattern overload that takes `pos`.
83 applyPattern(pattern
, pos
, options
, symbols
, status
);
89 //----------------------------------------------------------------
91 //----------------------------------------------------------------
// Applies `pattern` to this set and requires the pattern to be consumed
// completely: after delegating to the ParsePosition overload, any text left
// between the final parse index and pattern.length() sets
// U_ILLEGAL_ARGUMENT_ERROR.
// NOTE(review): the declarations of `options`, `status` and the local `pos`,
// plus the closing braces and final return, are missing from this listing --
// confirm against the complete file.
93 UnicodeSet
& UnicodeSet::applyPattern(const UnicodeString
& pattern
,
95 const SymbolTable
* symbols
,
// Delegate the actual parsing to the overload that tracks a parse position.
98 applyPattern(pattern
, pos
, options
, symbols
, status
);
99 if (U_FAILURE(status
)) return *this;
// Index at which the delegated parse stopped.
101 int32_t i
= pos
.getIndex();
103 if (options
& USET_IGNORE_SPACE
) {
104 // Skip over trailing whitespace
105 ICU_Utility::skipWhitespace(pattern
, i
, TRUE
);
// Anything still unconsumed at this point is treated as an error.
108 if (i
!= pattern
.length()) {
109 status
= U_ILLEGAL_ARGUMENT_ERROR
;
// Applies `pattern` to this set starting at a parse position, reading the
// characters through a RuleCharacterIterator built from (pattern, symbols, pos).
// On success the canonical pattern text collected in `rebuiltPat` is stored
// with setPattern().
// Status codes assigned in the visible code: U_NO_WRITE_PERMISSION and
// U_MALFORMED_SET.
// NOTE(review): the `pos`/`options` parameter declarations, the condition in
// front of the U_NO_WRITE_PERMISSION assignment, several returns and the
// closing braces are missing from this listing -- confirm against the complete file.
114 UnicodeSet
& UnicodeSet::applyPattern(const UnicodeString
& pattern
,
117 const SymbolTable
* symbols
,
118 UErrorCode
& status
) {
119 if (U_FAILURE(status
)) {
// NOTE(review): guard not visible here; presumably this is the "set is frozen" case -- TODO confirm.
123 status
= U_NO_WRITE_PERMISSION
;
126 // Need to build the pattern in a temporary string because
127 // _applyPattern calls add() etc., which set pat to empty.
128 UnicodeString rebuiltPat
;
129 RuleCharacterIterator
chars(pattern
, symbols
, pos
);
// Run the iterator-based parser; &UnicodeSet::closeOver is handed over as the
// member function the parser may call for case closure.
130 applyPattern(chars
, symbols
, rebuiltPat
, options
, &UnicodeSet::closeOver
, status
);
131 if (U_FAILURE(status
)) return *this;
// Ending the parse while the iterator is still inside a variable value is malformed input.
132 if (chars
.inVariable()) {
133 // syntaxError(chars, "Extra chars in variable value");
134 status
= U_MALFORMED_SET
;
// Remember the rebuilt pattern text for this set.
137 setPattern(rebuiltPat
);
141 // USetAdder implementation
142 // Does not use uset.h to reduce code dependencies
// USetAdder-style callback: casts the opaque USet pointer to UnicodeSet and
// adds the single code point `c` to it.
// NOTE(review): the closing brace is missing from this listing.
143 static void U_CALLCONV
144 _set_add(USet
*set
, UChar32 c
) {
145 ((UnicodeSet
*)set
)->add(c
);
// USetAdder-style callback: casts the opaque USet pointer to UnicodeSet and
// calls its two-argument add(start, end) for the given code point range.
// NOTE(review): the closing brace is missing from this listing.
148 static void U_CALLCONV
149 _set_addRange(USet
*set
, UChar32 start
, UChar32 end
) {
150 ((UnicodeSet
*)set
)->add(start
, end
);
// USetAdder-style callback: casts the opaque USet pointer to UnicodeSet and
// adds the string (str, length) to it.
// The string is wrapped in a temporary UnicodeString whose first constructor
// argument is (length<0); per the ICU UnicodeString API this is presumably the
// isTerminated flag of the read-only aliasing constructor, i.e. a negative
// length means "NUL-terminated" -- TODO confirm against unistr.h.
// NOTE(review): the closing brace is missing from this listing.
153 static void U_CALLCONV
154 _set_addString(USet
*set
, const UChar
*str
, int32_t length
) {
155 ((UnicodeSet
*)set
)->add(UnicodeString((UBool
)(length
<0), str
, length
));
158 //----------------------------------------------------------------
160 //----------------------------------------------------------------
162 // add the result of a full case mapping to the set
163 // use str as a temporary string to avoid constructing one
// Parameters:
//   set    - set that receives the mapping
//   result - value returned by a ucase_toFull*() call (see the callers in closeOver)
//   full   - string pointer filled in by that same call
//   str    - scratch string, pointed at (full, result) for string mappings
// Three cases are distinguished by `result`, as the inline comments describe:
// a single code point, a string of length `result`, or (result < 0) nothing to add.
// NOTE(review): the return type line, the statements that actually add to
// `set`, the else branches and the closing braces are missing from this
// listing -- confirm against the complete file.
165 addCaseMapping(UnicodeSet
&set
, int32_t result
, const UChar
*full
, UnicodeString
&str
) {
// Values above UCASE_MAX_STRING_LENGTH are handled as a code point rather than a length.
167 if(result
> UCASE_MAX_STRING_LENGTH
) {
168 // add a single-code point case mapping
171 // add a string case mapping from full with length result
// setTo() is called with FALSE as its first argument, then the buffer and its
// length; presumably the non-terminated read-only alias form -- TODO confirm.
172 str
.setTo((UBool
)FALSE
, full
, result
);
176 // result < 0: the code point mapped to itself, no need to add it
180 UnicodeSet
& UnicodeSet::closeOver(int32_t attribute
) {
181 if (isFrozen() || isBogus()) {
184 if (attribute
& (USET_CASE_INSENSITIVE
| USET_ADD_CASE_MAPPINGS
)) {
185 const UCaseProps
*csp
= ucase_getSingleton();
187 UnicodeSet
foldSet(*this);
194 NULL
, // don't need remove()
195 NULL
// don't need removeRange()
198 // start with input set to guarantee inclusion
199 // USET_CASE: remove strings because the strings will actually be reduced (folded);
200 // therefore, start with no strings and add only those needed
201 if (attribute
& USET_CASE_INSENSITIVE
) {
202 foldSet
.strings
->removeAllElements();
205 int32_t n
= getRangeCount();
208 int32_t locCache
= 0;
210 for (int32_t i
=0; i
<n
; ++i
) {
211 UChar32 start
= getRangeStart(i
);
212 UChar32 end
= getRangeEnd(i
);
214 if (attribute
& USET_CASE_INSENSITIVE
) {
216 for (UChar32 cp
=start
; cp
<=end
; ++cp
) {
217 ucase_addCaseClosure(csp
, cp
, &sa
);
221 // (does not add long s for regular s, or Kelvin for k, for example)
222 for (UChar32 cp
=start
; cp
<=end
; ++cp
) {
223 result
= ucase_toFullLower(csp
, cp
, NULL
, NULL
, &full
, "", &locCache
);
224 addCaseMapping(foldSet
, result
, full
, str
);
226 result
= ucase_toFullTitle(csp
, cp
, NULL
, NULL
, &full
, "", &locCache
);
227 addCaseMapping(foldSet
, result
, full
, str
);
229 result
= ucase_toFullUpper(csp
, cp
, NULL
, NULL
, &full
, "", &locCache
);
230 addCaseMapping(foldSet
, result
, full
, str
);
232 result
= ucase_toFullFolding(csp
, cp
, &full
, 0);
233 addCaseMapping(foldSet
, result
, full
, str
);
237 if (strings
!= NULL
&& strings
->size() > 0) {
238 if (attribute
& USET_CASE_INSENSITIVE
) {
239 for (int32_t j
=0; j
<strings
->size(); ++j
) {
240 str
= *(const UnicodeString
*) strings
->elementAt(j
);
242 if(!ucase_addStringCaseClosure(csp
, str
.getBuffer(), str
.length(), &sa
)) {
243 foldSet
.add(str
); // does not map to code points: add the folded string itself
248 #if !UCONFIG_NO_BREAK_ITERATION
249 UErrorCode status
= U_ZERO_ERROR
;
250 BreakIterator
*bi
= BreakIterator::createWordInstance(root
, status
);
251 if (U_SUCCESS(status
)) {
253 const UnicodeString
*pStr
;
255 for (int32_t j
=0; j
<strings
->size(); ++j
) {
256 pStr
= (const UnicodeString
*) strings
->elementAt(j
);
257 (str
= *pStr
).toLower(root
);
259 #if !UCONFIG_NO_BREAK_ITERATION
260 (str
= *pStr
).toTitle(bi
, root
);
263 (str
= *pStr
).toUpper(root
);
265 (str
= *pStr
).foldCase();
268 #if !UCONFIG_NO_BREAK_ITERATION