git.saurik.com Git - apple/icu.git/blob - icuSources/common/uniset

// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*
*******************************************************************************
*   file name:  uniset_closure.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2011may30
*   created by: Markus W. Scherer
*
*   UnicodeSet::closeOver() and related methods moved here from uniset_props.cpp
*   to simplify dependencies.
*   In particular, this depends on the BreakIterator, but the BreakIterator
*   code also builds UnicodeSets from patterns and needs uniset_props.
*/

24 #include "unicode/brkiter.h"

25 #include "unicode/locid.h"

26 #include "unicode/parsepos.h"

27 #include "unicode/uniset.h"

28 #include "cmemory.h"

29 #include "ruleiter.h"

30 #include "ucase.h"

31 #include "util.h"

32 #include "uvector.h"

34 U_NAMESPACE_BEGIN

// TODO: The memory-debugging hooks provided inside uniset.cpp could be made
// available here, but they are probably obsolete now that modern
// memory-leak checker tools are in use.

39 #define _dbgct(me)

41 //----------------------------------------------------------------

42 // Constructors &c

43 //----------------------------------------------------------------

 /**
  * Constructs a set from a pattern string.
  *
  * All of the work is delegated to the applyPattern() overload that takes the
  * same four arguments; any failure is reported through status.
  *
  * @param pattern  the pattern text to parse
  * @param options  bitmask of parsing options, forwarded unchanged
  * @param symbols  symbol table forwarded to the parser (passed through as-is)
  * @param status   in/out error code, written by applyPattern()
  */
 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
                        uint32_t options,
                        const SymbolTable* symbols,
                        UErrorCode& status) {
     this->applyPattern(pattern, options, symbols, status);
     // Debug-construction hook; the macro expands to nothing in this file.
     _dbgct(this);
 }

 /**
  * Constructs a set by parsing a pattern starting at a caller-supplied
  * parse position.
  *
  * All of the work is delegated to the applyPattern() overload that takes a
  * ParsePosition; any failure is reported through status.
  *
  * @param pattern  the text containing the pattern
  * @param pos      in/out parse position handed to applyPattern()
  * @param options  bitmask of parsing options, forwarded unchanged
  * @param symbols  symbol table forwarded to the parser (passed through as-is)
  * @param status   in/out error code, written by applyPattern()
  */
 UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
                        uint32_t options,
                        const SymbolTable* symbols,
                        UErrorCode& status) {
     this->applyPattern(pattern, pos, options, symbols, status);
     // Debug-construction hook; the macro expands to nothing in this file.
     _dbgct(this);
 }

61 //----------------------------------------------------------------

62 // Public API

63 //----------------------------------------------------------------

 /**
  * Replaces this set with the one described by pattern, requiring the
  * pattern to span the entire string.
  *
  * Parsing starts at index 0 through the ParsePosition overload. If that
  * succeeds, the index where parsing stopped must equal pattern.length();
  * when USET_IGNORE_SPACE is set in options, trailing whitespace is skipped
  * before that comparison. Leftover text sets status to
  * U_ILLEGAL_ARGUMENT_ERROR.
  *
  * @param pattern  the pattern text to parse
  * @param options  bitmask of parsing options; USET_IGNORE_SPACE is examined here
  * @param symbols  symbol table forwarded to the parser (passed through as-is)
  * @param status   in/out error code
  * @return a reference to this set
  */
 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
                                      uint32_t options,
                                      const SymbolTable* symbols,
                                      UErrorCode& status) {
     ParsePosition parsePos(0);
     applyPattern(pattern, parsePos, options, symbols, status);

     if (U_SUCCESS(status)) {
         int32_t stoppedAt = parsePos.getIndex();

         if ((options & USET_IGNORE_SPACE) != 0) {
             // Trailing whitespace does not count as leftover input.
             ICU_Utility::skipWhitespace(pattern, stoppedAt, TRUE);
         }

         // Anything still unconsumed means the string held more than a pattern.
         if (stoppedAt != pattern.length()) {
             status = U_ILLEGAL_ARGUMENT_ERROR;
         }
     }
     return *this;
 }

 /**
  * Replaces this set with the one described by the pattern found at pos.
  *
  * Does nothing if status already indicates failure. A frozen set is not
  * modified and status is set to U_NO_WRITE_PERMISSION. Otherwise the pattern
  * is parsed through a RuleCharacterIterator; if the iterator is still inside
  * a variable once parsing ends, status is set to U_MALFORMED_SET. On success
  * the rebuilt pattern text is stored with setPattern().
  *
  * @param pattern  the text containing the pattern
  * @param pos      in/out parse position handed to the rule iterator
  * @param options  bitmask of parsing options, forwarded unchanged
  * @param symbols  symbol table handed to the iterator and the parser
  * @param status   in/out error code
  * @return a reference to this set
  */
 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
                                      ParsePosition& pos,
                                      uint32_t options,
                                      const SymbolTable* symbols,
                                      UErrorCode& status) {
     if (U_FAILURE(status)) {
         return *this;
     }
     if (isFrozen()) {
         status = U_NO_WRITE_PERMISSION;
         return *this;
     }

     // The pattern text is assembled in a temporary because the parser calls
     // add() etc., which reset the stored pattern to empty.
     UnicodeString canonicalPattern;
     RuleCharacterIterator ruleChars(pattern, symbols, pos);
     applyPattern(ruleChars, symbols, canonicalPattern, options,
                  &UnicodeSet::closeOver, 0, status);

     if (U_SUCCESS(status)) {
         if (ruleChars.inVariable()) {
             // Extra characters were left in a variable value.
             status = U_MALFORMED_SET;
         } else {
             setPattern(canonicalPattern);
         }
     }
     return *this;
 }

112

// USetAdder implementation.
// Deliberately avoids uset.h to reduce code dependencies.

115 static void U_CALLCONV

 _set_add(USet *set, UChar32 c) {

     ((UnicodeSet *)set)->add(c);

118 }

119

120 static void U_CALLCONV

 _set_addRange(USet *set, UChar32 start, UChar32 end) {

     ((UnicodeSet *)set)->add(start, end);

123 }

124

125 static void U_CALLCONV

 _set_addString(USet *set, const UChar *str, int32_t length) {

     ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));

128 }

129

130 //----------------------------------------------------------------

131 // Case folding API

132 //----------------------------------------------------------------

133

134 // add the result of a full case mapping to the set

135 // use str as a temporary string to avoid constructing one

136 static inline void

 addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) {

     if(result >= 0) {

139 if(result > UCASE_MAX_STRING_LENGTH) {

140 // add a single-code point case mapping

141 set.add(result);

142 } else {

143 // add a string case mapping from full with length result

             str.setTo((UBool)FALSE, full, result);

145 set.add(str);

146 }

147 }

148 // result < 0: the code point mapped to itself, no need to add it

149 // see ucase.h

150 }

151

 /**
  * Closes this set over case relationships.
  *
  * Nothing happens if the set is frozen or bogus, or if attribute contains
  * neither USET_CASE_INSENSITIVE nor USET_ADD_CASE_MAPPINGS. When both bits
  * are set, USET_CASE_INSENSITIVE decides the behavior.
  *
  * - USET_CASE_INSENSITIVE: every code point gets its full case closure via
  *   ucase_addCaseClosure(). The existing strings are dropped from the result
  *   and replaced by the closure of their case-folded forms, or by the folded
  *   string itself when it does not map to code points.
  * - USET_ADD_CASE_MAPPINGS: the lower, title, upper and folded mappings of
  *   every code point and every string are added. With break iteration
  *   compiled in, string mappings are added only if a root-locale word
  *   BreakIterator can be created; otherwise the strings stay as they were.
  *
  * @param attribute  bitmask selecting the kind of closure
  * @return a reference to this set
  */
 UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
     if (isFrozen() || isBogus()) {
         return *this;
     }
     if ((attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) == 0) {
         return *this;
     }
     const bool fullClosure = (attribute & USET_CASE_INSENSITIVE) != 0;

     // Ranges and strings are read from *this while results accumulate in a
     // copy, so the input is guaranteed to be included in the output.
     UnicodeSet closure(*this);
     UnicodeString scratch;
     USetAdder adder = {
         closure.toUSet(),
         _set_add,
         _set_addRange,
         _set_addString,
         NULL,  // remove() is not needed
         NULL   // removeRange() is not needed
     };

     // Full closure reduces (folds) the strings, so start with none and add
     // back only those that turn out to be needed.
     if (fullClosure && closure.hasStrings()) {
         closure.strings->removeAllElements();
     }

     const int32_t rangeCount = getRangeCount();
     for (int32_t r = 0; r < rangeCount; ++r) {
         const UChar32 first = getRangeStart(r);
         const UChar32 last = getRangeEnd(r);

         for (UChar32 cp = first; cp <= last; ++cp) {
             if (fullClosure) {
                 ucase_addCaseClosure(cp, &adder);
                 continue;
             }

             // Add the individual case mappings
             // (does not add long s for regular s, or Kelvin for k, for example).
             const UChar *full;
             int32_t mapped;

             mapped = ucase_toFullLower(cp, NULL, NULL, &full, UCASE_LOC_ROOT);
             addCaseMapping(closure, mapped, full, scratch);

             mapped = ucase_toFullTitle(cp, NULL, NULL, &full, UCASE_LOC_ROOT);
             addCaseMapping(closure, mapped, full, scratch);

             mapped = ucase_toFullUpper(cp, NULL, NULL, &full, UCASE_LOC_ROOT);
             addCaseMapping(closure, mapped, full, scratch);

             mapped = ucase_toFullFolding(cp, &full, 0);
             addCaseMapping(closure, mapped, full, scratch);
         }
     }

     if (hasStrings()) {
         if (fullClosure) {
             for (int32_t s = 0; s < strings->size(); ++s) {
                 scratch = *static_cast<const UnicodeString *>(strings->elementAt(s));
                 scratch.foldCase();
                 const UBool mapsToCodePoints =
                     ucase_addStringCaseClosure(scratch.getBuffer(), scratch.length(), &adder);
                 if (!mapsToCodePoints) {
                     // Does not map to code points: keep the folded string itself.
                     closure.add(scratch);
                 }
             }
         } else {
             Locale root("");
 #if !UCONFIG_NO_BREAK_ITERATION
             UErrorCode status = U_ZERO_ERROR;
             BreakIterator *bi = BreakIterator::createWordInstance(root, status);
             // Without a break iterator none of the string mappings are added.
             const UBool addStringMappings = U_SUCCESS(status);
 #else
             const UBool addStringMappings = TRUE;
 #endif
             if (addStringMappings) {
                 for (int32_t s = 0; s < strings->size(); ++s) {
                     const UnicodeString *source =
                         static_cast<const UnicodeString *>(strings->elementAt(s));

                     scratch = *source;
                     scratch.toLower(root);
                     closure.add(scratch);
 #if !UCONFIG_NO_BREAK_ITERATION
                     scratch = *source;
                     scratch.toTitle(bi, root);
                     closure.add(scratch);
 #endif
                     scratch = *source;
                     scratch.toUpper(root);
                     closure.add(scratch);

                     scratch = *source;
                     scratch.foldCase();
                     closure.add(scratch);
                 }
             }
 #if !UCONFIG_NO_BREAK_ITERATION
             delete bi;
 #endif
         }
     }

     *this = closure;
     return *this;
 }

249

250 U_NAMESPACE_END