]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/identifier_info.cpp
ICU-531.31.tar.gz
[apple/icu.git] / icuSources / i18n / identifier_info.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2012-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #include "unicode/uchar.h"
11 #include "unicode/utf16.h"
12
13 #include "identifier_info.h"
14 #include "mutex.h"
15 #include "scriptset.h"
16 #include "ucln_in.h"
17 #include "uvector.h"
18
19 U_NAMESPACE_BEGIN
20
// Element count of a fixed-size array, as an int32_t. Only meaningful for true
// arrays (not pointers). The whole expansion is parenthesized so the macro
// composes safely inside larger expressions.
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
22
23 static UnicodeSet *ASCII;
24 static ScriptSet *JAPANESE;
25 static ScriptSet *CHINESE;
26 static ScriptSet *KOREAN;
27 static ScriptSet *CONFUSABLE_WITH_LATIN;
28 static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER;
29
30
31 U_CDECL_BEGIN
32 static UBool U_CALLCONV
33 IdentifierInfo_cleanup(void) {
34 delete ASCII;
35 ASCII = NULL;
36 delete JAPANESE;
37 JAPANESE = NULL;
38 delete CHINESE;
39 CHINESE = NULL;
40 delete KOREAN;
41 KOREAN = NULL;
42 delete CONFUSABLE_WITH_LATIN;
43 CONFUSABLE_WITH_LATIN = NULL;
44 gIdentifierInfoInitOnce.reset();
45 return TRUE;
46 }
47
48 static void U_CALLCONV
49 IdentifierInfo_init(UErrorCode &status) {
50 ASCII = new UnicodeSet(0, 0x7f);
51 JAPANESE = new ScriptSet();
52 CHINESE = new ScriptSet();
53 KOREAN = new ScriptSet();
54 CONFUSABLE_WITH_LATIN = new ScriptSet();
55 if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL
56 || CONFUSABLE_WITH_LATIN == NULL) {
57 status = U_MEMORY_ALLOCATION_ERROR;
58 return;
59 }
60 ASCII->freeze();
61 JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
62 .set(USCRIPT_KATAKANA, status);
63 CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
64 KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
65 CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
66 .set(USCRIPT_CHEROKEE, status);
67 ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
68 }
69 U_CDECL_END
70
71
72 IdentifierInfo::IdentifierInfo(UErrorCode &status):
73 fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL),
74 fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
75 umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status);
76 if (U_FAILURE(status)) {
77 return;
78 }
79
80 fIdentifier = new UnicodeString();
81 fRequiredScripts = new ScriptSet();
82 fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
83 uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
84 fCommonAmongAlternates = new ScriptSet();
85 fNumerics = new UnicodeSet();
86 fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
87
88 if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
89 fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
90 status = U_MEMORY_ALLOCATION_ERROR;
91 }
92 }
93
94 IdentifierInfo::~IdentifierInfo() {
95 delete fIdentifier;
96 delete fRequiredScripts;
97 uhash_close(fScriptSetSet);
98 delete fCommonAmongAlternates;
99 delete fNumerics;
100 delete fIdentifierProfile;
101 }
102
103
104 IdentifierInfo &IdentifierInfo::clear() {
105 fRequiredScripts->resetAll();
106 uhash_removeAll(fScriptSetSet);
107 fNumerics->clear();
108 fCommonAmongAlternates->resetAll();
109 return *this;
110 }
111
112
113 IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
114 *fIdentifierProfile = identifierProfile;
115 return *this;
116 }
117
118
119 const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
120 return *fIdentifierProfile;
121 }
122
123
124 IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
125 if (U_FAILURE(status)) {
126 return *this;
127 }
128 *fIdentifier = identifier;
129 clear();
130 ScriptSet scriptsForCP;
131 UChar32 cp;
132 for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
133 cp = identifier.char32At(i);
134 // Store a representative character for each kind of decimal digit
135 if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
136 // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
137 fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
138 }
139 UScriptCode extensions[500];
140 int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status);
141 if (U_FAILURE(status)) {
142 return *this;
143 }
144 scriptsForCP.resetAll();
145 for (int32_t j=0; j<extensionsCount; j++) {
146 scriptsForCP.set(extensions[j], status);
147 }
148 scriptsForCP.reset(USCRIPT_COMMON, status);
149 scriptsForCP.reset(USCRIPT_INHERITED, status);
150 switch (scriptsForCP.countMembers()) {
151 case 0: break;
152 case 1:
153 // Single script, record it.
154 fRequiredScripts->Union(scriptsForCP);
155 break;
156 default:
157 if (!fRequiredScripts->intersects(scriptsForCP)
158 && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
159 // If the set hasn't been added already, add it
160 // (Add a copy, fScriptSetSet takes ownership of the copy.)
161 uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
162 }
163 break;
164 }
165 }
166 // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
167 // [Kana], [Kana Hira] => [Kana]
168 // This is relatively infrequent, so doesn't have to be optimized.
169 // We also compute any commonalities among the alternates.
170 if (uhash_count(fScriptSetSet) > 0) {
171 fCommonAmongAlternates->setAll();
172 for (int32_t it = -1;;) {
173 const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
174 if (nextHashEl == NULL) {
175 break;
176 }
177 ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
178 // [Kana], [Kana Hira] => [Kana]
179 if (fRequiredScripts->intersects(*next)) {
180 uhash_removeElement(fScriptSetSet, nextHashEl);
181 } else {
182 fCommonAmongAlternates->intersect(*next);
183 // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
184 for (int32_t otherIt = -1;;) {
185 const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
186 if (otherHashEl == NULL) {
187 break;
188 }
189 ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
190 if (next != other && next->contains(*other)) {
191 uhash_removeElement(fScriptSetSet, nextHashEl);
192 break;
193 }
194 }
195 }
196 }
197 }
198 if (uhash_count(fScriptSetSet) == 0) {
199 fCommonAmongAlternates->resetAll();
200 }
201 return *this;
202 }
203
204
205 const UnicodeString *IdentifierInfo::getIdentifier() const {
206 return fIdentifier;
207 }
208
209 const ScriptSet *IdentifierInfo::getScripts() const {
210 return fRequiredScripts;
211 }
212
213 const UHashtable *IdentifierInfo::getAlternates() const {
214 return fScriptSetSet;
215 }
216
217
218 const UnicodeSet *IdentifierInfo::getNumerics() const {
219 return fNumerics;
220 }
221
222 const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
223 return fCommonAmongAlternates;
224 }
225
226 #if !UCONFIG_NO_NORMALIZATION
227
228 URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
229 if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) {
230 return USPOOF_UNRESTRICTIVE;
231 }
232 if (ASCII->containsAll(*fIdentifier)) {
233 return USPOOF_ASCII;
234 }
235 // This is a bit tricky. We look at a number of factors.
236 // The number of scripts in the text.
237 // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
238 // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
239
240 // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
241 // time it is created, in setIdentifier().
242 int32_t cardinalityPlus = fRequiredScripts->countMembers() +
243 (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
244 if (cardinalityPlus < 2) {
245 return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
246 }
247 if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts)
248 || containsWithAlternates(*KOREAN, *fRequiredScripts)) {
249 return USPOOF_HIGHLY_RESTRICTIVE;
250 }
251 if (cardinalityPlus == 2 &&
252 fRequiredScripts->test(USCRIPT_LATIN, status) &&
253 !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
254 return USPOOF_MODERATELY_RESTRICTIVE;
255 }
256 return USPOOF_MINIMALLY_RESTRICTIVE;
257 }
258
259 #endif /* !UCONFIG_NO_NORMALIZATION */
260
261 int32_t IdentifierInfo::getScriptCount() const {
262 // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
263 int32_t count = fRequiredScripts->countMembers() +
264 (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
265 return count;
266 }
267
268
269
270 UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
271 if (!container.contains(containee)) {
272 return FALSE;
273 }
274 for (int32_t iter = -1; ;) {
275 const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter);
276 if (hashEl == NULL) {
277 break;
278 }
279 ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer);
280 if (!container.intersects(*alternatives)) {
281 return false;
282 }
283 }
284 return true;
285 }
286
287 UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
288 UVector sorted(status);
289 if (U_FAILURE(status)) {
290 return dest;
291 }
292 for (int32_t pos = -1; ;) {
293 const UHashElement *el = uhash_nextElement(alternates, &pos);
294 if (el == NULL) {
295 break;
296 }
297 ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer);
298 sorted.addElement(ss, status);
299 }
300 sorted.sort(uhash_compareScriptSet, status);
301 UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
302 for (int32_t i=0; i<sorted.size(); i++) {
303 if (i>0) {
304 dest.append(separator);
305 }
306 ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i));
307 ss->displayScripts(dest);
308 }
309 return dest;
310 }
311
312 U_NAMESPACE_END
313