/*
**********************************************************************
*   Copyright (C) 2012-2013, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*/
#include "unicode/utypes.h"

#include "unicode/uchar.h"
#include "unicode/utf16.h"

#include "identifier_info.h"
#include "mutex.h"
#include "scriptset.h"
#include "ucln_in.h"
#include "uvector.h"
// Number of elements in a true array (not a pointer), as an int32_t.
// The static_cast makes the expansion a single self-delimited expression.
#define LENGTHOF(array) static_cast<int32_t>(sizeof(array)/sizeof((array)[0]))
23 static UMutex gInitMutex
= U_MUTEX_INITIALIZER
;
24 static UBool gStaticsAreInitialized
= FALSE
;
26 UnicodeSet
*IdentifierInfo::ASCII
;
27 ScriptSet
*IdentifierInfo::JAPANESE
;
28 ScriptSet
*IdentifierInfo::CHINESE
;
29 ScriptSet
*IdentifierInfo::KOREAN
;
30 ScriptSet
*IdentifierInfo::CONFUSABLE_WITH_LATIN
;
32 UBool
IdentifierInfo::cleanup() {
41 delete CONFUSABLE_WITH_LATIN
;
42 CONFUSABLE_WITH_LATIN
= NULL
;
43 gStaticsAreInitialized
= FALSE
;
48 static UBool U_CALLCONV
49 IdentifierInfo_cleanup(void) {
50 return IdentifierInfo::cleanup();
55 IdentifierInfo::IdentifierInfo(UErrorCode
&status
):
56 fIdentifier(NULL
), fRequiredScripts(NULL
), fScriptSetSet(NULL
),
57 fCommonAmongAlternates(NULL
), fNumerics(NULL
), fIdentifierProfile(NULL
) {
58 if (U_FAILURE(status
)) {
62 Mutex
lock(&gInitMutex
);
63 if (!gStaticsAreInitialized
) {
64 ASCII
= new UnicodeSet(0, 0x7f);
65 JAPANESE
= new ScriptSet();
66 CHINESE
= new ScriptSet();
67 KOREAN
= new ScriptSet();
68 CONFUSABLE_WITH_LATIN
= new ScriptSet();
69 if (ASCII
== NULL
|| JAPANESE
== NULL
|| CHINESE
== NULL
|| KOREAN
== NULL
70 || CONFUSABLE_WITH_LATIN
== NULL
) {
71 status
= U_MEMORY_ALLOCATION_ERROR
;
75 JAPANESE
->set(USCRIPT_LATIN
, status
).set(USCRIPT_HAN
, status
).set(USCRIPT_HIRAGANA
, status
)
76 .set(USCRIPT_KATAKANA
, status
);
77 CHINESE
->set(USCRIPT_LATIN
, status
).set(USCRIPT_HAN
, status
).set(USCRIPT_BOPOMOFO
, status
);
78 KOREAN
->set(USCRIPT_LATIN
, status
).set(USCRIPT_HAN
, status
).set(USCRIPT_HANGUL
, status
);
79 CONFUSABLE_WITH_LATIN
->set(USCRIPT_CYRILLIC
, status
).set(USCRIPT_GREEK
, status
)
80 .set(USCRIPT_CHEROKEE
, status
);
81 ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO
, IdentifierInfo_cleanup
);
82 gStaticsAreInitialized
= TRUE
;
85 fIdentifier
= new UnicodeString();
86 fRequiredScripts
= new ScriptSet();
87 fScriptSetSet
= uhash_open(uhash_hashScriptSet
, uhash_compareScriptSet
, NULL
, &status
);
88 uhash_setKeyDeleter(fScriptSetSet
, uhash_deleteScriptSet
);
89 fCommonAmongAlternates
= new ScriptSet();
90 fNumerics
= new UnicodeSet();
91 fIdentifierProfile
= new UnicodeSet(0, 0x10FFFF);
93 if (U_SUCCESS(status
) && (fIdentifier
== NULL
|| fRequiredScripts
== NULL
|| fScriptSetSet
== NULL
||
94 fCommonAmongAlternates
== NULL
|| fNumerics
== NULL
|| fIdentifierProfile
== NULL
)) {
95 status
= U_MEMORY_ALLOCATION_ERROR
;
99 IdentifierInfo::~IdentifierInfo() {
101 delete fRequiredScripts
;
102 uhash_close(fScriptSetSet
);
103 delete fCommonAmongAlternates
;
105 delete fIdentifierProfile
;
109 IdentifierInfo
&IdentifierInfo::clear() {
110 fRequiredScripts
->resetAll();
111 uhash_removeAll(fScriptSetSet
);
113 fCommonAmongAlternates
->resetAll();
118 IdentifierInfo
&IdentifierInfo::setIdentifierProfile(const UnicodeSet
&identifierProfile
) {
119 *fIdentifierProfile
= identifierProfile
;
124 const UnicodeSet
&IdentifierInfo::getIdentifierProfile() const {
125 return *fIdentifierProfile
;
129 IdentifierInfo
&IdentifierInfo::setIdentifier(const UnicodeString
&identifier
, UErrorCode
&status
) {
130 if (U_FAILURE(status
)) {
133 *fIdentifier
= identifier
;
135 ScriptSet scriptsForCP
;
137 for (int32_t i
= 0; i
< identifier
.length(); i
+= U16_LENGTH(cp
)) {
138 cp
= identifier
.char32At(i
);
139 // Store a representative character for each kind of decimal digit
140 if (u_charType(cp
) == U_DECIMAL_DIGIT_NUMBER
) {
141 // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
142 fNumerics
->add(cp
- (UChar32
)u_getNumericValue(cp
));
144 UScriptCode extensions
[500];
145 int32_t extensionsCount
= uscript_getScriptExtensions(cp
, extensions
, LENGTHOF(extensions
), &status
);
146 if (U_FAILURE(status
)) {
149 scriptsForCP
.resetAll();
150 for (int32_t j
=0; j
<extensionsCount
; j
++) {
151 scriptsForCP
.set(extensions
[j
], status
);
153 scriptsForCP
.reset(USCRIPT_COMMON
, status
);
154 scriptsForCP
.reset(USCRIPT_INHERITED
, status
);
155 switch (scriptsForCP
.countMembers()) {
158 // Single script, record it.
159 fRequiredScripts
->Union(scriptsForCP
);
162 if (!fRequiredScripts
->intersects(scriptsForCP
)
163 && !uhash_geti(fScriptSetSet
, &scriptsForCP
)) {
164 // If the set hasn't been added already, add it
165 // (Add a copy, fScriptSetSet takes ownership of the copy.)
166 uhash_puti(fScriptSetSet
, new ScriptSet(scriptsForCP
), 1, &status
);
171 // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
172 // [Kana], [Kana Hira] => [Kana]
173 // This is relatively infrequent, so doesn't have to be optimized.
174 // We also compute any commonalities among the alternates.
175 if (uhash_count(fScriptSetSet
) > 0) {
176 fCommonAmongAlternates
->setAll();
177 for (int32_t it
= -1;;) {
178 const UHashElement
*nextHashEl
= uhash_nextElement(fScriptSetSet
, &it
);
179 if (nextHashEl
== NULL
) {
182 ScriptSet
*next
= static_cast<ScriptSet
*>(nextHashEl
->key
.pointer
);
183 // [Kana], [Kana Hira] => [Kana]
184 if (fRequiredScripts
->intersects(*next
)) {
185 uhash_removeElement(fScriptSetSet
, nextHashEl
);
187 fCommonAmongAlternates
->intersect(*next
);
188 // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
189 for (int32_t otherIt
= -1;;) {
190 const UHashElement
*otherHashEl
= uhash_nextElement(fScriptSetSet
, &otherIt
);
191 if (otherHashEl
== NULL
) {
194 ScriptSet
*other
= static_cast<ScriptSet
*>(otherHashEl
->key
.pointer
);
195 if (next
!= other
&& next
->contains(*other
)) {
196 uhash_removeElement(fScriptSetSet
, nextHashEl
);
203 if (uhash_count(fScriptSetSet
) == 0) {
204 fCommonAmongAlternates
->resetAll();
210 const UnicodeString
*IdentifierInfo::getIdentifier() const {
214 const ScriptSet
*IdentifierInfo::getScripts() const {
215 return fRequiredScripts
;
218 const UHashtable
*IdentifierInfo::getAlternates() const {
219 return fScriptSetSet
;
223 const UnicodeSet
*IdentifierInfo::getNumerics() const {
227 const ScriptSet
*IdentifierInfo::getCommonAmongAlternates() const {
228 return fCommonAmongAlternates
;
231 #if !UCONFIG_NO_NORMALIZATION
233 URestrictionLevel
IdentifierInfo::getRestrictionLevel(UErrorCode
&status
) const {
234 if (!fIdentifierProfile
->containsAll(*fIdentifier
) || getNumerics()->size() > 1) {
235 return USPOOF_UNRESTRICTIVE
;
237 if (ASCII
->containsAll(*fIdentifier
)) {
240 // This is a bit tricky. We look at a number of factors.
241 // The number of scripts in the text.
242 // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
243 // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
245 // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
246 // time it is created, in setIdentifier().
247 int32_t cardinalityPlus
= fRequiredScripts
->countMembers() +
248 (fCommonAmongAlternates
->countMembers() == 0 ? uhash_count(fScriptSetSet
) : 1);
249 if (cardinalityPlus
< 2) {
250 return USPOOF_HIGHLY_RESTRICTIVE
;
252 if (containsWithAlternates(*JAPANESE
, *fRequiredScripts
) || containsWithAlternates(*CHINESE
, *fRequiredScripts
)
253 || containsWithAlternates(*KOREAN
, *fRequiredScripts
)) {
254 return USPOOF_HIGHLY_RESTRICTIVE
;
256 if (cardinalityPlus
== 2 &&
257 fRequiredScripts
->test(USCRIPT_LATIN
, status
) &&
258 !fRequiredScripts
->intersects(*CONFUSABLE_WITH_LATIN
)) {
259 return USPOOF_MODERATELY_RESTRICTIVE
;
261 return USPOOF_MINIMALLY_RESTRICTIVE
;
264 #endif /* !UCONFIG_NO_NORMALIZATION */
266 int32_t IdentifierInfo::getScriptCount() const {
267 // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
268 int32_t count
= fRequiredScripts
->countMembers() +
269 (fCommonAmongAlternates
->countMembers() == 0 ? uhash_count(fScriptSetSet
) : 1);
275 UBool
IdentifierInfo::containsWithAlternates(const ScriptSet
&container
, const ScriptSet
&containee
) const {
276 if (!container
.contains(containee
)) {
279 for (int32_t iter
= -1; ;) {
280 const UHashElement
*hashEl
= uhash_nextElement(fScriptSetSet
, &iter
);
281 if (hashEl
== NULL
) {
284 ScriptSet
*alternatives
= static_cast<ScriptSet
*>(hashEl
->key
.pointer
);
285 if (!container
.intersects(*alternatives
)) {
292 UnicodeString
&IdentifierInfo::displayAlternates(UnicodeString
&dest
, const UHashtable
*alternates
, UErrorCode
&status
) {
293 UVector
sorted(status
);
294 if (U_FAILURE(status
)) {
297 for (int32_t pos
= -1; ;) {
298 const UHashElement
*el
= uhash_nextElement(alternates
, &pos
);
302 ScriptSet
*ss
= static_cast<ScriptSet
*>(el
->key
.pointer
);
303 sorted
.addElement(ss
, status
);
305 sorted
.sort(uhash_compareScriptSet
, status
);
306 UnicodeString separator
= UNICODE_STRING_SIMPLE("; ");
307 for (int32_t i
=0; i
<sorted
.size(); i
++) {
309 dest
.append(separator
);
311 ScriptSet
*ss
= static_cast<ScriptSet
*>(sorted
.elementAt(i
));
312 ss
->displayScripts(dest
);