/*
**********************************************************************
*   Copyright (C) 2012-2014, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*/
8 #include "unicode/utypes.h"
10 #include "unicode/uchar.h"
11 #include "unicode/utf16.h"
13 #include "identifier_info.h"
15 #include "scriptset.h"
21 static UnicodeSet
*ASCII
;
22 static ScriptSet
*JAPANESE
;
23 static ScriptSet
*CHINESE
;
24 static ScriptSet
*KOREAN
;
25 static ScriptSet
*CONFUSABLE_WITH_LATIN
;
26 static UInitOnce gIdentifierInfoInitOnce
= U_INITONCE_INITIALIZER
;
30 static UBool U_CALLCONV
31 IdentifierInfo_cleanup(void) {
40 delete CONFUSABLE_WITH_LATIN
;
41 CONFUSABLE_WITH_LATIN
= NULL
;
42 gIdentifierInfoInitOnce
.reset();
46 static void U_CALLCONV
47 IdentifierInfo_init(UErrorCode
&status
) {
48 ASCII
= new UnicodeSet(0, 0x7f);
49 JAPANESE
= new ScriptSet();
50 CHINESE
= new ScriptSet();
51 KOREAN
= new ScriptSet();
52 CONFUSABLE_WITH_LATIN
= new ScriptSet();
53 if (ASCII
== NULL
|| JAPANESE
== NULL
|| CHINESE
== NULL
|| KOREAN
== NULL
54 || CONFUSABLE_WITH_LATIN
== NULL
) {
55 status
= U_MEMORY_ALLOCATION_ERROR
;
59 JAPANESE
->set(USCRIPT_LATIN
, status
).set(USCRIPT_HAN
, status
).set(USCRIPT_HIRAGANA
, status
)
60 .set(USCRIPT_KATAKANA
, status
);
61 CHINESE
->set(USCRIPT_LATIN
, status
).set(USCRIPT_HAN
, status
).set(USCRIPT_BOPOMOFO
, status
);
62 KOREAN
->set(USCRIPT_LATIN
, status
).set(USCRIPT_HAN
, status
).set(USCRIPT_HANGUL
, status
);
63 CONFUSABLE_WITH_LATIN
->set(USCRIPT_CYRILLIC
, status
).set(USCRIPT_GREEK
, status
)
64 .set(USCRIPT_CHEROKEE
, status
);
65 ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO
, IdentifierInfo_cleanup
);
70 IdentifierInfo::IdentifierInfo(UErrorCode
&status
):
71 fIdentifier(NULL
), fRequiredScripts(NULL
), fScriptSetSet(NULL
),
72 fCommonAmongAlternates(NULL
), fNumerics(NULL
), fIdentifierProfile(NULL
) {
73 umtx_initOnce(gIdentifierInfoInitOnce
, &IdentifierInfo_init
, status
);
74 if (U_FAILURE(status
)) {
78 fIdentifier
= new UnicodeString();
79 fRequiredScripts
= new ScriptSet();
80 fScriptSetSet
= uhash_open(uhash_hashScriptSet
, uhash_compareScriptSet
, NULL
, &status
);
81 uhash_setKeyDeleter(fScriptSetSet
, uhash_deleteScriptSet
);
82 fCommonAmongAlternates
= new ScriptSet();
83 fNumerics
= new UnicodeSet();
84 fIdentifierProfile
= new UnicodeSet(0, 0x10FFFF);
86 if (U_SUCCESS(status
) && (fIdentifier
== NULL
|| fRequiredScripts
== NULL
|| fScriptSetSet
== NULL
||
87 fCommonAmongAlternates
== NULL
|| fNumerics
== NULL
|| fIdentifierProfile
== NULL
)) {
88 status
= U_MEMORY_ALLOCATION_ERROR
;
92 IdentifierInfo::~IdentifierInfo() {
94 delete fRequiredScripts
;
95 uhash_close(fScriptSetSet
);
96 delete fCommonAmongAlternates
;
98 delete fIdentifierProfile
;
102 IdentifierInfo
&IdentifierInfo::clear() {
103 fRequiredScripts
->resetAll();
104 uhash_removeAll(fScriptSetSet
);
106 fCommonAmongAlternates
->resetAll();
111 IdentifierInfo
&IdentifierInfo::setIdentifierProfile(const UnicodeSet
&identifierProfile
) {
112 *fIdentifierProfile
= identifierProfile
;
117 const UnicodeSet
&IdentifierInfo::getIdentifierProfile() const {
118 return *fIdentifierProfile
;
122 IdentifierInfo
&IdentifierInfo::setIdentifier(const UnicodeString
&identifier
, UErrorCode
&status
) {
123 if (U_FAILURE(status
)) {
126 *fIdentifier
= identifier
;
128 ScriptSet scriptsForCP
;
130 for (int32_t i
= 0; i
< identifier
.length(); i
+= U16_LENGTH(cp
)) {
131 cp
= identifier
.char32At(i
);
132 // Store a representative character for each kind of decimal digit
133 if (u_charType(cp
) == U_DECIMAL_DIGIT_NUMBER
) {
134 // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
135 fNumerics
->add(cp
- (UChar32
)u_getNumericValue(cp
));
137 UScriptCode extensions
[500];
138 int32_t extensionsCount
= uscript_getScriptExtensions(cp
, extensions
, UPRV_LENGTHOF(extensions
), &status
);
139 if (U_FAILURE(status
)) {
142 scriptsForCP
.resetAll();
143 for (int32_t j
=0; j
<extensionsCount
; j
++) {
144 scriptsForCP
.set(extensions
[j
], status
);
146 scriptsForCP
.reset(USCRIPT_COMMON
, status
);
147 scriptsForCP
.reset(USCRIPT_INHERITED
, status
);
148 switch (scriptsForCP
.countMembers()) {
151 // Single script, record it.
152 fRequiredScripts
->Union(scriptsForCP
);
155 if (!fRequiredScripts
->intersects(scriptsForCP
)
156 && !uhash_geti(fScriptSetSet
, &scriptsForCP
)) {
157 // If the set hasn't been added already, add it
158 // (Add a copy, fScriptSetSet takes ownership of the copy.)
159 uhash_puti(fScriptSetSet
, new ScriptSet(scriptsForCP
), 1, &status
);
164 // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
165 // [Kana], [Kana Hira] => [Kana]
166 // This is relatively infrequent, so doesn't have to be optimized.
167 // We also compute any commonalities among the alternates.
168 if (uhash_count(fScriptSetSet
) > 0) {
169 fCommonAmongAlternates
->setAll();
170 for (int32_t it
= UHASH_FIRST
;;) {
171 const UHashElement
*nextHashEl
= uhash_nextElement(fScriptSetSet
, &it
);
172 if (nextHashEl
== NULL
) {
175 ScriptSet
*next
= static_cast<ScriptSet
*>(nextHashEl
->key
.pointer
);
176 // [Kana], [Kana Hira] => [Kana]
177 if (fRequiredScripts
->intersects(*next
)) {
178 uhash_removeElement(fScriptSetSet
, nextHashEl
);
180 fCommonAmongAlternates
->intersect(*next
);
181 // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
182 for (int32_t otherIt
= UHASH_FIRST
;;) {
183 const UHashElement
*otherHashEl
= uhash_nextElement(fScriptSetSet
, &otherIt
);
184 if (otherHashEl
== NULL
) {
187 ScriptSet
*other
= static_cast<ScriptSet
*>(otherHashEl
->key
.pointer
);
188 if (next
!= other
&& next
->contains(*other
)) {
189 uhash_removeElement(fScriptSetSet
, nextHashEl
);
196 if (uhash_count(fScriptSetSet
) == 0) {
197 fCommonAmongAlternates
->resetAll();
203 const UnicodeString
*IdentifierInfo::getIdentifier() const {
207 const ScriptSet
*IdentifierInfo::getScripts() const {
208 return fRequiredScripts
;
211 const UHashtable
*IdentifierInfo::getAlternates() const {
212 return fScriptSetSet
;
216 const UnicodeSet
*IdentifierInfo::getNumerics() const {
220 const ScriptSet
*IdentifierInfo::getCommonAmongAlternates() const {
221 return fCommonAmongAlternates
;
224 #if !UCONFIG_NO_NORMALIZATION
226 URestrictionLevel
IdentifierInfo::getRestrictionLevel(UErrorCode
&status
) const {
227 if (!fIdentifierProfile
->containsAll(*fIdentifier
) || getNumerics()->size() > 1) {
228 return USPOOF_UNRESTRICTIVE
;
230 if (ASCII
->containsAll(*fIdentifier
)) {
233 // This is a bit tricky. We look at a number of factors.
234 // The number of scripts in the text.
235 // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
236 // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
238 // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
239 // time it is created, in setIdentifier().
240 int32_t cardinalityPlus
= fRequiredScripts
->countMembers() +
241 (fCommonAmongAlternates
->countMembers() == 0 ? uhash_count(fScriptSetSet
) : 1);
242 if (cardinalityPlus
< 2) {
243 return USPOOF_SINGLE_SCRIPT_RESTRICTIVE
;
245 if (containsWithAlternates(*JAPANESE
, *fRequiredScripts
) || containsWithAlternates(*CHINESE
, *fRequiredScripts
)
246 || containsWithAlternates(*KOREAN
, *fRequiredScripts
)) {
247 return USPOOF_HIGHLY_RESTRICTIVE
;
249 if (cardinalityPlus
== 2 &&
250 fRequiredScripts
->test(USCRIPT_LATIN
, status
) &&
251 !fRequiredScripts
->intersects(*CONFUSABLE_WITH_LATIN
)) {
252 return USPOOF_MODERATELY_RESTRICTIVE
;
254 return USPOOF_MINIMALLY_RESTRICTIVE
;
257 #endif /* !UCONFIG_NO_NORMALIZATION */
259 int32_t IdentifierInfo::getScriptCount() const {
260 // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
261 int32_t count
= fRequiredScripts
->countMembers() +
262 (fCommonAmongAlternates
->countMembers() == 0 ? uhash_count(fScriptSetSet
) : 1);
268 UBool
IdentifierInfo::containsWithAlternates(const ScriptSet
&container
, const ScriptSet
&containee
) const {
269 if (!container
.contains(containee
)) {
272 for (int32_t iter
= UHASH_FIRST
; ;) {
273 const UHashElement
*hashEl
= uhash_nextElement(fScriptSetSet
, &iter
);
274 if (hashEl
== NULL
) {
277 ScriptSet
*alternatives
= static_cast<ScriptSet
*>(hashEl
->key
.pointer
);
278 if (!container
.intersects(*alternatives
)) {
285 UnicodeString
&IdentifierInfo::displayAlternates(UnicodeString
&dest
, const UHashtable
*alternates
, UErrorCode
&status
) {
286 UVector
sorted(status
);
287 if (U_FAILURE(status
)) {
290 for (int32_t pos
= UHASH_FIRST
; ;) {
291 const UHashElement
*el
= uhash_nextElement(alternates
, &pos
);
295 ScriptSet
*ss
= static_cast<ScriptSet
*>(el
->key
.pointer
);
296 sorted
.addElement(ss
, status
);
298 sorted
.sort(uhash_compareScriptSet
, status
);
299 UnicodeString separator
= UNICODE_STRING_SIMPLE("; ");
300 for (int32_t i
=0; i
<sorted
.size(); i
++) {
302 dest
.append(separator
);
304 ScriptSet
*ss
= static_cast<ScriptSet
*>(sorted
.elementAt(i
));
305 ss
->displayScripts(dest
);