1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *****************************************************************
5 * Copyright (c) 2002-2014, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 *****************************************************************
8 * Date Name Description
9 * 06/06/2002 aliu Creation.
10 *****************************************************************
13 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_TRANSLITERATION
17 #include "unicode/uobject.h"
18 #include "unicode/uscript.h"
29 //------------------------------------------------------------
32 static const UChar TARGET_SEP
= 45; // '-'
33 static const UChar VARIANT_SEP
= 47; // '/'
34 static const UChar ANY
[] = {0x41,0x6E,0x79,0}; // "Any"
35 static const UChar NULL_ID
[] = {78,117,108,108,0}; // "Null"
36 static const UChar LATIN_PIVOT
[] = {0x2D,0x4C,0x61,0x74,0x6E,0x3B,0x4C,0x61,0x74,0x6E,0x2D,0}; // "-Latn;Latn-"
38 // initial size for an Any-XXXX transform's cache of script-XXXX transforms
39 // (will grow as necessary, but we don't expect to have source text with more than 7 scripts)
40 #define ANY_TRANS_CACHE_INIT_SIZE 7
42 //------------------------------------------------------------
46 * Deleter function for Transliterator*.
48 static void U_CALLCONV
49 _deleteTransliterator(void *obj
) {
50 delete (icu::Transliterator
*) obj
;
54 //------------------------------------------------------------
58 //------------------------------------------------------------
62 * Returns a series of ranges corresponding to scripts. They will be
65 * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second
66 * | | - first run (start, limit)
67 * | | - second run (start, limit)
69 * That is, the runs will overlap. The reason for this is so that a
70 * transliterator can consider common characters both before and after
// Iterator that reports script runs over a range of a Replaceable text.
// Derives from UMemory.
// NOTE(review): this listing is incomplete - some member declarations
// (e.g. the start/limit fields described below and the next() prototype)
// and the closing of the class are not visible here.
73 class ScriptRunIterator
: public UMemory
{
// The text being iterated; held by const reference, so it is not copied.
75 const Replaceable
& text
;
81 * The code of the current run, valid after next() returns. May
82 * be USCRIPT_INVALID_CODE if and only if the entire text is
85 UScriptCode scriptCode
;
88 * The start of the run, inclusive, valid after next() returns.
93 * The end of the run, exclusive, valid after next() returns.
98 * Constructs a run iterator over the given text from start
99 * (inclusive) to limit (exclusive).
101 ScriptRunIterator(const Replaceable
& text
, int32_t start
, int32_t limit
);
104 * Returns TRUE if there are any more runs. TRUE is always
105 * returned at least once. Upon return, the caller should
106 * examine scriptCode, start, and limit.
111 * Adjusts internal indices for a change in the limit index of the
112 * given delta. A positive delta means the limit has increased.
114 void adjustLimit(int32_t delta
);
// Copy construction and copy assignment are only declared here; per the
// trailing comments they exist to forbid copying. No definitions for them
// are visible in this listing.
117 ScriptRunIterator(const ScriptRunIterator
&other
); // forbid copying of this class
118 ScriptRunIterator
&operator=(const ScriptRunIterator
&other
); // forbid copying of this class
// Out-of-class definition of the ScriptRunIterator constructor taking the
// text plus a start/limit pair.
// NOTE(review): the member-initializer list and body that follow the ':'
// are not visible in this listing.
121 ScriptRunIterator::ScriptRunIterator(const Replaceable
& theText
,
122 int32_t myStart
, int32_t myLimit
) :
// Advances the iterator to the next script run.
// Visible steps: reset scriptCode to USCRIPT_INVALID_CODE, compare start
// against textLimit, walk start backward over COMMON/INHERITED characters,
// then walk limit forward while characters are COMMON, INHERITED, or of the
// run's script.
// NOTE(review): the declarations of ch and s, the return statements, the
// index updates, and the loop exits are on lines not visible in this listing.
130 UBool
ScriptRunIterator::next() {
// ec is handed to uscript_getScript(); no check of it is visible here.
133 UErrorCode ec
= U_ZERO_ERROR
;
135 scriptCode
= USCRIPT_INVALID_CODE
; // don't know script yet
// End-of-text test; the action taken in this branch is not visible.
139 if (start
== textLimit
) {
143 // Move start back to include adjacent COMMON or INHERITED
145 while (start
> textStart
) {
146 ch
= text
.char32At(start
- 1); // look back
147 s
= uscript_getScript(ch
, &ec
);
148 if (s
== USCRIPT_COMMON
|| s
== USCRIPT_INHERITED
) {
155 // Move limit ahead to include COMMON, INHERITED, and characters
156 // of the current script.
157 while (limit
< textLimit
) {
158 ch
= text
.char32At(limit
); // look ahead
159 s
= uscript_getScript(ch
, &ec
);
// Only characters with a specific script (neither COMMON nor INHERITED)
// are compared against the run's scriptCode.
160 if (s
!= USCRIPT_COMMON
&& s
!= USCRIPT_INHERITED
) {
// First specific script seen in this run vs. a different script than the
// one already recorded; the bodies of both branches are not visible.
161 if (scriptCode
== USCRIPT_INVALID_CODE
) {
163 } else if (s
!= scriptCode
) {
170 // Return TRUE even if the entire text is COMMON / INHERITED, in
171 // which case scriptCode will be USCRIPT_INVALID_CODE.
// Out-of-class definition of adjustLimit(delta). It is called from
// AnyTransliterator::handleTransliterate with the change in pos.limit.
// NOTE(review): the body is not visible in this listing.
175 void ScriptRunIterator::adjustLimit(int32_t delta
) {
180 //------------------------------------------------------------
// Invokes the UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro for AnyTransliterator.
// The macro itself is not defined in this listing.
183 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator
)
// Main constructor: takes the transliterator id, the target name, the
// variant name and the target script code.
// NOTE(review): the last parameter line, the opening brace, and any error
// checks are on lines not visible in this listing; `ec` used below is
// presumably a UErrorCode parameter declared there - confirm.
185 AnyTransliterator::AnyTransliterator(const UnicodeString
& id
,
186 const UnicodeString
& theTarget
,
187 const UnicodeString
& theVariant
,
188 UScriptCode theTargetScript
,
// The base class is constructed with the id and a NULL second argument.
190 Transliterator(id
, NULL
),
191 targetScript(theTargetScript
)
// Open the cache hashtable with long-based hash and key-compare functions
// and ANY_TRANS_CACHE_INIT_SIZE as the initial size.
193 cache
= uhash_openSize(uhash_hashLong
, uhash_compareLong
, NULL
, ANY_TRANS_CACHE_INIT_SIZE
, &ec
);
// Register _deleteTransliterator as the value deleter for the cache.
197 uhash_setValueDeleter(cache
, _deleteTransliterator
);
// A non-empty variant is appended to `target` after VARIANT_SEP.
// NOTE(review): the initial assignment of `target` is not visible here.
200 if (theVariant
.length() > 0) {
201 target
.append(VARIANT_SEP
).append(theVariant
);
// Destructor.
// NOTE(review): the body is not visible in this listing; presumably it
// releases `cache` - confirm against the full file.
205 AnyTransliterator::~AnyTransliterator() {
// Copy constructor: copies targetScript from `o` and opens a new, empty
// cache instead of copying the cached entries.
// NOTE(review): other initializers (lines between the ':' and targetScript)
// and any check of `ec` after uhash_openSize are not visible in this listing.
212 AnyTransliterator::AnyTransliterator(const AnyTransliterator
& o
) :
215 targetScript(o
.targetScript
)
217 // Don't copy the cache contents
// Local error code; it is only passed to uhash_openSize in the visible lines.
218 UErrorCode ec
= U_ZERO_ERROR
;
219 cache
= uhash_openSize(uhash_hashLong
, uhash_compareLong
, NULL
, ANY_TRANS_CACHE_INIT_SIZE
, &ec
);
// Same value deleter as the main constructor installs.
223 uhash_setValueDeleter(cache
, _deleteTransliterator
);
227 * Transliterator API.
229 AnyTransliterator
* AnyTransliterator::clone() const {
230 return new AnyTransliterator(*this);
234 * Implements {@link Transliterator#handleTransliterate}.
// Transliterates [pos.start, pos.limit) of `text` one script run at a time,
// using a per-script transliterator obtained from getTransliterator().
// NOTE(review): the loop header, the branch on a missing transliterator,
// and some bookkeeping statements are on lines not visible in this listing.
236 void AnyTransliterator::handleTransliterate(Replaceable
& text
, UTransPosition
& pos
,
237 UBool isIncremental
) const {
// Remember the caller's range; pos.start / pos.limit are overwritten below.
238 int32_t allStart
= pos
.start
;
239 int32_t allLimit
= pos
.limit
;
// Runs are computed over the whole context range (contextStart..contextLimit),
// not only over the start..limit range.
241 ScriptRunIterator
it(text
, pos
.contextStart
, pos
.contextLimit
);
244 // Ignore runs in the ante context
245 if (it
.limit
<= allStart
) continue;
247 // Try to instantiate transliterator from it.scriptCode to
248 // our target or target/variant
249 Transliterator
* t
= getTransliterator(it
.scriptCode
);
252 // We have no transliterator. Do nothing, but keep
253 // pos.start up to date.
254 pos
.start
= it
.limit
;
258 // If the run end is before the transliteration limit, do
259 // a non-incremental transliteration. Otherwise do an
261 UBool incremental
= isIncremental
&& (it
.limit
>= allLimit
);
// Clamp the working range to the overlap of this run with the caller's range.
263 pos
.start
= uprv_max(allStart
, it
.start
);
264 pos
.limit
= uprv_min(allLimit
, it
.limit
);
// Snapshot pos.limit so the change made by the transliterator can be measured.
265 int32_t limit
= pos
.limit
;
266 t
->filteredTransliterate(text
, pos
, incremental
);
// delta is the change in pos.limit across the filteredTransliterate call.
267 int32_t delta
= pos
.limit
- limit
;
// Forward the limit change to the run iterator.
269 it
.adjustLimit(delta
);
271 // We're done if we enter the post context
272 if (it
.limit
>= allLimit
) break;
275 // Restore limit. pos.start is fine where the last transliterator
276 // left it, or at the end of the last run.
277 pos
.limit
= allLimit
;
// Looks up, or creates and caches, the transliterator from the given source
// script to this object's target.
// Visible steps: an early test for source == targetScript or
// USCRIPT_INVALID_CODE, a cache lookup keyed by the script code, creation of
// "<source short name>-<target>", a second attempt using the LATIN_PIVOT
// string, and a cache re-check before inserting.
// NOTE(review): the return statements, null tests, delete calls, and any
// locking are on lines not visible in this listing.
280 Transliterator
* AnyTransliterator::getTransliterator(UScriptCode source
) const {
// The action taken for these two cases is not visible here.
282 if (source
== targetScript
|| source
== USCRIPT_INVALID_CODE
) {
286 Transliterator
* t
= NULL
;
// First cache lookup, keyed by the script code cast to int32_t.
289 t
= (Transliterator
*) uhash_iget(cache
, (int32_t) source
);
292 UErrorCode ec
= U_ZERO_ERROR
;
// Build the ID as <short script name> + TARGET_SEP + target.
293 UnicodeString
sourceName(uscript_getShortName(source
), -1, US_INV
);
294 UnicodeString
id(sourceName
);
295 id
.append(TARGET_SEP
).append(target
);
297 t
= Transliterator::createInstance(id
, UTRANS_FORWARD
, ec
);
298 if (U_FAILURE(ec
) || t
== NULL
) {
301 // Try to pivot around Latin, our most common script
// NOTE(review): the same `ec` is passed to the second createInstance call.
// If it is not reset on a line not shown here, a failure status from the
// first attempt may still be set when the pivot attempt runs - confirm
// how createInstance treats an already-failed status.
303 id
.append(LATIN_PIVOT
, -1).append(target
);
304 t
= Transliterator::createInstance(id
, UTRANS_FORWARD
, ec
);
305 if (U_FAILURE(ec
) || t
== NULL
) {
// rt holds the result of the second cache lookup; per the comments below it
// ends up non-null only when another thread cached an entry first.
312 Transliterator
*rt
= NULL
;
315 rt
= static_cast<Transliterator
*> (uhash_iget(cache
, (int32_t) source
));
317 // Common case, no race to cache this new transliterator.
318 uhash_iput(cache
, (int32_t) source
, t
, &ec
);
320 // Race case, some other thread beat us to caching this transliterator.
// Swap t and rt: the cached instance will be returned, ours is deleted below.
321 Transliterator
*temp
= rt
;
322 rt
= t
; // Our newly created transliterator that lost the race & now needs deleting.
323 t
= temp
; // The transliterator from the cache that we will return.
326 delete rt
; // will be non-null only in case of races.
333 * Return the script code for a given name, or -1 if not found.
// File-local helper that maps a script name to a UScriptCode.
// The result is USCRIPT_INVALID_CODE when the name is not made of invariant
// characters, when uscript_getCode does not report exactly one code, or when
// it sets a failure status.
// NOTE(review): the declarations of `buf` and `code` and the return
// statement are on lines not visible in this listing; the write to buf[127]
// implies buf has at least 128 elements - confirm.
335 static UScriptCode
scriptNameToCode(const UnicodeString
& name
) {
338 UErrorCode ec
= U_ZERO_ERROR
;
339 int32_t nameLen
= name
.length();
// Record whether the name consists only of invariant characters.
340 UBool isInvariant
= uprv_isInvariantUString(name
.getBuffer(), nameLen
);
// Extract the name into the char buffer, bounded by sizeof(buf).
343 name
.extract(0, nameLen
, buf
, (int32_t)sizeof(buf
), US_INV
);
344 buf
[127] = 0; // Make sure that we NULL terminate the string.
// uscript_getCode is called with a capacity of 1; any outcome other than
// exactly one code, or a failure in ec, is treated as "not a script".
346 if (!isInvariant
|| uscript_getCode(buf
, &code
, 1, &ec
) != 1 || U_FAILURE(ec
))
348 code
= USCRIPT_INVALID_CODE
;
354 * Registers standard transliterators with the system. Called by
355 * Transliterator during initialization. Scan all current targets and
356 * register those that are scripts T as Any-T/V.
// Walks every available source / target / variant combination and registers
// an AnyTransliterator for each target that names a script, skipping the
// "Any" source itself.
// NOTE(review): the declaration of `id`, the remaining constructor arguments,
// any error handling around `tl`, and the closing braces are on lines not
// visible in this listing.
358 void AnyTransliterator::registerIDs() {
360 UErrorCode ec
= U_ZERO_ERROR
;
// Table of target names already handled (see geti/puti below).
361 Hashtable
seen(TRUE
, ec
);
363 int32_t sourceCount
= Transliterator::_countAvailableSources();
364 for (int32_t s
=0; s
<sourceCount
; ++s
) {
365 UnicodeString source
;
366 Transliterator::_getAvailableSource(s
, source
);
368 // Ignore the "Any" source
369 if (source
.caseCompare(ANY
, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
371 int32_t targetCount
= Transliterator::_countAvailableTargets(source
);
372 for (int32_t t
=0; t
<targetCount
; ++t
) {
373 UnicodeString target
;
374 Transliterator::_getAvailableTarget(t
, source
, target
);
376 // Only process each target once
// Because the target is marked as seen here, in the visible code its
// variants are enumerated only for the first source that lists it.
377 if (seen
.geti(target
) != 0) continue;
379 seen
.puti(target
, 1, ec
);
381 // Get the script code for the target. If not a script, ignore.
382 UScriptCode targetScript
= scriptNameToCode(target
);
383 if (targetScript
== USCRIPT_INVALID_CODE
) continue;
385 int32_t variantCount
= Transliterator::_countAvailableVariants(source
, target
);
386 // assert(variantCount >= 1);
387 for (int32_t v
=0; v
<variantCount
; ++v
) {
388 UnicodeString variant
;
389 Transliterator::_getAvailableVariant(v
, source
, target
, variant
);
// Compose the ID from source "Any" (the first 3 UChars of ANY), the target
// and the variant; the result is written to `id`.
392 TransliteratorIDParser::STVtoID(UnicodeString(TRUE
, ANY
, 3), target
, variant
, id
);
394 AnyTransliterator
* tl
= new AnyTransliterator(id
, target
, variant
,
// Register the new instance, and register a special inverse pairing the
// target with the 4-UChar NULL_ID string ("Null").
399 Transliterator::_registerInstance(tl
);
400 Transliterator::_registerSpecialInverse(target
, UnicodeString(TRUE
, NULL_ID
, 4), FALSE
);
409 #endif /* #if !UCONFIG_NO_TRANSLITERATION */