2 *****************************************************************
3 * Copyright (c) 2002-2005, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 *****************************************************************
6 * Date Name Description
7 * 06/06/2002 aliu Creation.
8 *****************************************************************
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_TRANSLITERATION
15 #include "unicode/uobject.h"
16 #include "unicode/uscript.h"
25 //------------------------------------------------------------
// File-local UChar constants used to build and compare transliterator IDs.
// They are spelled as numeric code units rather than character literals.

// Separator appended between the source script name and the target when
// getTransliterator builds a source-to-target ID.
28 static const UChar TARGET_SEP
= 45; // '-'
// Separator appended between the target and a non-empty variant in the
// AnyTransliterator constructor.
29 static const UChar VARIANT_SEP
= 47; // '/'
// NUL-terminated source name: registerIDs skips sources that case-insensitively
// equal it, and passes it as the source argument to STVtoID.
30 static const UChar ANY
[] = {65,110,121,0}; // "Any"
// NUL-terminated ID passed to _registerSpecialInverse in registerIDs.
31 static const UChar NULL_ID
[] = {78,117,108,108,0}; // "Null"
// NUL-terminated fragment appended (followed by the target) in
// getTransliterator when the direct source-to-target lookup fails.
32 static const UChar LATIN_PIVOT
[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"
34 //------------------------------------------------------------
38 * Deleter function for Transliterator*.
// Value deleter installed on the cache hashtable via uhash_setValueDeleter
// in both AnyTransliterator constructors.
// obj: untyped pointer that is cast to Transliterator* and deleted.
40 static void U_CALLCONV
41 _deleteTransliterator(void *obj
) {
42 delete (Transliterator
*) obj
;
46 //------------------------------------------------------------
50 //------------------------------------------------------------
54 * Returns a series of ranges corresponding to scripts. They will be
57 * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second
58 * | | - first run (start, limit)
59 * | | - second run (start, limit)
61 * That is, the runs will overlap. The reason for this is so that a
62 * transliterator can consider common characters both before and after
// Iterator over script runs of a Replaceable text, derived from UMemory.
// Callers drive it with next() and then read scriptCode, start and limit
// (handleTransliterate reads it.scriptCode, it.start and it.limit directly).
65 class ScriptRunIterator
: public UMemory
{
// Text being scanned. Held as a const reference, not a copy.
67 const Replaceable
& text
;
73 * The code of the current run, valid after next() returns. May
74 * be USCRIPT_INVALID_CODE if and only if the entire text is
// Script of the current run. next() resets it to USCRIPT_INVALID_CODE
// before scanning.
77 UScriptCode scriptCode
;
80 * The start of the run, inclusive, valid after next() returns.
85 * The end of the run, exclusive, valid after next() returns.
90 * Constructs a run iterator over the given text from start
91 * (inclusive) to limit (exclusive).
93 ScriptRunIterator(const Replaceable
& text
, int32_t start
, int32_t limit
);
96 * Returns TRUE if there are any more runs. TRUE is always
97 * returned at least once. Upon return, the caller should
98 * examine scriptCode, start, and limit.
103 * Adjusts internal indices for a change in the limit index of the
104 * given delta. A positive delta means the limit has increased.
106 void adjustLimit(int32_t delta
);
// Copy constructor and copy assignment are declared only to forbid copying.
// NOTE(review): no definitions are visible in this file; confirm they are
// intentionally left undefined.
109 ScriptRunIterator(const ScriptRunIterator
&other
); // forbid copying of this class
110 ScriptRunIterator
&operator=(const ScriptRunIterator
&other
); // forbid copying of this class
// Constructor definition.
// theText: text to iterate over.
// myStart, myLimit: bounds of the iteration; the class declaration documents
// them as start (inclusive) and limit (exclusive).
// NOTE(review): the member initializer list follows the colon; presumably it
// seeds text, textStart, textLimit and limit. TODO confirm.
113 ScriptRunIterator::ScriptRunIterator(const Replaceable
& theText
,
114 int32_t myStart
, int32_t myLimit
) :
// Advances to the next script run and publishes it through scriptCode, start
// and limit. The run is widened on both sides to take in neighbouring
// COMMON / INHERITED characters, which is why consecutive runs can overlap.
// Returns a UBool; per the class declaration, TRUE is returned at least once.
122 UBool
ScriptRunIterator::next() {
// Error code handed to uscript_getScript below.
125 UErrorCode ec
= U_ZERO_ERROR
;
127 scriptCode
= USCRIPT_INVALID_CODE
; // don't know script yet
// Test for the previous run having reached the end of the text.
// NOTE(review): presumably this branch returns FALSE. TODO confirm.
131 if (start
== textLimit
) {
135 // Move start back to include adjacent COMMON or INHERITED
// Never moves start below textStart. Each step inspects the code point just
// before start.
137 while (start
> textStart
) {
138 ch
= text
.char32At(start
- 1); // look back
139 s
= uscript_getScript(ch
, &ec
);
140 if (s
== USCRIPT_COMMON
|| s
== USCRIPT_INHERITED
) {
147 // Move limit ahead to include COMMON, INHERITED, and characters
148 // of the current script.
// Never moves limit past textLimit. Each step inspects the code point at limit.
149 while (limit
< textLimit
) {
150 ch
= text
.char32At(limit
); // look ahead
151 s
= uscript_getScript(ch
, &ec
);
// A character with a real script either establishes the run script (when
// scriptCode is still USCRIPT_INVALID_CODE) or, if it differs from the run
// script, marks the end of the run.
// NOTE(review): the assignment and the loop exit are presumed. TODO confirm.
152 if (s
!= USCRIPT_COMMON
&& s
!= USCRIPT_INHERITED
) {
153 if (scriptCode
== USCRIPT_INVALID_CODE
) {
155 } else if (s
!= scriptCode
) {
162 // Return TRUE even if the entire text is COMMON / INHERITED, in
163 // which case scriptCode will be USCRIPT_INVALID_CODE.
// Shifts the iterator indices after the text length changed.
// delta: change in the limit index; positive means the limit has increased
// (per the class declaration). handleTransliterate passes the change in
// pos.limit caused by a transliteration step.
167 void ScriptRunIterator::adjustLimit(int32_t delta
) {
172 //------------------------------------------------------------
// Macro invocation for AnyTransliterator.
// NOTE(review): presumably expands to the class ID functions declared in the
// class header. TODO confirm against unicode/uobject.h.
175 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator
)
// Main constructor.
// id: ID forwarded to the Transliterator base class (with a NULL second
//     argument).
// theTarget: target name. NOTE(review): presumably copied into the target
//     member. TODO confirm.
// theVariant: variant name; when non-empty it is appended to target after a
//     VARIANT_SEP.
// theTargetScript: stored in targetScript.
// NOTE(review): ec used below is presumably a trailing UErrorCode reference
// parameter. TODO confirm.
177 AnyTransliterator::AnyTransliterator(const UnicodeString
& id
,
178 const UnicodeString
& theTarget
,
179 const UnicodeString
& theVariant
,
180 UScriptCode theTargetScript
,
182 Transliterator(id
, NULL
),
183 targetScript(theTargetScript
)
// Per-instance cache with integer keys (uhash_hashLong / uhash_compareLong).
// Values are deleted through _deleteTransliterator.
185 cache
= uhash_open(uhash_hashLong
, uhash_compareLong
, NULL
, &ec
);
186 uhash_setValueDeleter(cache
, _deleteTransliterator
);
// Fold a non-empty variant into the stored target, giving target/variant.
189 if (theVariant
.length() > 0) {
190 target
.append(VARIANT_SEP
).append(theVariant
);
// Destructor.
// NOTE(review): presumably closes the cache hashtable, which would delete the
// cached transliterators through the installed value deleter. TODO confirm.
194 AnyTransliterator::~AnyTransliterator() {
// Copy constructor. Copies targetScript from o and builds a fresh, empty
// cache instead of sharing or duplicating the cache of o.
// NOTE(review): other initializers (base class, target) are presumed.
// TODO confirm.
201 AnyTransliterator::AnyTransliterator(const AnyTransliterator
& o
) :
204 targetScript(o
.targetScript
)
206 // Don't copy the cache contents
// ec is a local error code for uhash_open.
// NOTE(review): no check of ec is visible here; confirm how a failed
// uhash_open is handled.
207 UErrorCode ec
= U_ZERO_ERROR
;
208 cache
= uhash_open(uhash_hashLong
, uhash_compareLong
, NULL
, &ec
);
209 uhash_setValueDeleter(cache
, _deleteTransliterator
);
213 * Transliterator API.
// Returns a newly allocated AnyTransliterator produced by the copy
// constructor, so the clone starts with its own empty cache.
// NOTE(review): presumably the caller owns the returned object. TODO confirm.
215 Transliterator
* AnyTransliterator::clone() const {
216 return new AnyTransliterator(*this);
220 * Implements {@link Transliterator#handleTransliterate}.
// Transliterates text by splitting it into script runs and delegating each
// run to the transliterator returned by getTransliterator for that script.
// text: text to modify in place.
// pos: position block. start and limit are rewritten per run, and limit is
//      restored to allLimit at the end.
// isIncremental: caller request for incremental mode; it is only forwarded
//      for a run that reaches the overall limit.
222 void AnyTransliterator::handleTransliterate(Replaceable
& text
, UTransPosition
& pos
,
223 UBool isIncremental
) const {
// Remember the overall range, because pos.start and pos.limit are narrowed
// to each run below.
224 int32_t allStart
= pos
.start
;
225 int32_t allLimit
= pos
.limit
;
// Runs are computed over the whole context range, not just start..limit.
227 ScriptRunIterator
it(text
, pos
.contextStart
, pos
.contextLimit
);
230 // Ignore runs in the ante context
231 if (it
.limit
<= allStart
) continue;
233 // Try to instantiate transliterator from it.scriptCode to
234 // our target or target/variant
235 Transliterator
* t
= getTransliterator(it
.scriptCode
);
238 // We have no transliterator. Do nothing, but keep
239 // pos.start up to date.
240 pos
.start
= it
.limit
;
244 // If the run end is before the transliteration limit, do
245 // a non-incremental transliteration. Otherwise do an
247 UBool incremental
= isIncremental
&& (it
.limit
>= allLimit
);
// Clip the run to the requested range before delegating.
249 pos
.start
= uprv_max(allStart
, it
.start
);
250 pos
.limit
= uprv_min(allLimit
, it
.limit
);
// Record the limit before the call so the length change can be measured.
251 int32_t limit
= pos
.limit
;
252 t
->filteredTransliterate(text
, pos
, incremental
);
253 int32_t delta
= pos
.limit
- limit
;
// Keep the run iterator in step with the changed text length.
// NOTE(review): allLimit presumably needs the same adjustment. TODO confirm.
255 it
.adjustLimit(delta
);
257 // We're done if we enter the post context
258 if (it
.limit
>= allLimit
) break;
261 // Restore limit. pos.start is fine where the last transliterator
262 // left it, or at the end of the last run.
263 pos
.limit
= allLimit
;
// Looks up, and on a miss tries to create and cache, the transliterator that
// converts the given source script to this object's target.
// source: script code of the run being processed.
// Returns a Transliterator pointer; handleTransliterate has a path for the
// case where no transliterator is available.
// NOTE(review): cache is modified from this const method and no locking is
// visible here; confirm the threading expectations of callers.
266 Transliterator
* AnyTransliterator::getTransliterator(UScriptCode source
) const {
// Runs already in the target script, and runs with no script at all, are
// special-cased.
// NOTE(review): presumably returns NULL so the run is left untouched.
// TODO confirm.
268 if (source
== targetScript
|| source
== USCRIPT_INVALID_CODE
) {
// The cache is keyed by the script code cast to int32_t.
272 Transliterator
* t
= (Transliterator
*) uhash_iget(cache
, (int32_t) source
);
274 UErrorCode ec
= U_ZERO_ERROR
;
// First attempt: an ID of the form sourceName, TARGET_SEP, target.
275 UnicodeString
sourceName(uscript_getName(source
), -1, US_INV
);
276 UnicodeString
id(sourceName
);
277 id
.append(TARGET_SEP
).append(target
);
279 t
= Transliterator::createInstance(id
, UTRANS_FORWARD
, ec
);
280 if (U_FAILURE(ec
) || t
== NULL
) {
283 // Try to pivot around Latin, our most common script
// Second attempt: LATIN_PIVOT and the target are appended to id.
// NOTE(review): id is presumably reset to sourceName first. TODO confirm.
285 id
.append(LATIN_PIVOT
).append(target
);
286 t
= Transliterator::createInstance(id
, UTRANS_FORWARD
, ec
);
287 if (U_FAILURE(ec
) || t
== NULL
) {
// Store the result under the script code. The cache has a value deleter
// installed, so it deletes stored transliterators itself.
294 uhash_iput(cache
, (int32_t) source
, t
, &ec
);
302 * Return the script code for a given name, or -1 if not found.
// Maps a script name to a UScriptCode.
// name: candidate script name.
// The result is USCRIPT_INVALID_CODE when the name contains non-invariant
// characters, when uscript_getCode does not report exactly one code, or when
// it sets a failure status.
304 static UScriptCode
scriptNameToCode(const UnicodeString
& name
) {
307 UErrorCode ec
= U_ZERO_ERROR
;
308 int32_t nameLen
= name
.length();
// Checked up front because the name is converted with US_INV below.
309 UBool isInvariant
= uprv_isInvariantUString(name
.getBuffer(), nameLen
);
// Convert into the local char buffer, limited to sizeof(buf).
// NOTE(review): the write to index 127 below implies buf holds at least 128
// chars. TODO confirm.
312 name
.extract(0, nameLen
, buf
, (int32_t)sizeof(buf
), US_INV
);
313 buf
[127] = 0; // Make sure that we NULL terminate the string.
// A capacity of 1 is passed, and any outcome other than exactly one code
// counts as not found.
315 if (!isInvariant
|| uscript_getCode(buf
, &code
, 1, &ec
) != 1 || U_FAILURE(ec
))
317 code
= USCRIPT_INVALID_CODE
;
323 * Registers standard transliterators with the system. Called by
324 * Transliterator during initialization. Scan all current targets and
325 * register those that are scripts T as Any-T/V.
// Walks every available source/target/variant combination and, for each
// target that names a script, registers an AnyTransliterator built from the
// ID that STVtoID produces for (ANY, target, variant).
327 void AnyTransliterator::registerIDs() {
329 UErrorCode ec
= U_ZERO_ERROR
;
// Set of target names already handled, used through geti / puti with the
// value 1.
330 Hashtable
seen(TRUE
, ec
);
332 int32_t sourceCount
= Transliterator::_countAvailableSources();
333 for (int32_t s
=0; s
<sourceCount
; ++s
) {
334 UnicodeString source
;
335 Transliterator::_getAvailableSource(s
, source
);
337 // Ignore the "Any" source
338 if (source
.caseCompare(ANY
, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
340 int32_t targetCount
= Transliterator::_countAvailableTargets(source
);
341 for (int32_t t
=0; t
<targetCount
; ++t
) {
342 UnicodeString target
;
343 Transliterator::_getAvailableTarget(t
, source
, target
);
345 // Only process each target once
// NOTE(review): the target is marked as seen on its first encounter, so the
// variants below are enumerated only for the first source offering it.
// Confirm this is intended.
346 if (seen
.geti(target
) != 0) continue;
348 seen
.puti(target
, 1, ec
);
350 // Get the script code for the target. If not a script, ignore.
351 UScriptCode targetScript
= scriptNameToCode(target
);
352 if (targetScript
== USCRIPT_INVALID_CODE
) continue;
354 int32_t variantCount
= Transliterator::_countAvailableVariants(source
, target
);
355 // assert(variantCount >= 1);
356 for (int32_t v
=0; v
<variantCount
; ++v
) {
357 UnicodeString variant
;
358 Transliterator::_getAvailableVariant(v
, source
, target
, variant
);
// Build the ID from ANY as source plus this target and variant.
361 TransliteratorIDParser::STVtoID(ANY
, target
, variant
, id
);
// This pointer t shadows the int32_t loop index t of the enclosing target loop.
363 AnyTransliterator
* t
= new AnyTransliterator(id
, target
, variant
,
// The new instance is handed to the registry, and the target is paired with
// NULL_ID as its special inverse.
368 Transliterator::_registerInstance(t
);
369 Transliterator::_registerSpecialInverse(target
, NULL_ID
, FALSE
);
378 #endif /* #if !UCONFIG_NO_TRANSLITERATION */