]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/anytrans.cpp
ICU-57131.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / anytrans.cpp
CommitLineData
b75a7d8f
A
1/*
2*****************************************************************
2ca993e8 3* Copyright (c) 2002-2016 International Business Machines Corporation
b75a7d8f
A
4* and others. All Rights Reserved.
5*****************************************************************
6* Date Name Description
7* 06/06/2002 aliu Creation.
8*****************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "unicode/uobject.h"
16#include "unicode/uscript.h"
57a6839d 17
b75a7d8f 18#include "anytrans.h"
b75a7d8f 19#include "hash.h"
57a6839d
A
20#include "mutex.h"
21#include "nultrans.h"
374ca955 22#include "putilimp.h"
57a6839d 23#include "tridpars.h"
73c04bcf 24#include "uinvchar.h"
57a6839d 25#include "uvector.h"
b75a7d8f
A
26
27//------------------------------------------------------------
28// Constants
29
30static const UChar TARGET_SEP = 45; // '-'
31static const UChar VARIANT_SEP = 47; // '/'
2ca993e8 32static const UChar ANY[] = {0x41,0x6E,0x79,0}; // "Any"
b75a7d8f 33static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
2ca993e8
A
34static const UChar LATIN_PIVOT[] = {0x2D,0x4C,0x61,0x74,0x6E,0x3B,0x4C,0x61,0x74,0x6E,0x2D,0}; // "-Latn;Latn-"
35
36// initial size for an Any-XXXX transform's cache of script-XXXX transforms
37// (will grow as necessary, but we don't expect to have source text wit more than 7 scripts)
38#define ANY_TRANS_CACHE_INIT_SIZE 7
b75a7d8f
A
39
40//------------------------------------------------------------
41
42U_CDECL_BEGIN
43/**
44 * Deleter function for Transliterator*.
45 */
46static void U_CALLCONV
47_deleteTransliterator(void *obj) {
57a6839d 48 delete (icu::Transliterator*) obj;
b75a7d8f
A
49}
50U_CDECL_END
51
52//------------------------------------------------------------
53
54U_NAMESPACE_BEGIN
55
56//------------------------------------------------------------
57// ScriptRunIterator
58
59/**
60 * Returns a series of ranges corresponding to scripts. They will be
61 * of the form:
62 *
63 * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second
64 * | | - first run (start, limit)
65 * | | - second run (start, limit)
66 *
67 * That is, the runs will overlap. The reason for this is so that a
68 * transliterator can consider common characters both before and after
69 * the scripts.
70 */
71class ScriptRunIterator : public UMemory {
72private:
73 const Replaceable& text;
74 int32_t textStart;
75 int32_t textLimit;
76
77public:
78 /**
79 * The code of the current run, valid after next() returns. May
80 * be USCRIPT_INVALID_CODE if and only if the entire text is
81 * COMMON/INHERITED.
82 */
83 UScriptCode scriptCode;
84
85 /**
86 * The start of the run, inclusive, valid after next() returns.
87 */
88 int32_t start;
89
90 /**
91 * The end of the run, exclusive, valid after next() returns.
92 */
93 int32_t limit;
57a6839d 94
b75a7d8f
A
95 /**
96 * Constructs a run iterator over the given text from start
97 * (inclusive) to limit (exclusive).
98 */
99 ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);
100
101 /**
102 * Returns TRUE if there are any more runs. TRUE is always
103 * returned at least once. Upon return, the caller should
104 * examine scriptCode, start, and limit.
105 */
106 UBool next();
107
108 /**
109 * Adjusts internal indices for a change in the limit index of the
110 * given delta. A positive delta means the limit has increased.
111 */
112 void adjustLimit(int32_t delta);
113
114private:
115 ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
116 ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
117};
118
119ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
120 int32_t myStart, int32_t myLimit) :
121 text(theText)
122{
123 textStart = myStart;
124 textLimit = myLimit;
125 limit = myStart;
126}
127
128UBool ScriptRunIterator::next() {
129 UChar32 ch;
130 UScriptCode s;
131 UErrorCode ec = U_ZERO_ERROR;
132
133 scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
134 start = limit;
135
136 // Are we done?
137 if (start == textLimit) {
138 return FALSE;
139 }
140
141 // Move start back to include adjacent COMMON or INHERITED
142 // characters
143 while (start > textStart) {
144 ch = text.char32At(start - 1); // look back
145 s = uscript_getScript(ch, &ec);
146 if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
147 --start;
148 } else {
149 break;
150 }
151 }
152
153 // Move limit ahead to include COMMON, INHERITED, and characters
154 // of the current script.
155 while (limit < textLimit) {
156 ch = text.char32At(limit); // look ahead
157 s = uscript_getScript(ch, &ec);
158 if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
159 if (scriptCode == USCRIPT_INVALID_CODE) {
160 scriptCode = s;
161 } else if (s != scriptCode) {
162 break;
163 }
164 }
165 ++limit;
166 }
167
168 // Return TRUE even if the entire text is COMMON / INHERITED, in
169 // which case scriptCode will be USCRIPT_INVALID_CODE.
170 return TRUE;
171}
172
173void ScriptRunIterator::adjustLimit(int32_t delta) {
174 limit += delta;
175 textLimit += delta;
176}
177
178//------------------------------------------------------------
179// AnyTransliterator
180
374ca955 181UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)
b75a7d8f
A
182
183AnyTransliterator::AnyTransliterator(const UnicodeString& id,
184 const UnicodeString& theTarget,
185 const UnicodeString& theVariant,
186 UScriptCode theTargetScript,
187 UErrorCode& ec) :
188 Transliterator(id, NULL),
57a6839d 189 targetScript(theTargetScript)
b75a7d8f 190{
2ca993e8 191 cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec);
46f4442e
A
192 if (U_FAILURE(ec)) {
193 return;
194 }
b75a7d8f
A
195 uhash_setValueDeleter(cache, _deleteTransliterator);
196
197 target = theTarget;
198 if (theVariant.length() > 0) {
199 target.append(VARIANT_SEP).append(theVariant);
200 }
201}
202
203AnyTransliterator::~AnyTransliterator() {
204 uhash_close(cache);
205}
206
207/**
208 * Copy constructor.
209 */
210AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
211 Transliterator(o),
212 target(o.target),
213 targetScript(o.targetScript)
214{
215 // Don't copy the cache contents
216 UErrorCode ec = U_ZERO_ERROR;
2ca993e8 217 cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec);
46f4442e
A
218 if (U_FAILURE(ec)) {
219 return;
220 }
b75a7d8f
A
221 uhash_setValueDeleter(cache, _deleteTransliterator);
222}
223
224/**
225 * Transliterator API.
226 */
227Transliterator* AnyTransliterator::clone() const {
228 return new AnyTransliterator(*this);
229}
230
231/**
232 * Implements {@link Transliterator#handleTransliterate}.
233 */
234void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
235 UBool isIncremental) const {
236 int32_t allStart = pos.start;
237 int32_t allLimit = pos.limit;
238
239 ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
240
241 while (it.next()) {
242 // Ignore runs in the ante context
243 if (it.limit <= allStart) continue;
244
245 // Try to instantiate transliterator from it.scriptCode to
246 // our target or target/variant
247 Transliterator* t = getTransliterator(it.scriptCode);
57a6839d 248
b75a7d8f
A
249 if (t == NULL) {
250 // We have no transliterator. Do nothing, but keep
251 // pos.start up to date.
252 pos.start = it.limit;
253 continue;
254 }
255
256 // If the run end is before the transliteration limit, do
257 // a non-incremental transliteration. Otherwise do an
258 // incremental one.
259 UBool incremental = isIncremental && (it.limit >= allLimit);
57a6839d 260
b75a7d8f
A
261 pos.start = uprv_max(allStart, it.start);
262 pos.limit = uprv_min(allLimit, it.limit);
263 int32_t limit = pos.limit;
264 t->filteredTransliterate(text, pos, incremental);
265 int32_t delta = pos.limit - limit;
266 allLimit += delta;
267 it.adjustLimit(delta);
268
269 // We're done if we enter the post context
270 if (it.limit >= allLimit) break;
271 }
272
273 // Restore limit. pos.start is fine where the last transliterator
274 // left it, or at the end of the last run.
275 pos.limit = allLimit;
276}
277
278Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
279
280 if (source == targetScript || source == USCRIPT_INVALID_CODE) {
281 return NULL;
282 }
283
57a6839d
A
284 Transliterator* t = NULL;
285 {
286 Mutex m(NULL);
287 t = (Transliterator*) uhash_iget(cache, (int32_t) source);
288 }
b75a7d8f
A
289 if (t == NULL) {
290 UErrorCode ec = U_ZERO_ERROR;
2ca993e8 291 UnicodeString sourceName(uscript_getShortName(source), -1, US_INV);
b75a7d8f
A
292 UnicodeString id(sourceName);
293 id.append(TARGET_SEP).append(target);
57a6839d 294
b75a7d8f
A
295 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
296 if (U_FAILURE(ec) || t == NULL) {
297 delete t;
57a6839d 298
b75a7d8f
A
299 // Try to pivot around Latin, our most common script
300 id = sourceName;
4388f060 301 id.append(LATIN_PIVOT, -1).append(target);
b75a7d8f
A
302 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
303 if (U_FAILURE(ec) || t == NULL) {
304 delete t;
305 t = NULL;
306 }
307 }
308
309 if (t != NULL) {
57a6839d
A
310 Transliterator *rt = NULL;
311 {
312 Mutex m(NULL);
313 rt = static_cast<Transliterator *> (uhash_iget(cache, (int32_t) source));
314 if (rt == NULL) {
315 // Common case, no race to cache this new transliterator.
316 uhash_iput(cache, (int32_t) source, t, &ec);
317 } else {
318 // Race case, some other thread beat us to caching this transliterator.
319 Transliterator *temp = rt;
320 rt = t; // Our newly created transliterator that lost the race & now needs deleting.
321 t = temp; // The transliterator from the cache that we will return.
322 }
323 }
324 delete rt; // will be non-null only in case of races.
b75a7d8f
A
325 }
326 }
b75a7d8f
A
327 return t;
328}
329
330/**
331 * Return the script code for a given name, or -1 if not found.
332 */
73c04bcf 333static UScriptCode scriptNameToCode(const UnicodeString& name) {
b75a7d8f
A
334 char buf[128];
335 UScriptCode code;
336 UErrorCode ec = U_ZERO_ERROR;
73c04bcf
A
337 int32_t nameLen = name.length();
338 UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);
57a6839d 339
73c04bcf
A
340 if (isInvariant) {
341 name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
342 buf[127] = 0; // Make sure that we NULL terminate the string.
343 }
344 if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
345 {
b75a7d8f
A
346 code = USCRIPT_INVALID_CODE;
347 }
348 return code;
349}
350
351/**
352 * Registers standard transliterators with the system. Called by
353 * Transliterator during initialization. Scan all current targets and
354 * register those that are scripts T as Any-T/V.
355 */
356void AnyTransliterator::registerIDs() {
357
374ca955
A
358 UErrorCode ec = U_ZERO_ERROR;
359 Hashtable seen(TRUE, ec);
b75a7d8f
A
360
361 int32_t sourceCount = Transliterator::_countAvailableSources();
362 for (int32_t s=0; s<sourceCount; ++s) {
363 UnicodeString source;
364 Transliterator::_getAvailableSource(s, source);
365
366 // Ignore the "Any" source
4388f060 367 if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
b75a7d8f
A
368
369 int32_t targetCount = Transliterator::_countAvailableTargets(source);
370 for (int32_t t=0; t<targetCount; ++t) {
371 UnicodeString target;
372 Transliterator::_getAvailableTarget(t, source, target);
373
374 // Only process each target once
375 if (seen.geti(target) != 0) continue;
376 ec = U_ZERO_ERROR;
377 seen.puti(target, 1, ec);
57a6839d 378
b75a7d8f
A
379 // Get the script code for the target. If not a script, ignore.
380 UScriptCode targetScript = scriptNameToCode(target);
381 if (targetScript == USCRIPT_INVALID_CODE) continue;
382
383 int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
384 // assert(variantCount >= 1);
385 for (int32_t v=0; v<variantCount; ++v) {
386 UnicodeString variant;
387 Transliterator::_getAvailableVariant(v, source, target, variant);
57a6839d 388
b75a7d8f 389 UnicodeString id;
4388f060 390 TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id);
b75a7d8f
A
391 ec = U_ZERO_ERROR;
392 AnyTransliterator* t = new AnyTransliterator(id, target, variant,
393 targetScript, ec);
394 if (U_FAILURE(ec)) {
395 delete t;
396 } else {
397 Transliterator::_registerInstance(t);
4388f060 398 Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE);
b75a7d8f
A
399 }
400 }
401 }
402 }
403}
404
405U_NAMESPACE_END
406
407#endif /* #if !UCONFIG_NO_TRANSLITERATION */
408
409//eof