]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/anytrans.cpp
ICU-57166.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / anytrans.cpp
1 /*
2 *****************************************************************
3 * Copyright (c) 2002-2016 International Business Machines Corporation
4 * and others. All Rights Reserved.
5 *****************************************************************
6 * Date Name Description
7 * 06/06/2002 aliu Creation.
8 *****************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_TRANSLITERATION
14
15 #include "unicode/uobject.h"
16 #include "unicode/uscript.h"
17
18 #include "anytrans.h"
19 #include "hash.h"
20 #include "mutex.h"
21 #include "nultrans.h"
22 #include "putilimp.h"
23 #include "tridpars.h"
24 #include "uinvchar.h"
25 #include "uvector.h"
26
27 //------------------------------------------------------------
28 // Constants
29
30 static const UChar TARGET_SEP = 45; // '-'
31 static const UChar VARIANT_SEP = 47; // '/'
32 static const UChar ANY[] = {0x41,0x6E,0x79,0}; // "Any"
33 static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
34 static const UChar LATIN_PIVOT[] = {0x2D,0x4C,0x61,0x74,0x6E,0x3B,0x4C,0x61,0x74,0x6E,0x2D,0}; // "-Latn;Latn-"
35
36 // initial size for an Any-XXXX transform's cache of script-XXXX transforms
37 // (will grow as necessary, but we don't expect to have source text with more than 7 scripts)
38 #define ANY_TRANS_CACHE_INIT_SIZE 7
39
40 //------------------------------------------------------------
41
42 U_CDECL_BEGIN
43 /**
44 * Deleter function for Transliterator*.
45 */
46 static void U_CALLCONV
47 _deleteTransliterator(void *obj) {
48 delete (icu::Transliterator*) obj;
49 }
50 U_CDECL_END
51
52 //------------------------------------------------------------
53
54 U_NAMESPACE_BEGIN
55
56 //------------------------------------------------------------
57 // ScriptRunIterator
58
59 /**
60 * Returns a series of ranges corresponding to scripts. They will be
61 * of the form:
62 *
63 * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second
64 * | | - first run (start, limit)
65 * | | - second run (start, limit)
66 *
67 * That is, the runs will overlap. The reason for this is so that a
68 * transliterator can consider common characters both before and after
69 * the scripts.
70 */
71 class ScriptRunIterator : public UMemory {
72 private:
73 const Replaceable& text;
74 int32_t textStart;
75 int32_t textLimit;
76
77 public:
78 /**
79 * The code of the current run, valid after next() returns. May
80 * be USCRIPT_INVALID_CODE if and only if the entire text is
81 * COMMON/INHERITED.
82 */
83 UScriptCode scriptCode;
84
85 /**
86 * The start of the run, inclusive, valid after next() returns.
87 */
88 int32_t start;
89
90 /**
91 * The end of the run, exclusive, valid after next() returns.
92 */
93 int32_t limit;
94
95 /**
96 * Constructs a run iterator over the given text from start
97 * (inclusive) to limit (exclusive).
98 */
99 ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);
100
101 /**
102 * Returns TRUE if there are any more runs. TRUE is always
103 * returned at least once. Upon return, the caller should
104 * examine scriptCode, start, and limit.
105 */
106 UBool next();
107
108 /**
109 * Adjusts internal indices for a change in the limit index of the
110 * given delta. A positive delta means the limit has increased.
111 */
112 void adjustLimit(int32_t delta);
113
114 private:
115 ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
116 ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
117 };
118
119 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
120 int32_t myStart, int32_t myLimit) :
121 text(theText)
122 {
123 textStart = myStart;
124 textLimit = myLimit;
125 limit = myStart;
126 }
127
128 UBool ScriptRunIterator::next() {
129 UChar32 ch;
130 UScriptCode s;
131 UErrorCode ec = U_ZERO_ERROR;
132
133 scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
134 start = limit;
135
136 // Are we done?
137 if (start == textLimit) {
138 return FALSE;
139 }
140
141 // Move start back to include adjacent COMMON or INHERITED
142 // characters
143 while (start > textStart) {
144 ch = text.char32At(start - 1); // look back
145 s = uscript_getScript(ch, &ec);
146 if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
147 --start;
148 } else {
149 break;
150 }
151 }
152
153 // Move limit ahead to include COMMON, INHERITED, and characters
154 // of the current script.
155 while (limit < textLimit) {
156 ch = text.char32At(limit); // look ahead
157 s = uscript_getScript(ch, &ec);
158 if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
159 if (scriptCode == USCRIPT_INVALID_CODE) {
160 scriptCode = s;
161 } else if (s != scriptCode) {
162 break;
163 }
164 }
165 ++limit;
166 }
167
168 // Return TRUE even if the entire text is COMMON / INHERITED, in
169 // which case scriptCode will be USCRIPT_INVALID_CODE.
170 return TRUE;
171 }
172
173 void ScriptRunIterator::adjustLimit(int32_t delta) {
174 limit += delta;
175 textLimit += delta;
176 }
177
178 //------------------------------------------------------------
179 // AnyTransliterator
180
181 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)
182
183 AnyTransliterator::AnyTransliterator(const UnicodeString& id,
184 const UnicodeString& theTarget,
185 const UnicodeString& theVariant,
186 UScriptCode theTargetScript,
187 UErrorCode& ec) :
188 Transliterator(id, NULL),
189 targetScript(theTargetScript)
190 {
191 cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec);
192 if (U_FAILURE(ec)) {
193 return;
194 }
195 uhash_setValueDeleter(cache, _deleteTransliterator);
196
197 target = theTarget;
198 if (theVariant.length() > 0) {
199 target.append(VARIANT_SEP).append(theVariant);
200 }
201 }
202
203 AnyTransliterator::~AnyTransliterator() {
204 uhash_close(cache);
205 }
206
207 /**
208 * Copy constructor.
209 */
210 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
211 Transliterator(o),
212 target(o.target),
213 targetScript(o.targetScript)
214 {
215 // Don't copy the cache contents
216 UErrorCode ec = U_ZERO_ERROR;
217 cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec);
218 if (U_FAILURE(ec)) {
219 return;
220 }
221 uhash_setValueDeleter(cache, _deleteTransliterator);
222 }
223
224 /**
225 * Transliterator API.
226 */
227 Transliterator* AnyTransliterator::clone() const {
228 return new AnyTransliterator(*this);
229 }
230
231 /**
232 * Implements {@link Transliterator#handleTransliterate}.
233 */
234 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
235 UBool isIncremental) const {
236 int32_t allStart = pos.start;
237 int32_t allLimit = pos.limit;
238
239 ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
240
241 while (it.next()) {
242 // Ignore runs in the ante context
243 if (it.limit <= allStart) continue;
244
245 // Try to instantiate transliterator from it.scriptCode to
246 // our target or target/variant
247 Transliterator* t = getTransliterator(it.scriptCode);
248
249 if (t == NULL) {
250 // We have no transliterator. Do nothing, but keep
251 // pos.start up to date.
252 pos.start = it.limit;
253 continue;
254 }
255
256 // If the run end is before the transliteration limit, do
257 // a non-incremental transliteration. Otherwise do an
258 // incremental one.
259 UBool incremental = isIncremental && (it.limit >= allLimit);
260
261 pos.start = uprv_max(allStart, it.start);
262 pos.limit = uprv_min(allLimit, it.limit);
263 int32_t limit = pos.limit;
264 t->filteredTransliterate(text, pos, incremental);
265 int32_t delta = pos.limit - limit;
266 allLimit += delta;
267 it.adjustLimit(delta);
268
269 // We're done if we enter the post context
270 if (it.limit >= allLimit) break;
271 }
272
273 // Restore limit. pos.start is fine where the last transliterator
274 // left it, or at the end of the last run.
275 pos.limit = allLimit;
276 }
277
278 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
279
280 if (source == targetScript || source == USCRIPT_INVALID_CODE) {
281 return NULL;
282 }
283
284 Transliterator* t = NULL;
285 {
286 Mutex m(NULL);
287 t = (Transliterator*) uhash_iget(cache, (int32_t) source);
288 }
289 if (t == NULL) {
290 UErrorCode ec = U_ZERO_ERROR;
291 UnicodeString sourceName(uscript_getShortName(source), -1, US_INV);
292 UnicodeString id(sourceName);
293 id.append(TARGET_SEP).append(target);
294
295 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
296 if (U_FAILURE(ec) || t == NULL) {
297 delete t;
298
299 // Try to pivot around Latin, our most common script
300 id = sourceName;
301 id.append(LATIN_PIVOT, -1).append(target);
302 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
303 if (U_FAILURE(ec) || t == NULL) {
304 delete t;
305 t = NULL;
306 }
307 }
308
309 if (t != NULL) {
310 Transliterator *rt = NULL;
311 {
312 Mutex m(NULL);
313 rt = static_cast<Transliterator *> (uhash_iget(cache, (int32_t) source));
314 if (rt == NULL) {
315 // Common case, no race to cache this new transliterator.
316 uhash_iput(cache, (int32_t) source, t, &ec);
317 } else {
318 // Race case, some other thread beat us to caching this transliterator.
319 Transliterator *temp = rt;
320 rt = t; // Our newly created transliterator that lost the race & now needs deleting.
321 t = temp; // The transliterator from the cache that we will return.
322 }
323 }
324 delete rt; // will be non-null only in case of races.
325 }
326 }
327 return t;
328 }
329
330 /**
331 * Return the script code for a given name, or -1 if not found.
332 */
333 static UScriptCode scriptNameToCode(const UnicodeString& name) {
334 char buf[128];
335 UScriptCode code;
336 UErrorCode ec = U_ZERO_ERROR;
337 int32_t nameLen = name.length();
338 UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);
339
340 if (isInvariant) {
341 name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
342 buf[127] = 0; // Make sure that we NULL terminate the string.
343 }
344 if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
345 {
346 code = USCRIPT_INVALID_CODE;
347 }
348 return code;
349 }
350
351 /**
352 * Registers standard transliterators with the system. Called by
353 * Transliterator during initialization. Scan all current targets and
354 * register those that are scripts T as Any-T/V.
355 */
356 void AnyTransliterator::registerIDs() {
357
358 UErrorCode ec = U_ZERO_ERROR;
359 Hashtable seen(TRUE, ec);
360
361 int32_t sourceCount = Transliterator::_countAvailableSources();
362 for (int32_t s=0; s<sourceCount; ++s) {
363 UnicodeString source;
364 Transliterator::_getAvailableSource(s, source);
365
366 // Ignore the "Any" source
367 if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
368
369 int32_t targetCount = Transliterator::_countAvailableTargets(source);
370 for (int32_t t=0; t<targetCount; ++t) {
371 UnicodeString target;
372 Transliterator::_getAvailableTarget(t, source, target);
373
374 // Only process each target once
375 if (seen.geti(target) != 0) continue;
376 ec = U_ZERO_ERROR;
377 seen.puti(target, 1, ec);
378
379 // Get the script code for the target. If not a script, ignore.
380 UScriptCode targetScript = scriptNameToCode(target);
381 if (targetScript == USCRIPT_INVALID_CODE) continue;
382
383 int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
384 // assert(variantCount >= 1);
385 for (int32_t v=0; v<variantCount; ++v) {
386 UnicodeString variant;
387 Transliterator::_getAvailableVariant(v, source, target, variant);
388
389 UnicodeString id;
390 TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id);
391 ec = U_ZERO_ERROR;
392 AnyTransliterator* t = new AnyTransliterator(id, target, variant,
393 targetScript, ec);
394 if (U_FAILURE(ec)) {
395 delete t;
396 } else {
397 Transliterator::_registerInstance(t);
398 Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE);
399 }
400 }
401 }
402 }
403 }
404
405 U_NAMESPACE_END
406
407 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
408
409 //eof