]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/anytrans.cpp
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / i18n / anytrans.cpp
1 /*
2 *****************************************************************
3 * Copyright (c) 2002-2004, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 *****************************************************************
6 * Date Name Description
7 * 06/06/2002 aliu Creation.
8 *****************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_TRANSLITERATION
14
15 #include "unicode/uobject.h"
16 #include "unicode/uscript.h"
17 #include "nultrans.h"
18 #include "anytrans.h"
19 #include "uvector.h"
20 #include "tridpars.h"
21 #include "hash.h"
22 #include "putilimp.h"
23
24 //------------------------------------------------------------
25 // Constants
26
27 static const UChar TARGET_SEP = 45; // '-'
28 static const UChar VARIANT_SEP = 47; // '/'
29 static const UChar ANY[] = {65,110,121,0}; // "Any"
30 static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
31 static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"
32
33 //------------------------------------------------------------
34
35 U_CDECL_BEGIN
36 /**
37 * Deleter function for Transliterator*.
38 */
39 static void U_CALLCONV
40 _deleteTransliterator(void *obj) {
41 delete (Transliterator*) obj;
42 }
43 U_CDECL_END
44
45 //------------------------------------------------------------
46
47 U_NAMESPACE_BEGIN
48
49 //------------------------------------------------------------
50 // ScriptRunIterator
51
52 /**
53 * Returns a series of ranges corresponding to scripts. They will be
54 * of the form:
55 *
56 * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second
57 * | | - first run (start, limit)
58 * | | - second run (start, limit)
59 *
60 * That is, the runs will overlap. The reason for this is so that a
61 * transliterator can consider common characters both before and after
62 * the scripts.
63 */
64 class ScriptRunIterator : public UMemory {
65 private:
66 const Replaceable& text;
67 int32_t textStart;
68 int32_t textLimit;
69
70 public:
71 /**
72 * The code of the current run, valid after next() returns. May
73 * be USCRIPT_INVALID_CODE if and only if the entire text is
74 * COMMON/INHERITED.
75 */
76 UScriptCode scriptCode;
77
78 /**
79 * The start of the run, inclusive, valid after next() returns.
80 */
81 int32_t start;
82
83 /**
84 * The end of the run, exclusive, valid after next() returns.
85 */
86 int32_t limit;
87
88 /**
89 * Constructs a run iterator over the given text from start
90 * (inclusive) to limit (exclusive).
91 */
92 ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);
93
94 /**
95 * Returns TRUE if there are any more runs. TRUE is always
96 * returned at least once. Upon return, the caller should
97 * examine scriptCode, start, and limit.
98 */
99 UBool next();
100
101 /**
102 * Adjusts internal indices for a change in the limit index of the
103 * given delta. A positive delta means the limit has increased.
104 */
105 void adjustLimit(int32_t delta);
106
107 private:
108 ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
109 ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
110 };
111
112 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
113 int32_t myStart, int32_t myLimit) :
114 text(theText)
115 {
116 textStart = myStart;
117 textLimit = myLimit;
118 limit = myStart;
119 }
120
121 UBool ScriptRunIterator::next() {
122 UChar32 ch;
123 UScriptCode s;
124 UErrorCode ec = U_ZERO_ERROR;
125
126 scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
127 start = limit;
128
129 // Are we done?
130 if (start == textLimit) {
131 return FALSE;
132 }
133
134 // Move start back to include adjacent COMMON or INHERITED
135 // characters
136 while (start > textStart) {
137 ch = text.char32At(start - 1); // look back
138 s = uscript_getScript(ch, &ec);
139 if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
140 --start;
141 } else {
142 break;
143 }
144 }
145
146 // Move limit ahead to include COMMON, INHERITED, and characters
147 // of the current script.
148 while (limit < textLimit) {
149 ch = text.char32At(limit); // look ahead
150 s = uscript_getScript(ch, &ec);
151 if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
152 if (scriptCode == USCRIPT_INVALID_CODE) {
153 scriptCode = s;
154 } else if (s != scriptCode) {
155 break;
156 }
157 }
158 ++limit;
159 }
160
161 // Return TRUE even if the entire text is COMMON / INHERITED, in
162 // which case scriptCode will be USCRIPT_INVALID_CODE.
163 return TRUE;
164 }
165
166 void ScriptRunIterator::adjustLimit(int32_t delta) {
167 limit += delta;
168 textLimit += delta;
169 }
170
171 //------------------------------------------------------------
172 // AnyTransliterator
173
174 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)
175
176 AnyTransliterator::AnyTransliterator(const UnicodeString& id,
177 const UnicodeString& theTarget,
178 const UnicodeString& theVariant,
179 UScriptCode theTargetScript,
180 UErrorCode& ec) :
181 Transliterator(id, NULL),
182 targetScript(theTargetScript)
183 {
184 cache = uhash_open(uhash_hashLong, uhash_compareLong, &ec);
185 uhash_setValueDeleter(cache, _deleteTransliterator);
186
187 target = theTarget;
188 if (theVariant.length() > 0) {
189 target.append(VARIANT_SEP).append(theVariant);
190 }
191 }
192
193 AnyTransliterator::~AnyTransliterator() {
194 uhash_close(cache);
195 }
196
197 /**
198 * Copy constructor.
199 */
200 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
201 Transliterator(o),
202 target(o.target),
203 targetScript(o.targetScript)
204 {
205 // Don't copy the cache contents
206 UErrorCode ec = U_ZERO_ERROR;
207 cache = uhash_open(uhash_hashLong, uhash_compareLong, &ec);
208 uhash_setValueDeleter(cache, _deleteTransliterator);
209 }
210
211 /**
212 * Transliterator API.
213 */
214 Transliterator* AnyTransliterator::clone() const {
215 return new AnyTransliterator(*this);
216 }
217
218 /**
219 * Implements {@link Transliterator#handleTransliterate}.
220 */
221 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
222 UBool isIncremental) const {
223 int32_t allStart = pos.start;
224 int32_t allLimit = pos.limit;
225
226 ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
227
228 while (it.next()) {
229 // Ignore runs in the ante context
230 if (it.limit <= allStart) continue;
231
232 // Try to instantiate transliterator from it.scriptCode to
233 // our target or target/variant
234 Transliterator* t = getTransliterator(it.scriptCode);
235
236 if (t == NULL) {
237 // We have no transliterator. Do nothing, but keep
238 // pos.start up to date.
239 pos.start = it.limit;
240 continue;
241 }
242
243 // If the run end is before the transliteration limit, do
244 // a non-incremental transliteration. Otherwise do an
245 // incremental one.
246 UBool incremental = isIncremental && (it.limit >= allLimit);
247
248 pos.start = uprv_max(allStart, it.start);
249 pos.limit = uprv_min(allLimit, it.limit);
250 int32_t limit = pos.limit;
251 t->filteredTransliterate(text, pos, incremental);
252 int32_t delta = pos.limit - limit;
253 allLimit += delta;
254 it.adjustLimit(delta);
255
256 // We're done if we enter the post context
257 if (it.limit >= allLimit) break;
258 }
259
260 // Restore limit. pos.start is fine where the last transliterator
261 // left it, or at the end of the last run.
262 pos.limit = allLimit;
263 }
264
265 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
266
267 if (source == targetScript || source == USCRIPT_INVALID_CODE) {
268 return NULL;
269 }
270
271 Transliterator* t = (Transliterator*) uhash_iget(cache, (int32_t) source);
272 if (t == NULL) {
273 UErrorCode ec = U_ZERO_ERROR;
274 UnicodeString sourceName(uscript_getName(source), "");
275 UnicodeString id(sourceName);
276 id.append(TARGET_SEP).append(target);
277
278 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
279 if (U_FAILURE(ec) || t == NULL) {
280 delete t;
281
282 // Try to pivot around Latin, our most common script
283 id = sourceName;
284 id.append(LATIN_PIVOT).append(target);
285 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
286 if (U_FAILURE(ec) || t == NULL) {
287 delete t;
288 t = NULL;
289 }
290 }
291
292 if (t != NULL) {
293 uhash_iput(cache, (int32_t) source, t, &ec);
294 }
295 }
296
297 return t;
298 }
299
300 /**
301 * Return the script code for a given name, or -1 if not found.
302 */
303 UScriptCode AnyTransliterator::scriptNameToCode(const UnicodeString& name) {
304 char buf[128];
305 UScriptCode code;
306 UErrorCode ec = U_ZERO_ERROR;
307
308 name.extract(0, 128, buf, 128, "");
309 if (uscript_getCode(buf, &code, 1, &ec) != 1 ||
310 U_FAILURE(ec)) {
311 code = USCRIPT_INVALID_CODE;
312 }
313 return code;
314 }
315
316 /**
317 * Registers standard transliterators with the system. Called by
318 * Transliterator during initialization. Scan all current targets and
319 * register those that are scripts T as Any-T/V.
320 */
321 void AnyTransliterator::registerIDs() {
322
323 UErrorCode ec = U_ZERO_ERROR;
324 Hashtable seen(TRUE, ec);
325
326 int32_t sourceCount = Transliterator::_countAvailableSources();
327 for (int32_t s=0; s<sourceCount; ++s) {
328 UnicodeString source;
329 Transliterator::_getAvailableSource(s, source);
330
331 // Ignore the "Any" source
332 if (source.caseCompare(ANY, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
333
334 int32_t targetCount = Transliterator::_countAvailableTargets(source);
335 for (int32_t t=0; t<targetCount; ++t) {
336 UnicodeString target;
337 Transliterator::_getAvailableTarget(t, source, target);
338
339 // Only process each target once
340 if (seen.geti(target) != 0) continue;
341 ec = U_ZERO_ERROR;
342 seen.puti(target, 1, ec);
343
344 // Get the script code for the target. If not a script, ignore.
345 UScriptCode targetScript = scriptNameToCode(target);
346 if (targetScript == USCRIPT_INVALID_CODE) continue;
347
348 int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
349 // assert(variantCount >= 1);
350 for (int32_t v=0; v<variantCount; ++v) {
351 UnicodeString variant;
352 Transliterator::_getAvailableVariant(v, source, target, variant);
353
354 UnicodeString id;
355 TransliteratorIDParser::STVtoID(ANY, target, variant, id);
356 ec = U_ZERO_ERROR;
357 AnyTransliterator* t = new AnyTransliterator(id, target, variant,
358 targetScript, ec);
359 if (U_FAILURE(ec)) {
360 delete t;
361 } else {
362 Transliterator::_registerInstance(t);
363 Transliterator::_registerSpecialInverse(target, NULL_ID, FALSE);
364 }
365 }
366 }
367 }
368 }
369
370 U_NAMESPACE_END
371
372 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
373
374 //eof