]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/unesctrn.cpp
ICU-511.35.tar.gz
[apple/icu.git] / icuSources / i18n / unesctrn.cpp
1 /*
2 **********************************************************************
3 * Copyright (c) 2001-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 11/19/2001 aliu Creation.
8 **********************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_TRANSLITERATION
14
15 #include "unicode/uchar.h"
16 #include "unicode/utf16.h"
17 #include "unesctrn.h"
18 #include "util.h"
19
20 #include "cmemory.h"
21
22 U_NAMESPACE_BEGIN
23
24 /**
25 * Special character marking the end of the spec[] array.
26 */
27 static const UChar END = 0xFFFF;
28
29 // Unicode: "U+10FFFF" hex, min=4, max=6
30 static const UChar SPEC_Unicode[] = {
31 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
32 END
33 };
34
35 // Java: "\\uFFFF" hex, min=4, max=4
36 static const UChar SPEC_Java[] = {
37 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
38 END
39 };
40
41 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
42 static const UChar SPEC_C[] = {
43 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
44 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
45 END
46 };
47
48 // XML: "" hex, min=1, max=6
49 static const UChar SPEC_XML[] = {
50 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
51 END
52 };
53
54 // XML10: "" dec, min=1, max=7 (not really "Hex-Any")
55 static const UChar SPEC_XML10[] = {
56 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
57 END
58 };
59
60 // Perl: "\\x{263A}" hex, min=1, max=6
61 static const UChar SPEC_Perl[] = {
62 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
63 END
64 };
65
66 // All: Java, C, Perl, XML, XML10, Unicode
67 static const UChar SPEC_Any[] = {
68 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, // Unicode
69 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, // Java
70 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, // C (surrogates)
71 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, // XML
72 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, // XML10
73 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
74 END
75 };
76
// Expands to the ICU run-time type identification boilerplate for
// UnescapeTransliterator (presumably the getStaticClassID() and
// getDynamicClassID() definitions -- confirm against unicode/uobject.h).
77 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
78
79 static UChar* copySpec(const UChar* spec) {
80 int32_t len = 0;
81 while (spec[len] != END) {
82 ++len;
83 }
84 ++len;
85 UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
86 // Check for memory allocation error.
87 if (result != NULL) {
88 uprv_memcpy(result, spec, len*sizeof(result[0]));
89 }
90 return result;
91 }
92
93 /**
94 * Factory methods. Ignore the context.
95 */
96 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
97 return new UnescapeTransliterator(ID, SPEC_Unicode);
98 }
99 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
100 return new UnescapeTransliterator(ID, SPEC_Java);
101 }
102 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
103 return new UnescapeTransliterator(ID, SPEC_C);
104 }
105 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
106 return new UnescapeTransliterator(ID, SPEC_XML);
107 }
108 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
109 return new UnescapeTransliterator(ID, SPEC_XML10);
110 }
111 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
112 return new UnescapeTransliterator(ID, SPEC_Perl);
113 }
114 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
115 return new UnescapeTransliterator(ID, SPEC_Any);
116 }
117
118 /**
119 * Registers standard variants with the system. Called by
120 * Transliterator during initialization.
121 */
122 void UnescapeTransliterator::registerIDs() {
123 Token t = integerToken(0);
124
125 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
126
127 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
128
129 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
130
131 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
132
133 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
134
135 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
136
137 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
138 }
139
140 /**
141 * Constructor. Takes the encoded spec array.
142 */
143 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
144 const UChar *newSpec) :
145 Transliterator(newID, NULL)
146 {
147 this->spec = copySpec(newSpec);
148 }
149
150 /**
151 * Copy constructor.
152 */
153 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
154 Transliterator(o) {
155 this->spec = copySpec(o.spec);
156 }
157
158 UnescapeTransliterator::~UnescapeTransliterator() {
159 uprv_free(spec);
160 }
161
162 /**
163 * Transliterator API.
164 */
165 Transliterator* UnescapeTransliterator::clone() const {
166 return new UnescapeTransliterator(*this);
167 }
168
169 /**
170 * Implements {@link Transliterator#handleTransliterate}.
171 */
172 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
173 UBool isIncremental) const {
174 int32_t start = pos.start;
175 int32_t limit = pos.limit;
176 int32_t i, j, ipat;
177
178 while (start < limit) {
179 // Loop over the forms in spec[]. Exit this loop when we
180 // match one of the specs. Exit the outer loop if a
181 // partial match is detected and isIncremental is true.
182 for (j=0, ipat=0; spec[ipat] != END; ++j) {
183
184 // Read the header
185 int32_t prefixLen = spec[ipat++];
186 int32_t suffixLen = spec[ipat++];
187 int8_t radix = (int8_t) spec[ipat++];
188 int32_t minDigits = spec[ipat++];
189 int32_t maxDigits = spec[ipat++];
190
191 // s is a copy of start that is advanced over the
192 // characters as we parse them.
193 int32_t s = start;
194 UBool match = TRUE;
195
196 for (i=0; i<prefixLen; ++i) {
197 if (s >= limit) {
198 if (i > 0) {
199 // We've already matched a character. This is
200 // a partial match, so we return if in
201 // incremental mode. In non-incremental mode,
202 // go to the next spec.
203 if (isIncremental) {
204 goto exit;
205 }
206 match = FALSE;
207 break;
208 }
209 }
210 UChar c = text.charAt(s++);
211 if (c != spec[ipat + i]) {
212 match = FALSE;
213 break;
214 }
215 }
216
217 if (match) {
218 UChar32 u = 0;
219 int32_t digitCount = 0;
220 for (;;) {
221 if (s >= limit) {
222 // Check for partial match in incremental mode.
223 if (s > start && isIncremental) {
224 goto exit;
225 }
226 break;
227 }
228 UChar32 ch = text.char32At(s);
229 int32_t digit = u_digit(ch, radix);
230 if (digit < 0) {
231 break;
232 }
233 s += U16_LENGTH(ch);
234 u = (u * radix) + digit;
235 if (++digitCount == maxDigits) {
236 break;
237 }
238 }
239
240 match = (digitCount >= minDigits);
241
242 if (match) {
243 for (i=0; i<suffixLen; ++i) {
244 if (s >= limit) {
245 // Check for partial match in incremental mode.
246 if (s > start && isIncremental) {
247 goto exit;
248 }
249 match = FALSE;
250 break;
251 }
252 UChar c = text.charAt(s++);
253 if (c != spec[ipat + prefixLen + i]) {
254 match = FALSE;
255 break;
256 }
257 }
258
259 if (match) {
260 // At this point, we have a match
261 UnicodeString str(u);
262 text.handleReplaceBetween(start, s, str);
263 limit -= s - start - str.length();
264 // The following break statement leaves the
265 // loop that is traversing the forms in
266 // spec[]. We then parse the next input
267 // character.
268 break;
269 }
270 }
271 }
272
273 ipat += prefixLen + suffixLen;
274 }
275
276 if (start < limit) {
277 start += U16_LENGTH(text.char32At(start));
278 }
279 }
280
281 exit:
282 pos.contextLimit += limit - pos.limit;
283 pos.limit = limit;
284 pos.start = start;
285 }
286
287 U_NAMESPACE_END
288
289 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
290
291 //eof