]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/unesctrn.cpp
ICU-400.38.tar.gz
[apple/icu.git] / icuSources / i18n / unesctrn.cpp
CommitLineData
b75a7d8f 1/*
374ca955 2 **********************************************************************
46f4442e 3 * Copyright (c) 2001-2008, International Business Machines
374ca955
A
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 11/19/2001 aliu Creation.
8 **********************************************************************
9 */
b75a7d8f
A
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "unicode/uchar.h"
16#include "unesctrn.h"
17#include "util.h"
18
19#include "cmemory.h"
20
21U_NAMESPACE_BEGIN
22
23/**
24 * Special character marking the end of the spec[] array.
25 */
26static const UChar END = 0xFFFF;
27
28// Unicode: "U+10FFFF" hex, min=4, max=6
29static const UChar SPEC_Unicode[] = {
30 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
31 END
32};
33
34// Java: "\\uFFFF" hex, min=4, max=4
35static const UChar SPEC_Java[] = {
36 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
37 END
38};
39
40// C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
41static const UChar SPEC_C[] = {
42 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
43 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
44 END
45};
46
47// XML: "" hex, min=1, max=6
48static const UChar SPEC_XML[] = {
49 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
50 END
51};
52
53// XML10: "" dec, min=1, max=7 (not really "Hex-Any")
54static const UChar SPEC_XML10[] = {
55 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
56 END
57};
58
59// Perl: "\\x{263A}" hex, min=1, max=6
60static const UChar SPEC_Perl[] = {
61 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
62 END
63};
64
65// All: Java, C, Perl, XML, XML10, Unicode
66static const UChar SPEC_Any[] = {
67 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, // Unicode
68 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, // Java
69 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, // C (surrogates)
70 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, // XML
71 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, // XML10
72 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
73 END
74};
75
374ca955 76UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
b75a7d8f 77
73c04bcf
A
78static UChar* copySpec(const UChar* spec) {
79 int32_t len = 0;
80 while (spec[len] != END) {
81 ++len;
82 }
83 ++len;
84 UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
46f4442e
A
85 // Check for memory allocation error.
86 if (result != NULL) {
87 uprv_memcpy(result, spec, len*sizeof(result[0]));
88 }
73c04bcf
A
89 return result;
90}
91
b75a7d8f
A
92/**
93 * Factory methods. Ignore the context.
94 */
73c04bcf 95static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
b75a7d8f
A
96 return new UnescapeTransliterator(ID, SPEC_Unicode);
97}
73c04bcf 98static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
b75a7d8f
A
99 return new UnescapeTransliterator(ID, SPEC_Java);
100}
73c04bcf 101static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
b75a7d8f
A
102 return new UnescapeTransliterator(ID, SPEC_C);
103}
73c04bcf 104static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
b75a7d8f
A
105 return new UnescapeTransliterator(ID, SPEC_XML);
106}
73c04bcf 107static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
b75a7d8f
A
108 return new UnescapeTransliterator(ID, SPEC_XML10);
109}
73c04bcf 110static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
b75a7d8f
A
111 return new UnescapeTransliterator(ID, SPEC_Perl);
112}
73c04bcf 113static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
b75a7d8f
A
114 return new UnescapeTransliterator(ID, SPEC_Any);
115}
116
117/**
118 * Registers standard variants with the system. Called by
119 * Transliterator during initialization.
120 */
121void UnescapeTransliterator::registerIDs() {
122 Token t = integerToken(0);
123
374ca955 124 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
b75a7d8f 125
374ca955 126 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
b75a7d8f 127
374ca955 128 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
b75a7d8f 129
374ca955 130 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
b75a7d8f 131
374ca955 132 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
b75a7d8f 133
374ca955 134 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
b75a7d8f 135
374ca955 136 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
b75a7d8f
A
137}
138
139/**
140 * Constructor. Takes the encoded spec array.
141 */
142UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
143 const UChar *newSpec) :
144 Transliterator(newID, NULL)
145{
146 this->spec = copySpec(newSpec);
147}
148
149/**
150 * Copy constructor.
151 */
152UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
153 Transliterator(o) {
154 this->spec = copySpec(o.spec);
155}
156
157UnescapeTransliterator::~UnescapeTransliterator() {
158 uprv_free(spec);
159}
160
161/**
162 * Transliterator API.
163 */
164Transliterator* UnescapeTransliterator::clone() const {
165 return new UnescapeTransliterator(*this);
166}
167
b75a7d8f
A
168/**
169 * Implements {@link Transliterator#handleTransliterate}.
170 */
171void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
172 UBool isIncremental) const {
173 int32_t start = pos.start;
174 int32_t limit = pos.limit;
175 int32_t i, j, ipat;
176
177 while (start < limit) {
178 // Loop over the forms in spec[]. Exit this loop when we
179 // match one of the specs. Exit the outer loop if a
180 // partial match is detected and isIncremental is true.
181 for (j=0, ipat=0; spec[ipat] != END; ++j) {
182
183 // Read the header
184 int32_t prefixLen = spec[ipat++];
185 int32_t suffixLen = spec[ipat++];
186 int8_t radix = (int8_t) spec[ipat++];
187 int32_t minDigits = spec[ipat++];
188 int32_t maxDigits = spec[ipat++];
189
190 // s is a copy of start that is advanced over the
191 // characters as we parse them.
192 int32_t s = start;
193 UBool match = TRUE;
194
195 for (i=0; i<prefixLen; ++i) {
196 if (s >= limit) {
197 if (i > 0) {
198 // We've already matched a character. This is
199 // a partial match, so we return if in
200 // incremental mode. In non-incremental mode,
201 // go to the next spec.
202 if (isIncremental) {
203 goto exit;
204 }
205 match = FALSE;
206 break;
207 }
208 }
209 UChar c = text.charAt(s++);
210 if (c != spec[ipat + i]) {
211 match = FALSE;
212 break;
213 }
214 }
215
216 if (match) {
217 UChar32 u = 0;
218 int32_t digitCount = 0;
219 for (;;) {
220 if (s >= limit) {
221 // Check for partial match in incremental mode.
222 if (s > start && isIncremental) {
223 goto exit;
224 }
225 break;
226 }
227 UChar32 ch = text.char32At(s);
228 int32_t digit = u_digit(ch, radix);
229 if (digit < 0) {
230 break;
231 }
232 s += UTF_CHAR_LENGTH(ch);
233 u = (u * radix) + digit;
234 if (++digitCount == maxDigits) {
235 break;
236 }
237 }
238
239 match = (digitCount >= minDigits);
240
241 if (match) {
242 for (i=0; i<suffixLen; ++i) {
243 if (s >= limit) {
244 // Check for partial match in incremental mode.
245 if (s > start && isIncremental) {
246 goto exit;
247 }
248 match = FALSE;
249 break;
250 }
251 UChar c = text.charAt(s++);
252 if (c != spec[ipat + prefixLen + i]) {
253 match = FALSE;
254 break;
255 }
256 }
257
258 if (match) {
259 // At this point, we have a match
260 UnicodeString str(u);
261 text.handleReplaceBetween(start, s, str);
262 limit -= s - start - str.length();
263 // The following break statement leaves the
264 // loop that is traversing the forms in
265 // spec[]. We then parse the next input
266 // character.
267 break;
268 }
269 }
270 }
271
272 ipat += prefixLen + suffixLen;
273 }
274
275 if (start < limit) {
276 start += UTF_CHAR_LENGTH(text.char32At(start));
277 }
278 }
279
280 exit:
281 pos.contextLimit += limit - pos.limit;
282 pos.limit = limit;
283 pos.start = start;
284}
285
286U_NAMESPACE_END
287
288#endif /* #if !UCONFIG_NO_TRANSLITERATION */
289
290//eof