]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/unesctrn.cpp
ICU-8.11.1.tar.gz
[apple/icu.git] / icuSources / i18n / unesctrn.cpp
1 /*
2 **********************************************************************
3 * Copyright (c) 2001-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 11/19/2001 aliu Creation.
8 **********************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_TRANSLITERATION
14
15 #include "unicode/uchar.h"
16 #include "unesctrn.h"
17 #include "util.h"
18
19 #include "cmemory.h"
20
21 U_NAMESPACE_BEGIN
22
23 /**
24 * Special character marking the end of the spec[] array.
25 */
26 static const UChar END = 0xFFFF;
27
28 // Unicode: "U+10FFFF" hex, min=4, max=6
29 static const UChar SPEC_Unicode[] = {
30 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
31 END
32 };
33
34 // Java: "\\uFFFF" hex, min=4, max=4
35 static const UChar SPEC_Java[] = {
36 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
37 END
38 };
39
40 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
41 static const UChar SPEC_C[] = {
42 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
43 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
44 END
45 };
46
47 // XML: "" hex, min=1, max=6
48 static const UChar SPEC_XML[] = {
49 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
50 END
51 };
52
53 // XML10: "" dec, min=1, max=7 (not really "Hex-Any")
54 static const UChar SPEC_XML10[] = {
55 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
56 END
57 };
58
59 // Perl: "\\x{263A}" hex, min=1, max=6
60 static const UChar SPEC_Perl[] = {
61 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
62 END
63 };
64
65 // All: Java, C, Perl, XML, XML10, Unicode
66 static const UChar SPEC_Any[] = {
67 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, // Unicode
68 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, // Java
69 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, // C (surrogates)
70 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, // XML
71 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, // XML10
72 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
73 END
74 };
75
76 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
77
78 static UChar* copySpec(const UChar* spec) {
79 int32_t len = 0;
80 while (spec[len] != END) {
81 ++len;
82 }
83 ++len;
84 UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
85 uprv_memcpy(result, spec, len*sizeof(result[0]));
86 return result;
87 }
88
89 /**
90 * Factory methods. Ignore the context.
91 */
92 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
93 return new UnescapeTransliterator(ID, SPEC_Unicode);
94 }
95 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
96 return new UnescapeTransliterator(ID, SPEC_Java);
97 }
98 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
99 return new UnescapeTransliterator(ID, SPEC_C);
100 }
101 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
102 return new UnescapeTransliterator(ID, SPEC_XML);
103 }
104 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
105 return new UnescapeTransliterator(ID, SPEC_XML10);
106 }
107 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
108 return new UnescapeTransliterator(ID, SPEC_Perl);
109 }
110 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
111 return new UnescapeTransliterator(ID, SPEC_Any);
112 }
113
114 /**
115 * Registers standard variants with the system. Called by
116 * Transliterator during initialization.
117 */
118 void UnescapeTransliterator::registerIDs() {
119 Token t = integerToken(0);
120
121 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
122
123 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
124
125 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
126
127 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
128
129 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
130
131 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
132
133 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
134 }
135
136 /**
137 * Constructor. Takes the encoded spec array.
138 */
139 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
140 const UChar *newSpec) :
141 Transliterator(newID, NULL)
142 {
143 this->spec = copySpec(newSpec);
144 }
145
146 /**
147 * Copy constructor.
148 */
149 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
150 Transliterator(o) {
151 this->spec = copySpec(o.spec);
152 }
153
154 UnescapeTransliterator::~UnescapeTransliterator() {
155 uprv_free(spec);
156 }
157
158 /**
159 * Transliterator API.
160 */
161 Transliterator* UnescapeTransliterator::clone() const {
162 return new UnescapeTransliterator(*this);
163 }
164
165 /**
166 * Implements {@link Transliterator#handleTransliterate}.
167 */
168 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
169 UBool isIncremental) const {
170 int32_t start = pos.start;
171 int32_t limit = pos.limit;
172 int32_t i, j, ipat;
173
174 while (start < limit) {
175 // Loop over the forms in spec[]. Exit this loop when we
176 // match one of the specs. Exit the outer loop if a
177 // partial match is detected and isIncremental is true.
178 for (j=0, ipat=0; spec[ipat] != END; ++j) {
179
180 // Read the header
181 int32_t prefixLen = spec[ipat++];
182 int32_t suffixLen = spec[ipat++];
183 int8_t radix = (int8_t) spec[ipat++];
184 int32_t minDigits = spec[ipat++];
185 int32_t maxDigits = spec[ipat++];
186
187 // s is a copy of start that is advanced over the
188 // characters as we parse them.
189 int32_t s = start;
190 UBool match = TRUE;
191
192 for (i=0; i<prefixLen; ++i) {
193 if (s >= limit) {
194 if (i > 0) {
195 // We've already matched a character. This is
196 // a partial match, so we return if in
197 // incremental mode. In non-incremental mode,
198 // go to the next spec.
199 if (isIncremental) {
200 goto exit;
201 }
202 match = FALSE;
203 break;
204 }
205 }
206 UChar c = text.charAt(s++);
207 if (c != spec[ipat + i]) {
208 match = FALSE;
209 break;
210 }
211 }
212
213 if (match) {
214 UChar32 u = 0;
215 int32_t digitCount = 0;
216 for (;;) {
217 if (s >= limit) {
218 // Check for partial match in incremental mode.
219 if (s > start && isIncremental) {
220 goto exit;
221 }
222 break;
223 }
224 UChar32 ch = text.char32At(s);
225 int32_t digit = u_digit(ch, radix);
226 if (digit < 0) {
227 break;
228 }
229 s += UTF_CHAR_LENGTH(ch);
230 u = (u * radix) + digit;
231 if (++digitCount == maxDigits) {
232 break;
233 }
234 }
235
236 match = (digitCount >= minDigits);
237
238 if (match) {
239 for (i=0; i<suffixLen; ++i) {
240 if (s >= limit) {
241 // Check for partial match in incremental mode.
242 if (s > start && isIncremental) {
243 goto exit;
244 }
245 match = FALSE;
246 break;
247 }
248 UChar c = text.charAt(s++);
249 if (c != spec[ipat + prefixLen + i]) {
250 match = FALSE;
251 break;
252 }
253 }
254
255 if (match) {
256 // At this point, we have a match
257 UnicodeString str(u);
258 text.handleReplaceBetween(start, s, str);
259 limit -= s - start - str.length();
260 // The following break statement leaves the
261 // loop that is traversing the forms in
262 // spec[]. We then parse the next input
263 // character.
264 break;
265 }
266 }
267 }
268
269 ipat += prefixLen + suffixLen;
270 }
271
272 if (start < limit) {
273 start += UTF_CHAR_LENGTH(text.char32At(start));
274 }
275 }
276
277 exit:
278 pos.contextLimit += limit - pos.limit;
279 pos.limit = limit;
280 pos.start = start;
281 }
282
283 U_NAMESPACE_END
284
285 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
286
287 //eof