]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/i18n/unesctrn.cpp
ICU-6.2.14.tar.gz
[apple/icu.git] / icuSources / i18n / unesctrn.cpp
... / ...
CommitLineData
1/*
2 **********************************************************************
3 * Copyright (c) 2001-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 11/19/2001 aliu Creation.
8 **********************************************************************
9 */
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "unicode/uchar.h"
16#include "unesctrn.h"
17#include "util.h"
18
19#include "cmemory.h"
20
21U_NAMESPACE_BEGIN
22
23/**
24 * Special character marking the end of the spec[] array.
25 */
26static const UChar END = 0xFFFF;
27
28// Unicode: "U+10FFFF" hex, min=4, max=6
29static const UChar SPEC_Unicode[] = {
30 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
31 END
32};
33
34// Java: "\\uFFFF" hex, min=4, max=4
35static const UChar SPEC_Java[] = {
36 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
37 END
38};
39
40// C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
41static const UChar SPEC_C[] = {
42 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
43 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
44 END
45};
46
47// XML: "" hex, min=1, max=6
48static const UChar SPEC_XML[] = {
49 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
50 END
51};
52
53// XML10: "" dec, min=1, max=7 (not really "Hex-Any")
54static const UChar SPEC_XML10[] = {
55 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
56 END
57};
58
59// Perl: "\\x{263A}" hex, min=1, max=6
60static const UChar SPEC_Perl[] = {
61 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
62 END
63};
64
65// All: Java, C, Perl, XML, XML10, Unicode
66static const UChar SPEC_Any[] = {
67 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, // Unicode
68 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, // Java
69 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, // C (surrogates)
70 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, // XML
71 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, // XML10
72 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
73 END
74};
75
76UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
77
78/**
79 * Factory methods. Ignore the context.
80 */
81Transliterator* UnescapeTransliterator::_createUnicode(const UnicodeString& ID, Token /*context*/) {
82 return new UnescapeTransliterator(ID, SPEC_Unicode);
83}
84Transliterator* UnescapeTransliterator::_createJava(const UnicodeString& ID, Token /*context*/) {
85 return new UnescapeTransliterator(ID, SPEC_Java);
86}
87Transliterator* UnescapeTransliterator::_createC(const UnicodeString& ID, Token /*context*/) {
88 return new UnescapeTransliterator(ID, SPEC_C);
89}
90Transliterator* UnescapeTransliterator::_createXML(const UnicodeString& ID, Token /*context*/) {
91 return new UnescapeTransliterator(ID, SPEC_XML);
92}
93Transliterator* UnescapeTransliterator::_createXML10(const UnicodeString& ID, Token /*context*/) {
94 return new UnescapeTransliterator(ID, SPEC_XML10);
95}
96Transliterator* UnescapeTransliterator::_createPerl(const UnicodeString& ID, Token /*context*/) {
97 return new UnescapeTransliterator(ID, SPEC_Perl);
98}
99Transliterator* UnescapeTransliterator::_createAny(const UnicodeString& ID, Token /*context*/) {
100 return new UnescapeTransliterator(ID, SPEC_Any);
101}
102
103/**
104 * Registers standard variants with the system. Called by
105 * Transliterator during initialization.
106 */
107void UnescapeTransliterator::registerIDs() {
108 Token t = integerToken(0);
109
110 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
111
112 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
113
114 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
115
116 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
117
118 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
119
120 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
121
122 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
123}
124
125/**
126 * Constructor. Takes the encoded spec array.
127 */
128UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
129 const UChar *newSpec) :
130 Transliterator(newID, NULL)
131{
132 this->spec = copySpec(newSpec);
133}
134
135/**
136 * Copy constructor.
137 */
138UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
139 Transliterator(o) {
140 this->spec = copySpec(o.spec);
141}
142
143UnescapeTransliterator::~UnescapeTransliterator() {
144 uprv_free(spec);
145}
146
147/**
148 * Transliterator API.
149 */
150Transliterator* UnescapeTransliterator::clone() const {
151 return new UnescapeTransliterator(*this);
152}
153
154UChar* UnescapeTransliterator::copySpec(const UChar* spec) {
155 int32_t len = 0;
156 while (spec[len] != END) {
157 ++len;
158 }
159 ++len;
160 UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
161 uprv_memcpy(result, spec, len*sizeof(result[0]));
162 return result;
163}
164
165/**
166 * Implements {@link Transliterator#handleTransliterate}.
167 */
168void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
169 UBool isIncremental) const {
170 int32_t start = pos.start;
171 int32_t limit = pos.limit;
172 int32_t i, j, ipat;
173
174 while (start < limit) {
175 // Loop over the forms in spec[]. Exit this loop when we
176 // match one of the specs. Exit the outer loop if a
177 // partial match is detected and isIncremental is true.
178 for (j=0, ipat=0; spec[ipat] != END; ++j) {
179
180 // Read the header
181 int32_t prefixLen = spec[ipat++];
182 int32_t suffixLen = spec[ipat++];
183 int8_t radix = (int8_t) spec[ipat++];
184 int32_t minDigits = spec[ipat++];
185 int32_t maxDigits = spec[ipat++];
186
187 // s is a copy of start that is advanced over the
188 // characters as we parse them.
189 int32_t s = start;
190 UBool match = TRUE;
191
192 for (i=0; i<prefixLen; ++i) {
193 if (s >= limit) {
194 if (i > 0) {
195 // We've already matched a character. This is
196 // a partial match, so we return if in
197 // incremental mode. In non-incremental mode,
198 // go to the next spec.
199 if (isIncremental) {
200 goto exit;
201 }
202 match = FALSE;
203 break;
204 }
205 }
206 UChar c = text.charAt(s++);
207 if (c != spec[ipat + i]) {
208 match = FALSE;
209 break;
210 }
211 }
212
213 if (match) {
214 UChar32 u = 0;
215 int32_t digitCount = 0;
216 for (;;) {
217 if (s >= limit) {
218 // Check for partial match in incremental mode.
219 if (s > start && isIncremental) {
220 goto exit;
221 }
222 break;
223 }
224 UChar32 ch = text.char32At(s);
225 int32_t digit = u_digit(ch, radix);
226 if (digit < 0) {
227 break;
228 }
229 s += UTF_CHAR_LENGTH(ch);
230 u = (u * radix) + digit;
231 if (++digitCount == maxDigits) {
232 break;
233 }
234 }
235
236 match = (digitCount >= minDigits);
237
238 if (match) {
239 for (i=0; i<suffixLen; ++i) {
240 if (s >= limit) {
241 // Check for partial match in incremental mode.
242 if (s > start && isIncremental) {
243 goto exit;
244 }
245 match = FALSE;
246 break;
247 }
248 UChar c = text.charAt(s++);
249 if (c != spec[ipat + prefixLen + i]) {
250 match = FALSE;
251 break;
252 }
253 }
254
255 if (match) {
256 // At this point, we have a match
257 UnicodeString str(u);
258 text.handleReplaceBetween(start, s, str);
259 limit -= s - start - str.length();
260 // The following break statement leaves the
261 // loop that is traversing the forms in
262 // spec[]. We then parse the next input
263 // character.
264 break;
265 }
266 }
267 }
268
269 ipat += prefixLen + suffixLen;
270 }
271
272 if (start < limit) {
273 start += UTF_CHAR_LENGTH(text.char32At(start));
274 }
275 }
276
277 exit:
278 pos.contextLimit += limit - pos.limit;
279 pos.limit = limit;
280 pos.start = start;
281}
282
283U_NAMESPACE_END
284
285#endif /* #if !UCONFIG_NO_TRANSLITERATION */
286
287//eof