]>
git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/unesctrn.cpp
/*
**********************************************************************
*   Copyright (c) 2001-2004, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/19/2001  aliu        Creation.
**********************************************************************
*/
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_TRANSLITERATION
15 #include "unicode/uchar.h"
24 * Special character marking the end of the spec[] array.
26 static const UChar END
= 0xFFFF;
28 // Unicode: "U+10FFFF" hex, min=4, max=6
29 static const UChar SPEC_Unicode
[] = {
30 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
34 // Java: "\\uFFFF" hex, min=4, max=4
35 static const UChar SPEC_Java
[] = {
36 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
40 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
41 static const UChar SPEC_C
[] = {
42 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
43 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
47 // XML: "" hex, min=1, max=6
48 static const UChar SPEC_XML
[] = {
49 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
53 // XML10: "" dec, min=1, max=7 (not really "Hex-Any")
54 static const UChar SPEC_XML10
[] = {
55 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
59 // Perl: "\\x{263A}" hex, min=1, max=6
60 static const UChar SPEC_Perl
[] = {
61 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
65 // All: Java, C, Perl, XML, XML10, Unicode
66 static const UChar SPEC_Any
[] = {
67 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, // Unicode
68 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, // Java
69 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, // C (surrogates)
70 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, // XML
71 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, // XML10
72 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
76 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator
)
79 * Factory methods. Ignore the context.
81 Transliterator
* UnescapeTransliterator::_createUnicode(const UnicodeString
& ID
, Token
/*context*/) {
82 return new UnescapeTransliterator(ID
, SPEC_Unicode
);
84 Transliterator
* UnescapeTransliterator::_createJava(const UnicodeString
& ID
, Token
/*context*/) {
85 return new UnescapeTransliterator(ID
, SPEC_Java
);
87 Transliterator
* UnescapeTransliterator::_createC(const UnicodeString
& ID
, Token
/*context*/) {
88 return new UnescapeTransliterator(ID
, SPEC_C
);
90 Transliterator
* UnescapeTransliterator::_createXML(const UnicodeString
& ID
, Token
/*context*/) {
91 return new UnescapeTransliterator(ID
, SPEC_XML
);
93 Transliterator
* UnescapeTransliterator::_createXML10(const UnicodeString
& ID
, Token
/*context*/) {
94 return new UnescapeTransliterator(ID
, SPEC_XML10
);
96 Transliterator
* UnescapeTransliterator::_createPerl(const UnicodeString
& ID
, Token
/*context*/) {
97 return new UnescapeTransliterator(ID
, SPEC_Perl
);
99 Transliterator
* UnescapeTransliterator::_createAny(const UnicodeString
& ID
, Token
/*context*/) {
100 return new UnescapeTransliterator(ID
, SPEC_Any
);
104 * Registers standard variants with the system. Called by
105 * Transliterator during initialization.
107 void UnescapeTransliterator::registerIDs() {
108 Token t
= integerToken(0);
110 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode
, t
);
112 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava
, t
);
114 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC
, t
);
116 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML
, t
);
118 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10
, t
);
120 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl
, t
);
122 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny
, t
);
126 * Constructor. Takes the encoded spec array.
128 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString
& newID
,
129 const UChar
*newSpec
) :
130 Transliterator(newID
, NULL
)
132 this->spec
= copySpec(newSpec
);
138 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator
& o
) :
140 this->spec
= copySpec(o
.spec
);
143 UnescapeTransliterator::~UnescapeTransliterator() {
148 * Transliterator API.
150 Transliterator
* UnescapeTransliterator::clone() const {
151 return new UnescapeTransliterator(*this);
154 UChar
* UnescapeTransliterator::copySpec(const UChar
* spec
) {
156 while (spec
[len
] != END
) {
160 UChar
*result
= (UChar
*)uprv_malloc(len
*sizeof(UChar
));
161 uprv_memcpy(result
, spec
, len
*sizeof(result
[0]));
// Walks the text between pos.start and pos.limit.  At each position it
// tries every record in spec[] (prefix, digits, suffix); when a record
// matches, the matched range is replaced by the string built from the
// parsed value u.  Afterwards pos.contextLimit is adjusted by the
// change in limit.
// NOTE(review): this listing has gaps -- the declarations of i, j,
// ipat, s, u and match, the statements that clear match or leave the
// loops, and the final updates of pos are not visible here.
166 * Implements {@link Transliterator#handleTransliterate}.
168 void UnescapeTransliterator::handleTransliterate(Replaceable
& text
, UTransPosition
& pos
,
169 UBool isIncremental
) const {
// Local working copies of the range; limit is adjusted after each
// replacement below.
170 int32_t start
= pos
.start
;
171 int32_t limit
= pos
.limit
;
174 while (start
< limit
) {
175 // Loop over the forms in spec[]. Exit this loop when we
176 // match one of the specs. Exit the outer loop if a
177 // partial match is detected and isIncremental is true.
178 for (j
=0, ipat
=0; spec
[ipat
] != END
; ++j
) {
// Read the five header fields of the current record.  After these
// reads ipat indexes the first prefix code unit.
181 int32_t prefixLen
= spec
[ipat
++];
182 int32_t suffixLen
= spec
[ipat
++];
183 int8_t radix
= (int8_t) spec
[ipat
++];
184 int32_t minDigits
= spec
[ipat
++];
185 int32_t maxDigits
= spec
[ipat
++];
187 // s is a copy of start that is advanced over the
188 // characters as we parse them.
// Prefix: compare successive text code units with spec[ipat + i].
192 for (i
=0; i
<prefixLen
; ++i
) {
195 // We've already matched a character. This is
196 // a partial match, so we return if in
197 // incremental mode. In non-incremental mode,
198 // go to the next spec.
206 UChar c
= text
.charAt(s
++);
207 if (c
!= spec
[ipat
+ i
]) {
// Digits: read code points at s, convert each with u_digit() in the
// record's radix, and fold it into u.
215 int32_t digitCount
= 0;
218 // Check for partial match in incremental mode.
219 if (s
> start
&& isIncremental
) {
224 UChar32 ch
= text
.char32At(s
);
225 int32_t digit
= u_digit(ch
, radix
);
229 s
+= UTF_CHAR_LENGTH(ch
);
// NOTE(review): no upper-bound check on u is visible here; the
// 8-digit form in SPEC_C allows values above 0x10FFFF -- confirm how
// out-of-range values are handled.
230 u
= (u
* radix
) + digit
;
// Stop consuming digits once the record's maximum has been read.
231 if (++digitCount
== maxDigits
) {
// The digit run counts as a match only if it reached minDigits.
236 match
= (digitCount
>= minDigits
);
// Suffix: its code units are stored after the prefix, hence the index
// ipat + prefixLen + i.
239 for (i
=0; i
<suffixLen
; ++i
) {
241 // Check for partial match in incremental mode.
242 if (s
> start
&& isIncremental
) {
248 UChar c
= text
.charAt(s
++);
249 if (c
!= spec
[ipat
+ prefixLen
+ i
]) {
256 // At this point, we have a match
// Build the replacement string from u and substitute it for the text
// from start to s.
257 UnicodeString
str(u
);
258 text
.handleReplaceBetween(start
, s
, str
);
// Shift limit by the difference between the matched length (s - start)
// and the replacement length.
259 limit
-= s
- start
- str
.length();
260 // The following break statement leaves the
261 // loop that is traversing the forms in
262 // spec[]. We then parse the next input
// Skip this record's prefix and suffix code units so that ipat indexes
// the header of the next record (or END).
269 ipat
+= prefixLen
+ suffixLen
;
// Advance start by UTF_CHAR_LENGTH of the code point found at start.
273 start
+= UTF_CHAR_LENGTH(text
.char32At(start
));
// Apply the net change in limit to the context limit.
278 pos
.contextLimit
+= limit
- pos
.limit
;
285 #endif /* #if !UCONFIG_NO_TRANSLITERATION */