]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/nortrans.cpp
ICU-511.35.tar.gz
[apple/icu.git] / icuSources / i18n / nortrans.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2001-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 07/03/01 aliu Creation.
8 **********************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_TRANSLITERATION
14
15 #include "unicode/normalizer2.h"
16 #include "unicode/utf16.h"
17 #include "cstring.h"
18 #include "nortrans.h"
19
20 U_NAMESPACE_BEGIN
21
22 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
23
24 static inline Transliterator::Token cstrToken(const char *s) {
25 return Transliterator::pointerToken((void *)s);
26 }
27
28 /**
29 * System registration hook.
30 */
31 void NormalizationTransliterator::registerIDs() {
32 // In the Token, the byte after the NUL is the UNormalization2Mode.
33 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
34 _create, cstrToken("nfc\0\0"));
35 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
36 _create, cstrToken("nfkc\0\0"));
37 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
38 _create, cstrToken("nfc\0\1"));
39 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
40 _create, cstrToken("nfkc\0\1"));
41 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
42 _create, cstrToken("nfc\0\2"));
43 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
44 _create, cstrToken("nfc\0\3"));
45 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
46 UNICODE_STRING_SIMPLE("NFD"), TRUE);
47 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
48 UNICODE_STRING_SIMPLE("NFKD"), TRUE);
49 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
50 UNICODE_STRING_SIMPLE("NFD"), FALSE);
51 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
52 UNICODE_STRING_SIMPLE("FCD"), FALSE);
53 }
54
55 /**
56 * Factory methods
57 */
58 Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
59 Token context) {
60 const char *name = (const char *)context.pointer;
61 UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
62 UErrorCode errorCode = U_ZERO_ERROR;
63 const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
64 if(U_SUCCESS(errorCode)) {
65 return new NormalizationTransliterator(ID, *norm2);
66 } else {
67 return NULL;
68 }
69 }
70
71 /**
72 * Constructs a transliterator.
73 */
74 NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
75 const Normalizer2 &norm2) :
76 Transliterator(id, 0), fNorm2(norm2) {}
77
78 /**
79 * Destructor.
80 */
81 NormalizationTransliterator::~NormalizationTransliterator() {
82 }
83
84 /**
85 * Copy constructor.
86 */
87 NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
88 Transliterator(o), fNorm2(o.fNorm2) {}
89
90 /**
91 * Transliterator API.
92 */
93 Transliterator* NormalizationTransliterator::clone(void) const {
94 return new NormalizationTransliterator(*this);
95 }
96
97 /**
98 * Implements {@link Transliterator#handleTransliterate}.
99 */
100 void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
101 UBool isIncremental) const {
102 // start and limit of the input range
103 int32_t start = offsets.start;
104 int32_t limit = offsets.limit;
105 if(start >= limit) {
106 return;
107 }
108
109 /*
110 * Normalize as short chunks at a time as possible even in
111 * bulk mode, so that styled text is minimally disrupted.
112 * In incremental mode, a chunk that ends with offsets.limit
113 * must not be normalized.
114 *
115 * If it was known that the input text is not styled, then
116 * a bulk mode normalization could look like this:
117
118 UnicodeString input, normalized;
119 int32_t length = limit - start;
120 _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
121 input.releaseBuffer(length);
122
123 UErrorCode status = U_ZERO_ERROR;
124 fNorm2.normalize(input, normalized, status);
125
126 text.handleReplaceBetween(start, limit, normalized);
127
128 int32_t delta = normalized.length() - length;
129 offsets.contextLimit += delta;
130 offsets.limit += delta;
131 offsets.start = limit + delta;
132
133 */
134 UErrorCode errorCode = U_ZERO_ERROR;
135 UnicodeString segment;
136 UnicodeString normalized;
137 UChar32 c = text.char32At(start);
138 do {
139 int32_t prev = start;
140 // Skip at least one character so we make progress.
141 // c holds the character at start.
142 segment.remove();
143 do {
144 segment.append(c);
145 start += U16_LENGTH(c);
146 } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
147 if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
148 // stop in incremental mode when we reach the input limit
149 // in case there are additional characters that could change the
150 // normalization result
151 start=prev;
152 break;
153 }
154 fNorm2.normalize(segment, normalized, errorCode);
155 if(U_FAILURE(errorCode)) {
156 break;
157 }
158 if(segment != normalized) {
159 // replace the input chunk with its normalized form
160 text.handleReplaceBetween(prev, start, normalized);
161
162 // update all necessary indexes accordingly
163 int32_t delta = normalized.length() - (start - prev);
164 start += delta;
165 limit += delta;
166 }
167 } while(start < limit);
168
169 offsets.start = start;
170 offsets.contextLimit += limit - offsets.limit;
171 offsets.limit = limit;
172 }
173
174 U_NAMESPACE_END
175
176 #endif /* #if !UCONFIG_NO_TRANSLITERATION */