]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/nortrans.cpp
ICU-511.27.tar.gz
[apple/icu.git] / icuSources / i18n / nortrans.cpp
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
4388f060 3* Copyright (C) 2001-2011, International Business Machines
b75a7d8f
A
4* Corporation and others. All Rights Reserved.
5**********************************************************************
6* Date Name Description
7* 07/03/01 aliu Creation.
8**********************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
729e4ab9 15#include "unicode/normalizer2.h"
4388f060 16#include "unicode/utf16.h"
729e4ab9 17#include "cstring.h"
b75a7d8f 18#include "nortrans.h"
b75a7d8f
A
19
20U_NAMESPACE_BEGIN
21
374ca955 22UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
b75a7d8f 23
729e4ab9
A
24static inline Transliterator::Token cstrToken(const char *s) {
25 return Transliterator::pointerToken((void *)s);
26}
27
b75a7d8f
A
28/**
29 * System registration hook.
30 */
31void NormalizationTransliterator::registerIDs() {
729e4ab9 32 // In the Token, the byte after the NUL is the UNormalization2Mode.
b75a7d8f 33 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
729e4ab9 34 _create, cstrToken("nfc\0\0"));
b75a7d8f 35 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
729e4ab9 36 _create, cstrToken("nfkc\0\0"));
b75a7d8f 37 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
729e4ab9 38 _create, cstrToken("nfc\0\1"));
b75a7d8f 39 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
729e4ab9
A
40 _create, cstrToken("nfkc\0\1"));
41 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
42 _create, cstrToken("nfc\0\2"));
43 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
44 _create, cstrToken("nfc\0\3"));
b75a7d8f
A
45 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
46 UNICODE_STRING_SIMPLE("NFD"), TRUE);
47 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
48 UNICODE_STRING_SIMPLE("NFKD"), TRUE);
729e4ab9
A
49 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
50 UNICODE_STRING_SIMPLE("NFD"), FALSE);
51 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
52 UNICODE_STRING_SIMPLE("FCD"), FALSE);
b75a7d8f
A
53}
54
55/**
56 * Factory methods
57 */
58Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
59 Token context) {
729e4ab9
A
60 const char *name = (const char *)context.pointer;
61 UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
62 UErrorCode errorCode = U_ZERO_ERROR;
63 const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
64 if(U_SUCCESS(errorCode)) {
65 return new NormalizationTransliterator(ID, *norm2);
66 } else {
67 return NULL;
68 }
b75a7d8f
A
69}
70
71/**
72 * Constructs a transliterator.
73 */
729e4ab9
A
74NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
75 const Normalizer2 &norm2) :
76 Transliterator(id, 0), fNorm2(norm2) {}
b75a7d8f
A
77
78/**
79 * Destructor.
80 */
81NormalizationTransliterator::~NormalizationTransliterator() {
82}
83
84/**
85 * Copy constructor.
86 */
87NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
729e4ab9 88 Transliterator(o), fNorm2(o.fNorm2) {}
b75a7d8f
A
89
90/**
91 * Transliterator API.
92 */
93Transliterator* NormalizationTransliterator::clone(void) const {
94 return new NormalizationTransliterator(*this);
95}
96
97/**
98 * Implements {@link Transliterator#handleTransliterate}.
99 */
100void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
101 UBool isIncremental) const {
102 // start and limit of the input range
103 int32_t start = offsets.start;
104 int32_t limit = offsets.limit;
b75a7d8f
A
105 if(start >= limit) {
106 return;
107 }
108
b75a7d8f
A
109 /*
110 * Normalize as short chunks at a time as possible even in
111 * bulk mode, so that styled text is minimally disrupted.
112 * In incremental mode, a chunk that ends with offsets.limit
113 * must not be normalized.
114 *
115 * If it was known that the input text is not styled, then
116 * a bulk mode normalization could look like this:
b75a7d8f 117
729e4ab9
A
118 UnicodeString input, normalized;
119 int32_t length = limit - start;
b75a7d8f
A
120 _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
121 input.releaseBuffer(length);
122
123 UErrorCode status = U_ZERO_ERROR;
729e4ab9 124 fNorm2.normalize(input, normalized, status);
b75a7d8f 125
729e4ab9 126 text.handleReplaceBetween(start, limit, normalized);
b75a7d8f 127
729e4ab9 128 int32_t delta = normalized.length() - length;
b75a7d8f
A
129 offsets.contextLimit += delta;
130 offsets.limit += delta;
131 offsets.start = limit + delta;
132
b75a7d8f 133 */
729e4ab9
A
134 UErrorCode errorCode = U_ZERO_ERROR;
135 UnicodeString segment;
136 UnicodeString normalized;
137 UChar32 c = text.char32At(start);
138 do {
139 int32_t prev = start;
140 // Skip at least one character so we make progress.
141 // c holds the character at start.
142 segment.remove();
143 do {
144 segment.append(c);
145 start += U16_LENGTH(c);
146 } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
147 if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
b75a7d8f
A
148 // stop in incremental mode when we reach the input limit
149 // in case there are additional characters that could change the
150 // normalization result
729e4ab9
A
151 start=prev;
152 break;
b75a7d8f 153 }
729e4ab9
A
154 fNorm2.normalize(segment, normalized, errorCode);
155 if(U_FAILURE(errorCode)) {
156 break;
157 }
158 if(segment != normalized) {
b75a7d8f 159 // replace the input chunk with its normalized form
729e4ab9 160 text.handleReplaceBetween(prev, start, normalized);
b75a7d8f
A
161
162 // update all necessary indexes accordingly
729e4ab9
A
163 int32_t delta = normalized.length() - (start - prev);
164 start += delta;
165 limit += delta;
b75a7d8f 166 }
729e4ab9 167 } while(start < limit);
b75a7d8f
A
168
169 offsets.start = start;
729e4ab9
A
170 offsets.contextLimit += limit - offsets.limit;
171 offsets.limit = limit;
b75a7d8f
A
172}
173
174U_NAMESPACE_END
175
176#endif /* #if !UCONFIG_NO_TRANSLITERATION */