]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/nortrans.cpp
ICU-62123.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / nortrans.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f
A
3/*
4**********************************************************************
4388f060 5* Copyright (C) 2001-2011, International Business Machines
b75a7d8f
A
6* Corporation and others. All Rights Reserved.
7**********************************************************************
8* Date Name Description
9* 07/03/01 aliu Creation.
10**********************************************************************
11*/
12
13#include "unicode/utypes.h"
14
15#if !UCONFIG_NO_TRANSLITERATION
16
729e4ab9 17#include "unicode/normalizer2.h"
4388f060 18#include "unicode/utf16.h"
729e4ab9 19#include "cstring.h"
b75a7d8f 20#include "nortrans.h"
b75a7d8f
A
21
22U_NAMESPACE_BEGIN
23
374ca955 24UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
b75a7d8f 25
729e4ab9
A
26static inline Transliterator::Token cstrToken(const char *s) {
27 return Transliterator::pointerToken((void *)s);
28}
29
b75a7d8f
A
30/**
31 * System registration hook.
32 */
33void NormalizationTransliterator::registerIDs() {
729e4ab9 34 // In the Token, the byte after the NUL is the UNormalization2Mode.
b75a7d8f 35 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
729e4ab9 36 _create, cstrToken("nfc\0\0"));
b75a7d8f 37 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
729e4ab9 38 _create, cstrToken("nfkc\0\0"));
b75a7d8f 39 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
729e4ab9 40 _create, cstrToken("nfc\0\1"));
b75a7d8f 41 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
729e4ab9
A
42 _create, cstrToken("nfkc\0\1"));
43 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
44 _create, cstrToken("nfc\0\2"));
45 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
46 _create, cstrToken("nfc\0\3"));
b75a7d8f
A
47 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
48 UNICODE_STRING_SIMPLE("NFD"), TRUE);
49 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
50 UNICODE_STRING_SIMPLE("NFKD"), TRUE);
729e4ab9
A
51 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
52 UNICODE_STRING_SIMPLE("NFD"), FALSE);
53 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
54 UNICODE_STRING_SIMPLE("FCD"), FALSE);
b75a7d8f
A
55}
56
57/**
58 * Factory methods
59 */
60Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
61 Token context) {
729e4ab9
A
62 const char *name = (const char *)context.pointer;
63 UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
64 UErrorCode errorCode = U_ZERO_ERROR;
65 const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
66 if(U_SUCCESS(errorCode)) {
67 return new NormalizationTransliterator(ID, *norm2);
68 } else {
69 return NULL;
70 }
b75a7d8f
A
71}
72
73/**
74 * Constructs a transliterator.
75 */
729e4ab9
A
76NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
77 const Normalizer2 &norm2) :
78 Transliterator(id, 0), fNorm2(norm2) {}
b75a7d8f
A
79
80/**
81 * Destructor.
82 */
83NormalizationTransliterator::~NormalizationTransliterator() {
84}
85
86/**
87 * Copy constructor.
88 */
89NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
729e4ab9 90 Transliterator(o), fNorm2(o.fNorm2) {}
b75a7d8f
A
91
92/**
93 * Transliterator API.
94 */
95Transliterator* NormalizationTransliterator::clone(void) const {
96 return new NormalizationTransliterator(*this);
97}
98
99/**
100 * Implements {@link Transliterator#handleTransliterate}.
101 */
102void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
103 UBool isIncremental) const {
104 // start and limit of the input range
105 int32_t start = offsets.start;
106 int32_t limit = offsets.limit;
b75a7d8f
A
107 if(start >= limit) {
108 return;
109 }
110
b75a7d8f
A
111 /*
112 * Normalize as short chunks at a time as possible even in
113 * bulk mode, so that styled text is minimally disrupted.
114 * In incremental mode, a chunk that ends with offsets.limit
115 * must not be normalized.
116 *
117 * If it was known that the input text is not styled, then
118 * a bulk mode normalization could look like this:
b75a7d8f 119
729e4ab9
A
120 UnicodeString input, normalized;
121 int32_t length = limit - start;
b75a7d8f
A
122 _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
123 input.releaseBuffer(length);
124
125 UErrorCode status = U_ZERO_ERROR;
729e4ab9 126 fNorm2.normalize(input, normalized, status);
b75a7d8f 127
729e4ab9 128 text.handleReplaceBetween(start, limit, normalized);
b75a7d8f 129
729e4ab9 130 int32_t delta = normalized.length() - length;
b75a7d8f
A
131 offsets.contextLimit += delta;
132 offsets.limit += delta;
133 offsets.start = limit + delta;
134
b75a7d8f 135 */
729e4ab9
A
136 UErrorCode errorCode = U_ZERO_ERROR;
137 UnicodeString segment;
138 UnicodeString normalized;
139 UChar32 c = text.char32At(start);
140 do {
141 int32_t prev = start;
142 // Skip at least one character so we make progress.
143 // c holds the character at start.
144 segment.remove();
145 do {
146 segment.append(c);
147 start += U16_LENGTH(c);
148 } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
149 if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
b75a7d8f
A
150 // stop in incremental mode when we reach the input limit
151 // in case there are additional characters that could change the
152 // normalization result
729e4ab9
A
153 start=prev;
154 break;
b75a7d8f 155 }
729e4ab9
A
156 fNorm2.normalize(segment, normalized, errorCode);
157 if(U_FAILURE(errorCode)) {
158 break;
159 }
160 if(segment != normalized) {
b75a7d8f 161 // replace the input chunk with its normalized form
729e4ab9 162 text.handleReplaceBetween(prev, start, normalized);
b75a7d8f
A
163
164 // update all necessary indexes accordingly
729e4ab9
A
165 int32_t delta = normalized.length() - (start - prev);
166 start += delta;
167 limit += delta;
b75a7d8f 168 }
729e4ab9 169 } while(start < limit);
b75a7d8f
A
170
171 offsets.start = start;
729e4ab9
A
172 offsets.contextLimit += limit - offsets.limit;
173 offsets.limit = limit;
b75a7d8f
A
174}
175
176U_NAMESPACE_END
177
178#endif /* #if !UCONFIG_NO_TRANSLITERATION */