+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/*
*************************************************************************
* COPYRIGHT:
- * Copyright (c) 1996-2004, International Business Machines Corporation and
+ * Copyright (c) 1996-2012, International Business Machines Corporation and
* others. All Rights Reserved.
*************************************************************************
*/
#if !UCONFIG_NO_NORMALIZATION
+#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/chariter.h"
#include "unicode/schriter.h"
#include "unicode/uchriter.h"
-#include "unicode/uiter.h"
#include "unicode/normlzr.h"
+#include "unicode/utf16.h"
#include "cmemory.h"
-#include "unormimp.h"
+#include "normalizer2impl.h"
+#include "uprops.h" // for uniset_getUnicode32Instance()
+
+#if defined(_ARM64_) && defined(move32)
+ // System can define move32 intrinsics, but the char iters define move32 method
+ // using same undef trick in headers, so undef here to re-enable the method.
+#undef move32
+#endif
U_NAMESPACE_BEGIN
//-------------------------------------------------------------------------
// Builds a Normalizer over the contents of `str`, normalizing in `mode`
// with no option bits set (fOptions is 0).
// In the new code the StringCharacterIterator is allocated directly in the
// initializer list and stored in `text`; both normalizer pointers start out
// NULL and the argument-less init() then selects the implementation.
Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
- UObject(), fUMode(mode), fOptions(0),
+ UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
+ text(new StringCharacterIterator(str)),
currentIndex(0), nextIndex(0),
buffer(), bufferPos(0)
{
- init(new StringCharacterIterator(str));
+ init();
}
// Builds a Normalizer over the `length` code units starting at `str`,
// normalizing in `mode` with no option bits set.
// The parameter type changes from `const UChar *` to `ConstChar16Ptr`; the
// UCharCharacterIterator is now created in the initializer list and kept in
// `text`, after which init() selects the normalizer implementation.
-Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
- UObject(), fUMode(mode), fOptions(0),
+Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode) :
+ UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
+ text(new UCharCharacterIterator(str, length)),
currentIndex(0), nextIndex(0),
buffer(), bufferPos(0)
{
- init(new UCharCharacterIterator(str, length));
+ init();
}
// Builds a Normalizer over a private clone of `iter`, normalizing in `mode`
// with no option bits set. The caller's iterator is not stored; only the
// result of iter.clone() is kept in `text`.
// init() is now called without arguments because the iterator is already
// in place when it runs.
Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
- UObject(), fUMode(mode), fOptions(0),
+ UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
+ text(iter.clone()),
currentIndex(0), nextIndex(0),
buffer(), bufferPos(0)
{
- init(iter.clone());
+ init();
}
// Copy constructor: duplicates the mode, options, iteration indexes and the
// normalized buffer of `copy`, and clones its text iterator so the two
// objects never share a CharacterIterator.
// The normalizer pointers are not copied; they start out NULL and init()
// derives them again from the copied fUMode/fOptions.
Normalizer::Normalizer(const Normalizer &copy) :
- UObject(copy), fUMode(copy.fUMode), fOptions(copy.fOptions),
+ UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
+ text(copy.text->clone()),
currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
buffer(copy.buffer), bufferPos(copy.bufferPos)
{
- init(((CharacterIterator *)(copy.text->context))->clone());
+ init();
}
-static const UChar _NUL=0;
-
// (Re)selects the Normalizer2 implementation stored in fNorm2 from the
// current fUMode and fOptions. The new version takes no iterator argument:
// the constructors now set `text` themselves, and setMode()/setOption()
// call init() again after changing the mode or options.
void
-Normalizer::init(CharacterIterator *iter) {
+Normalizer::init() {
UErrorCode errorCode=U_ZERO_ERROR;
-
- text=(UCharIterator *)uprv_malloc(sizeof(UCharIterator));
- if(text!=NULL) {
- if(unorm_haveData(&errorCode)) {
- uiter_setCharacterIterator(text, iter);
- } else {
- delete iter;
- uiter_setCharacterIterator(text, new UCharCharacterIterator(&_NUL, 0));
- }
- } else {
- delete iter;
+ fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
// With UNORM_UNICODE_3_2 set, the mode's normalizer is wrapped in a
// FilteredNormalizer2 restricted to the set returned by
// uniset_getUnicode32Instance(). The previously owned filter is deleted
// first because init() can run more than once on the same object.
// NOTE(review): *fNorm2 and the returned set are dereferenced before
// errorCode is checked, and the result of `new` is not checked for NULL --
// confirm that getInstance()/uniset_getUnicode32Instance() cannot return
// NULL on failure.
+ if(fOptions&UNORM_UNICODE_3_2) {
+ delete fFilteredNorm2;
+ fNorm2=fFilteredNorm2=
+ new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
+ }
// On any failure, reset the error code and use the instance returned by
// getNoopInstance() instead.
+ if(U_FAILURE(errorCode)) {
+ errorCode=U_ZERO_ERROR;
+ fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
}
}
// Destructor: releases the two objects this Normalizer deletes itself, the
// optional FilteredNormalizer2 wrapper and the text iterator.
// fNorm2 is not deleted separately; init() either sets it to the same
// pointer as fFilteredNorm2 or to an instance obtained from
// Normalizer2Factory.
Normalizer::~Normalizer()
{
- if(text!=NULL) {
- delete (CharacterIterator *)text->context;
- uprv_free(text);
- }
+ delete fFilteredNorm2;
+ delete text;
}
// Returns a new heap-allocated copy of this Normalizer, made with the copy
// constructor. The old `this!=0` guard is removed, so the function now
// always allocates.
// NOTE(review): presumably the caller is responsible for deleting the
// returned object -- confirm against the header documentation.
Normalizer*
Normalizer::clone() const
{
- if(this!=0) {
- return new Normalizer(*this);
- } else {
- return 0;
- }
+ return new Normalizer(*this);
}
/**
 * Computes a hash code for this iterator by summing the hash of the text
 * iterator, the mode, the options, the hash of the normalized buffer, the
 * buffer position and both text indexes.
 * The new code calls hashCode() on `text` directly instead of casting
 * text->context to a CharacterIterator.
 */
int32_t Normalizer::hashCode() const
{
- return ((CharacterIterator *)(text->context))->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
+ return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
}
// Equality: true for the same object, or when mode, options, text iterator,
// normalized buffer, buffer position and nextIndex all compare equal.
// currentIndex is not part of the comparison.
// The added parentheses only make the existing grouping explicit (&& already
// binds tighter than ||); the text iterators are now compared directly
// rather than through text->context.
UBool Normalizer::operator==(const Normalizer& that) const
{
return
this==&that ||
- fUMode==that.fUMode &&
+ (fUMode==that.fUMode &&
fOptions==that.fOptions &&
- *((CharacterIterator *)(text->context))==*((CharacterIterator *)(that.text->context)) &&
+ *text==*that.text &&
buffer==that.buffer &&
bufferPos==that.bufferPos &&
- nextIndex==that.nextIndex;
+ nextIndex==that.nextIndex);
}
//-------------------------------------------------------------------------
// the source and result strings are the same object, use a temporary one
dest=&localDest;
}
-
- UChar *buffer=dest->getBuffer(source.length());
- int32_t length=unorm_internalNormalize(buffer, dest->getCapacity(),
- source.getBuffer(), source.length(),
- mode, options,
- &status);
- dest->releaseBuffer(length);
- if(status==U_BUFFER_OVERFLOW_ERROR) {
- status=U_ZERO_ERROR;
- buffer=dest->getBuffer(length);
- length=unorm_internalNormalize(buffer, dest->getCapacity(),
- source.getBuffer(), source.length(),
- mode, options,
- &status);
- dest->releaseBuffer(length);
+ const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
+ if(U_SUCCESS(status)) {
+ if(options&UNORM_UNICODE_3_2) {
+ FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
+ normalize(source, *dest, status);
+ } else {
+ n2->normalize(source, *dest, status);
+ }
}
-
- if(dest==&localDest) {
+ if(dest==&localDest && U_SUCCESS(status)) {
result=*dest;
}
- if(U_FAILURE(status)) {
- result.setToBogus();
- }
}
}
UBool compat, int32_t options,
UnicodeString& result,
UErrorCode &status) {
- if(source.isBogus() || U_FAILURE(status)) {
- result.setToBogus();
- if(U_SUCCESS(status)) {
- status=U_ILLEGAL_ARGUMENT_ERROR;
- }
- } else {
- UnicodeString localDest;
- UnicodeString *dest;
-
- if(&source!=&result) {
- dest=&result;
- } else {
- // the source and result strings are the same object, use a temporary one
- dest=&localDest;
- }
-
- UChar *buffer=dest->getBuffer(source.length());
- int32_t length=unorm_compose(buffer, dest->getCapacity(),
- source.getBuffer(), source.length(),
- compat, options,
- &status);
- dest->releaseBuffer(length);
- if(status==U_BUFFER_OVERFLOW_ERROR) {
- status=U_ZERO_ERROR;
- buffer=dest->getBuffer(length);
- length=unorm_compose(buffer, dest->getCapacity(),
- source.getBuffer(), source.length(),
- compat, options,
- &status);
- dest->releaseBuffer(length);
- }
-
- if(dest==&localDest) {
- result=*dest;
- }
- if(U_FAILURE(status)) {
- result.setToBogus();
- }
- }
+ normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
}
void U_EXPORT2
UBool compat, int32_t options,
UnicodeString& result,
UErrorCode &status) {
- if(source.isBogus() || U_FAILURE(status)) {
- result.setToBogus();
- if(U_SUCCESS(status)) {
- status=U_ILLEGAL_ARGUMENT_ERROR;
+ normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
+}
+
// Runs the quick check of `source` for normalization mode `mode`.
// The Normalizer2 for the mode is looked up first; if `status` indicates
// failure after that lookup the function returns UNORM_MAYBE without
// checking anything. With UNORM_UNICODE_3_2 set in `options`, the check is
// made through a temporary FilteredNormalizer2 built over the set returned
// by uniset_getUnicode32Instance(); otherwise the mode's normalizer is
// called directly.
// (The unprefixed and '-' lines below are leftovers of the old decompose()
// body that this hunk replaces.)
+UNormalizationCheckResult
+Normalizer::quickCheck(const UnicodeString& source,
+ UNormalizationMode mode, int32_t options,
+ UErrorCode &status) {
+ const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
+ if(U_SUCCESS(status)) {
+ if(options&UNORM_UNICODE_3_2) {
+ return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
+ quickCheck(source, status);
+ } else {
+ return n2->quickCheck(source, status);
}
} else {
- UnicodeString localDest;
- UnicodeString *dest;
+ return UNORM_MAYBE;
+ }
+}
- if(&source!=&result) {
- dest=&result;
// Tests whether `source` is already normalized for mode `mode`.
// Mirrors quickCheck(): the Normalizer2 for the mode is looked up, FALSE is
// returned if `status` indicates failure after the lookup, and with
// UNORM_UNICODE_3_2 set the test goes through a temporary
// FilteredNormalizer2 over the set from uniset_getUnicode32Instance().
// (The '-' lines below are the remainder of the old decompose() body that
// this hunk removes.)
+UBool
+Normalizer::isNormalized(const UnicodeString& source,
+ UNormalizationMode mode, int32_t options,
+ UErrorCode &status) {
+ const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
+ if(U_SUCCESS(status)) {
+ if(options&UNORM_UNICODE_3_2) {
+ return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
+ isNormalized(source, status);
} else {
- // the source and result strings are the same object, use a temporary one
- dest=&localDest;
- }
-
- UChar *buffer=dest->getBuffer(source.length());
- int32_t length=unorm_decompose(buffer, dest->getCapacity(),
- source.getBuffer(), source.length(),
- compat, options,
- &status);
- dest->releaseBuffer(length);
- if(status==U_BUFFER_OVERFLOW_ERROR) {
- status=U_ZERO_ERROR;
- buffer=dest->getBuffer(length);
- length=unorm_decompose(buffer, dest->getCapacity(),
- source.getBuffer(), source.length(),
- compat, options,
- &status);
- dest->releaseBuffer(length);
- }
-
- if(dest==&localDest) {
- result=*dest;
- }
- if(U_FAILURE(status)) {
- result.setToBogus();
+ return n2->isNormalized(source, status);
}
+ } else {
+ return FALSE;
}
}
UnicodeString & U_EXPORT2
-Normalizer::concatenate(UnicodeString &left, UnicodeString &right,
+Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
UnicodeString &result,
UNormalizationMode mode, int32_t options,
UErrorCode &errorCode) {
UnicodeString localDest;
UnicodeString *dest;
- if(&left!=&result && &right!=&result) {
+ if(&right!=&result) {
dest=&result;
} else {
- // the source and result strings are the same object, use a temporary one
+ // the right and result strings are the same object, use a temporary one
dest=&localDest;
}
-
- UChar *buffer=dest->getBuffer(left.length()+right.length());
- int32_t length=unorm_concatenate(left.getBuffer(), left.length(),
- right.getBuffer(), right.length(),
- buffer, dest->getCapacity(),
- mode, options,
- &errorCode);
- dest->releaseBuffer(length);
- if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
- errorCode=U_ZERO_ERROR;
- buffer=dest->getBuffer(length);
- int32_t length=unorm_concatenate(left.getBuffer(), left.length(),
- right.getBuffer(), right.length(),
- buffer, dest->getCapacity(),
- mode, options,
- &errorCode);
- dest->releaseBuffer(length);
+ *dest=left;
+ const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
+ if(U_SUCCESS(errorCode)) {
+ if(options&UNORM_UNICODE_3_2) {
+ FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
+ append(*dest, right, errorCode);
+ } else {
+ n2->append(*dest, right, errorCode);
+ }
}
-
- if(dest==&localDest) {
+ if(dest==&localDest && U_SUCCESS(errorCode)) {
result=*dest;
}
- if(U_FAILURE(errorCode)) {
- result.setToBogus();
- }
}
return result;
}
UChar32 Normalizer::next() {
if(bufferPos<buffer.length() || nextNormalize()) {
UChar32 c=buffer.char32At(bufferPos);
- bufferPos+=UTF_CHAR_LENGTH(c);
+ bufferPos+=U16_LENGTH(c);
return c;
} else {
return DONE;
UChar32 Normalizer::previous() {
if(bufferPos>0 || previousNormalize()) {
UChar32 c=buffer.char32At(bufferPos-1);
- bufferPos-=UTF_CHAR_LENGTH(c);
+ bufferPos-=U16_LENGTH(c);
return c;
} else {
return DONE;
}
// Moves the text iterator to its start, sets currentIndex and nextIndex to
// the index setToStart() returns, and clears the normalized buffer via
// clearBuffer().
void Normalizer::reset() {
- currentIndex=nextIndex=text->move(text, 0, UITER_START);
+ currentIndex=nextIndex=text->setToStart();
clearBuffer();
}
// Repositions the iteration at text index `index` and clears the normalized
// buffer, without normalizing anything.
// The new code sets the index on the iterator and then reads it back, so
// currentIndex/nextIndex hold whatever index the iterator actually settled
// on rather than the raw argument.
void
Normalizer::setIndexOnly(int32_t index) {
- currentIndex=nextIndex=text->move(text, index, UITER_ZERO); // validates index
+ text->setIndex(index); // pins index
+ currentIndex=nextIndex=text->getIndex();
clearBuffer();
}
/**
- * Return the first character in the normalized text-> This resets
- * the <tt>Normalizer's</tt> position to the beginning of the text->
+ * Return the first character in the normalized text. This resets
+ * the <tt>Normalizer's</tt> position to the beginning of the text.
*/
UChar32 Normalizer::first() {
reset();
}
/**
- * Return the last character in the normalized text-> This resets
+ * Return the last character in the normalized text. This resets
 * the <tt>Normalizer's</tt> position to be just before the
 * input text corresponding to that normalized character.
 */
UChar32 Normalizer::last() {
// Move both indexes to the end of the text, drop any buffered output, then
// step backwards once to produce the final normalized character.
- currentIndex=nextIndex=text->move(text, 0, UITER_LIMIT);
+ currentIndex=nextIndex=text->setToEnd();
clearBuffer();
return previous();
}
}
/**
- * Retrieve the index of the start of the input text-> This is the begin index
+ * Retrieve the index of the start of the input text. This is the begin index
* of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
* over which this <tt>Normalizer</tt> is iterating
*/
int32_t Normalizer::startIndex() const {
// Forwarded to the CharacterIterator held in `text`; the iterator position
// is not changed by this query.
- return text->getIndex(text, UITER_START);
+ return text->startIndex();
}
/**
- * Retrieve the index of the end of the input text-> This is the end index
+ * Retrieve the index of the end of the input text. This is the end index
* of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
* over which this <tt>Normalizer</tt> is iterating
*/
int32_t Normalizer::endIndex() const {
// Forwarded to the CharacterIterator held in `text`; the iterator position
// is not changed by this query.
- return text->getIndex(text, UITER_LIMIT);
+ return text->endIndex();
}
//-------------------------------------------------------------------------
Normalizer::setMode(UNormalizationMode newMode)
{
fUMode = newMode;
+ init();
}
UNormalizationMode
} else {
fOptions &= (~option);
}
+ init();
}
UBool
/**
* Set the input text over which this <tt>Normalizer</tt> will iterate.
- * The iteration position is set to the beginning of the input text->
+ * The iteration position is set to the beginning of the input text.
*/
void
Normalizer::setText(const UnicodeString& newText,
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
- delete (CharacterIterator *)(text->context);
- text->context = newIter;
+ delete text;
+ text = newIter;
reset();
}
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
- delete (CharacterIterator *)(text->context);
- text->context = newIter;
+ delete text;
+ text = newIter;
reset();
}
void
-Normalizer::setText(const UChar* newText,
+Normalizer::setText(ConstChar16Ptr newText,
int32_t length,
UErrorCode &status)
{
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
- delete (CharacterIterator *)(text->context);
- text->context = newIter;
+ delete text;
+ text = newIter;
reset();
}
// Copies the input text being iterated over into `result` by delegating to
// the text iterator's getText(). The cast through text->context is no
// longer needed now that `text` is used as the iterator itself.
void
Normalizer::getText(UnicodeString& result)
{
- ((CharacterIterator *)(text->context))->getText(result);
+ text->getText(result);
}
//-------------------------------------------------------------------------
// Normalizes the next segment of input text into `buffer`.
// Starting at nextIndex, the new code reads one code point unconditionally
// and then keeps appending code points until it meets one for which
// fNorm2->hasBoundaryBefore() is true; that code point is pushed back so it
// starts the following segment. The collected segment is normalized with
// fNorm2 and nextIndex is advanced past it.
// Returns FALSE when there is no more input, when normalization fails, or
// when the normalized result is empty.
UBool
Normalizer::nextNormalize() {
- UChar *p;
- int32_t length;
- UErrorCode errorCode;
-
clearBuffer();
currentIndex=nextIndex;
- text->move(text, nextIndex, UITER_ZERO);
- if(!text->hasNext(text)) {
+ text->setIndex(nextIndex);
+ if(!text->hasNext()) {
return FALSE;
}
-
- errorCode=U_ZERO_ERROR;
- p=buffer.getBuffer(-1);
- length=unorm_next(text, p, buffer.getCapacity(),
- fUMode, fOptions,
- TRUE, 0,
- &errorCode);
- buffer.releaseBuffer(length);
- if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
- errorCode=U_ZERO_ERROR;
- text->move(text, nextIndex, UITER_ZERO);
- p=buffer.getBuffer(length);
- length=unorm_next(text, p, buffer.getCapacity(),
- fUMode, fOptions,
- TRUE, 0,
- &errorCode);
- buffer.releaseBuffer(length);
+ // Skip at least one character so we make progress.
+ UnicodeString segment(text->next32PostInc());
+ while(text->hasNext()) {
+ UChar32 c;
// A boundary before c ends the segment: step the iterator back by one code
// point so c is read again by the next call.
+ if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
+ text->move32(-1, CharacterIterator::kCurrent);
+ break;
+ }
+ segment.append(c);
}
-
- nextIndex=text->getIndex(text, UITER_CURRENT);
+ nextIndex=text->getIndex();
+ UErrorCode errorCode=U_ZERO_ERROR;
+ fNorm2->normalize(segment, buffer, errorCode);
return U_SUCCESS(errorCode) && !buffer.isEmpty();
}
// Normalizes the segment of input text that ends at currentIndex into
// `buffer`.
// The new code walks backwards from currentIndex, prepending each code point
// to the segment, and stops after including the first one for which
// fNorm2->hasBoundaryBefore() is true. currentIndex is moved to the start
// of that segment, and bufferPos is set to the end of the normalized result
// so that previous() reads it back to front.
// Returns FALSE when there is no preceding input, when normalization fails,
// or when the normalized result is empty.
UBool
Normalizer::previousNormalize() {
- UChar *p;
- int32_t length;
- UErrorCode errorCode;
-
clearBuffer();
nextIndex=currentIndex;
- text->move(text, currentIndex, UITER_ZERO);
- if(!text->hasPrevious(text)) {
+ text->setIndex(currentIndex);
+ if(!text->hasPrevious()) {
return FALSE;
}
-
- errorCode=U_ZERO_ERROR;
- p=buffer.getBuffer(-1);
- length=unorm_previous(text, p, buffer.getCapacity(),
- fUMode, fOptions,
- TRUE, 0,
- &errorCode);
- buffer.releaseBuffer(length);
- if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
- errorCode=U_ZERO_ERROR;
- text->move(text, currentIndex, UITER_ZERO);
- p=buffer.getBuffer(length);
- length=unorm_previous(text, p, buffer.getCapacity(),
- fUMode, fOptions,
- TRUE, 0,
- &errorCode);
- buffer.releaseBuffer(length);
+ UnicodeString segment;
+ while(text->hasPrevious()) {
+ UChar32 c=text->previous32();
// c is inserted before the boundary test, so the code point that has the
// boundary before it is part of this segment.
+ segment.insert(0, c);
+ if(fNorm2->hasBoundaryBefore(c)) {
+ break;
+ }
}
-
+ currentIndex=text->getIndex();
+ UErrorCode errorCode=U_ZERO_ERROR;
+ fNorm2->normalize(segment, buffer, errorCode);
bufferPos=buffer.length();
- currentIndex=text->getIndex(text, UITER_CURRENT);
return U_SUCCESS(errorCode) && !buffer.isEmpty();
}