[apple/icu.git] / icuSources / i18n / nortrans.cpp

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2001-2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   07/03/01    aliu        Creation.
**********************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/normalizer2.h"
#include "unicode/utf16.h"
#include "cstring.h"
#include "nortrans.h"

U_NAMESPACE_BEGIN

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)

// Wraps a C string pointer in a Transliterator::Token so that it can be
// handed to a factory as its opaque context.
static inline Transliterator::Token cstrToken(const char *s) {
    // pointerToken() takes a non-const void *, so drop the const qualifier.
    void *opaque = const_cast<char *>(s);
    return Transliterator::pointerToken(opaque);
}

/**
 * System registration hook.
 */
void NormalizationTransliterator::registerIDs() {
    // In the Token, the byte after the NUL is the UNormalization2Mode.
    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
                                     _create, cstrToken("nfc\0\0"));
    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
                                     _create, cstrToken("nfkc\0\0"));
    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
                                     _create, cstrToken("nfc\0\1"));
    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
                                     _create, cstrToken("nfkc\0\1"));
    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
                                     _create, cstrToken("nfc\0\2"));
    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
                                     _create, cstrToken("nfc\0\3"));
    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
                                            UNICODE_STRING_SIMPLE("NFD"), TRUE);
    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
                                            UNICODE_STRING_SIMPLE("NFKD"), TRUE);
    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
                                            UNICODE_STRING_SIMPLE("NFD"), FALSE);
    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
                                            UNICODE_STRING_SIMPLE("FCD"), FALSE);
}

/**
 * Factory method.
 * The context token points at a C string of the form "<name>\0<mode>":
 * the normalization data name, followed after its NUL terminator by one
 * byte holding the UNormalization2Mode.
 *
 * @param ID      the ID to give the new transliterator
 * @param context token whose pointer refers to the string described above
 * @return a new NormalizationTransliterator, or NULL if the Normalizer2
 *         instance could not be obtained
 */
Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
                                                     Token context) {
    const char *dataName = static_cast<const char *>(context.pointer);
    // Locate the NUL terminator; the mode byte sits right behind it.
    const char *terminator = uprv_strchr(dataName, 0);
    UNormalization2Mode mode = static_cast<UNormalization2Mode>(terminator[1]);

    UErrorCode status = U_ZERO_ERROR;
    const Normalizer2 *normalizer = Normalizer2::getInstance(NULL, dataName, mode, status);
    if(U_FAILURE(status)) {
        return NULL;
    }
    return new NormalizationTransliterator(ID, *normalizer);
}

/**
 * Constructs a transliterator with the given ID that normalizes
 * with the given Normalizer2.
 *
 * @param id    the transliterator ID, forwarded to the base class
 * @param norm2 the normalizer stored in fNorm2
 */
NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
                                                         const Normalizer2 &norm2)
    : Transliterator(id, 0),
      fNorm2(norm2)
{
}

/**
 * Destructor.
 * The body is empty: no explicit cleanup is performed here.
 */
NormalizationTransliterator::~NormalizationTransliterator() {}

/**
 * Copy constructor.
 * Copies the base Transliterator state and initializes fNorm2 from
 * the other object's fNorm2.
 */
NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o)
    : Transliterator(o),
      fNorm2(o.fNorm2)
{
}

/**
 * Transliterator API.
 * @return a newly allocated copy of this object, made with the
 *         copy constructor
 */
Transliterator* NormalizationTransliterator::clone(void) const {
    NormalizationTransliterator *copy = new NormalizationTransliterator(*this);
    return copy;
}

/**
 * Implements {@link Transliterator#handleTransliterate}.
 *
 * Normalizes the text between offsets.start and offsets.limit with fNorm2,
 * one segment at a time, and replaces every segment whose normalized form
 * differs from the original text. Before returning (except for an empty
 * input range), offsets.start, offsets.limit and offsets.contextLimit are
 * updated to reflect how far processing got and how much the text length
 * changed.
 *
 * @param text          the text to normalize in place
 * @param offsets       in/out positions; start and limit delimit the range
 *                      to process
 * @param isIncremental if true, a final segment that ends exactly at the
 *                      limit and whose last character has no normalization
 *                      boundary after it is left unmodified, and
 *                      offsets.start is set to the start of that segment
 */
void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                      UBool isIncremental) const {
    // start and limit of the input range
    int32_t start = offsets.start;
    int32_t limit = offsets.limit;
    if(start >= limit) {
        // Empty range: nothing to do; offsets are left unchanged.
        return;
    }

    /*
     * Normalize in chunks that are as short as possible, even in
     * bulk mode, so that styled text is minimally disrupted.
     * In incremental mode, a chunk that ends with offsets.limit
     * must not be normalized.
     *
     * If it was known that the input text is not styled, then
     * a bulk mode normalization could look like this:

    UnicodeString input, normalized;
    int32_t length = limit - start;
    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
    input.releaseBuffer(length);

    UErrorCode status = U_ZERO_ERROR;
    fNorm2.normalize(input, normalized, status);

    text.handleReplaceBetween(start, limit, normalized);

    int32_t delta = normalized.length() - length;
    offsets.contextLimit += delta;
    offsets.limit += delta;
    offsets.start = limit + delta;

     */
    // errorCode is local only: this function returns void and has no
    // status parameter, so a failure just ends the loop below.
    UErrorCode errorCode = U_ZERO_ERROR;
    UnicodeString segment;
    UnicodeString normalized;
    // c always holds the code point at index start when an outer
    // iteration begins; the inner loop condition refreshes it.
    UChar32 c = text.char32At(start);
    do {
        // prev marks the beginning of the current segment.
        int32_t prev = start;
        // Skip at least one character so we make progress.
        // c holds the character at start.
        segment.remove();
        // Collect code points until the limit is reached or the next
        // code point has a normalization boundary before it. The loop
        // condition also loads that next code point into c.
        do {
            segment.append(c);
            start += U16_LENGTH(c);
        } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
        // When start == limit, c was not reassigned by the loop condition,
        // so it is still the last code point appended to segment.
        if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
            // stop in incremental mode when we reach the input limit
            // in case there are additional characters that could change the
            // normalization result
            // Rewind so that offsets.start points at the unprocessed segment.
            start=prev;
            break;
        }
        fNorm2.normalize(segment, normalized, errorCode);
        if(U_FAILURE(errorCode)) {
            // Give up; start has already moved past this segment, which
            // is left unmodified in text.
            break;
        }
        // Only touch the text when normalization actually changed the segment.
        if(segment != normalized) {
            // replace the input chunk with its normalized form
            text.handleReplaceBetween(prev, start, normalized);

            // update all necessary indexes accordingly
            // delta is the change in length caused by the replacement.
            int32_t delta = normalized.length() - (start - prev);
            start += delta;
            limit += delta;
        }
    } while(start < limit);

    // Publish the results: contextLimit is shifted by the total change in
    // limit, which must be computed before offsets.limit is overwritten.
    offsets.start = start;
    offsets.contextLimit += limit - offsets.limit;
    offsets.limit = limit;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */
Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f A	3	/*
b75a7d8f A	4	**********************************************************************
4388f060	5	* Copyright (C) 2001-2011, International Business Machines
b75a7d8f A	6	* Corporation and others. All Rights Reserved.
	7	**********************************************************************
	8	* Date Name Description
	9	* 07/03/01 aliu Creation.
	10	**********************************************************************
	11	*/
	12
	13	#include "unicode/utypes.h"
	14
	15	#if !UCONFIG_NO_TRANSLITERATION
	16
729e4ab9	17	#include "unicode/normalizer2.h"
4388f060	18	#include "unicode/utf16.h"
729e4ab9	19	#include "cstring.h"
b75a7d8f	20	#include "nortrans.h"
b75a7d8f A	21
	22	U_NAMESPACE_BEGIN
	23
374ca955	24	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
b75a7d8f	25
729e4ab9 A	26	static inline Transliterator::Token cstrToken(const char *s) {
	27	return Transliterator::pointerToken((void *)s);
	28	}
	29
b75a7d8f A	30	/**
	31	* System registration hook.
	32	*/
	33	void NormalizationTransliterator::registerIDs() {
729e4ab9	34	// In the Token, the byte after the NUL is the UNormalization2Mode.
b75a7d8f	35	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
729e4ab9	36	_create, cstrToken("nfc\0\0"));
b75a7d8f	37	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
729e4ab9	38	_create, cstrToken("nfkc\0\0"));
b75a7d8f	39	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
729e4ab9	40	_create, cstrToken("nfc\0\1"));
b75a7d8f	41	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
729e4ab9 A	42	_create, cstrToken("nfkc\0\1"));
	43	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
	44	_create, cstrToken("nfc\0\2"));
	45	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
	46	_create, cstrToken("nfc\0\3"));
b75a7d8f A	47	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
	48	UNICODE_STRING_SIMPLE("NFD"), TRUE);
	49	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
	50	UNICODE_STRING_SIMPLE("NFKD"), TRUE);
729e4ab9 A	51	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
	52	UNICODE_STRING_SIMPLE("NFD"), FALSE);
	53	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
	54	UNICODE_STRING_SIMPLE("FCD"), FALSE);
b75a7d8f A	55	}
	56
	57	/**
	58	* Factory methods
	59	*/
	60	Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
	61	Token context) {
729e4ab9 A	62	const char name = (const char )context.pointer;
	63	UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
	64	UErrorCode errorCode = U_ZERO_ERROR;
	65	const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
	66	if(U_SUCCESS(errorCode)) {
	67	return new NormalizationTransliterator(ID, *norm2);
	68	} else {
	69	return NULL;
	70	}
b75a7d8f A	71	}
	72
	73	/**
	74	* Constructs a transliterator.
	75	*/
729e4ab9 A	76	NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
	77	const Normalizer2 &norm2) :
	78	Transliterator(id, 0), fNorm2(norm2) {}
b75a7d8f A	79
	80	/**
	81	* Destructor.
	82	*/
	83	NormalizationTransliterator::~NormalizationTransliterator() {
	84	}
	85
	86	/**
	87	* Copy constructor.
	88	*/
	89	NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
729e4ab9	90	Transliterator(o), fNorm2(o.fNorm2) {}
b75a7d8f A	91
	92	/**
	93	* Transliterator API.
	94	*/
	95	Transliterator* NormalizationTransliterator::clone(void) const {
	96	return new NormalizationTransliterator(*this);
	97	}
	98
	99	/**
	100	* Implements {@link Transliterator#handleTransliterate}.
	101	*/
	102	void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
	103	UBool isIncremental) const {
	104	// start and limit of the input range
	105	int32_t start = offsets.start;
	106	int32_t limit = offsets.limit;
b75a7d8f A	107	if(start >= limit) {
	108	return;
	109	}
	110
b75a7d8f A	111	/*
	112	* Normalize as short chunks at a time as possible even in
	113	* bulk mode, so that styled text is minimally disrupted.
	114	* In incremental mode, a chunk that ends with offsets.limit
	115	* must not be normalized.
	116	*
	117	* If it was known that the input text is not styled, then
	118	* a bulk mode normalization could look like this:
b75a7d8f	119
729e4ab9 A	120	UnicodeString input, normalized;
729e4ab9 A	121	int32_t length = limit - start;
b75a7d8f A	122	_Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
	123	input.releaseBuffer(length);
	124
	125	UErrorCode status = U_ZERO_ERROR;
729e4ab9	126	fNorm2.normalize(input, normalized, status);
b75a7d8f	127
729e4ab9	128	text.handleReplaceBetween(start, limit, normalized);
b75a7d8f	129
729e4ab9	130	int32_t delta = normalized.length() - length;
b75a7d8f A	131	offsets.contextLimit += delta;
	132	offsets.limit += delta;
	133	offsets.start = limit + delta;
	134
b75a7d8f	135	*/
729e4ab9 A	136	UErrorCode errorCode = U_ZERO_ERROR;
	137	UnicodeString segment;
	138	UnicodeString normalized;
	139	UChar32 c = text.char32At(start);
	140	do {
	141	int32_t prev = start;
	142	// Skip at least one character so we make progress.
	143	// c holds the character at start.
	144	segment.remove();
	145	do {
	146	segment.append(c);
	147	start += U16_LENGTH(c);
	148	} while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
	149	if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
b75a7d8f A	150	// stop in incremental mode when we reach the input limit
	151	// in case there are additional characters that could change the
	152	// normalization result
729e4ab9 A	153	start=prev;
729e4ab9 A	154	break;
b75a7d8f	155	}
729e4ab9 A	156	fNorm2.normalize(segment, normalized, errorCode);
	157	if(U_FAILURE(errorCode)) {
	158	break;
	159	}
	160	if(segment != normalized) {
b75a7d8f	161	// replace the input chunk with its normalized form
729e4ab9	162	text.handleReplaceBetween(prev, start, normalized);
b75a7d8f A	163
b75a7d8f A	164	// update all necessary indexes accordingly
729e4ab9 A	165	int32_t delta = normalized.length() - (start - prev);
	166	start += delta;
	167	limit += delta;
b75a7d8f	168	}
729e4ab9	169	} while(start < limit);
b75a7d8f A	170
b75a7d8f A	171	offsets.start = start;
729e4ab9 A	172	offsets.contextLimit += limit - offsets.limit;
729e4ab9 A	173	offsets.limit = limit;
b75a7d8f A	174	}
	175
	176	U_NAMESPACE_END
	177
	178	#endif /* #if !UCONFIG_NO_TRANSLITERATION */