[apple/icu.git] / icuSources / i18n / nortrans.cpp

/*
**********************************************************************
*   Copyright (C) 2001-2007, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   07/03/01    aliu        Creation.
**********************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/uniset.h"
#include "unicode/uiter.h"
#include "nortrans.h"
#include "unormimp.h"
#include "ucln_in.h"

U_NAMESPACE_BEGIN

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)

/**
 * System registration hook.
 */
void NormalizationTransliterator::registerIDs() {
    UErrorCode errorCode = U_ZERO_ERROR;
    if(!unorm_haveData(&errorCode)) {
        return;
    }

    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
                                     _create, integerToken(UNORM_NFC));
    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
                                     _create, integerToken(UNORM_NFKC));
    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
                                     _create, integerToken(UNORM_NFD));
    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
                                     _create, integerToken(UNORM_NFKD));
    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
                                            UNICODE_STRING_SIMPLE("NFD"), TRUE);
    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
                                            UNICODE_STRING_SIMPLE("NFKD"), TRUE);
}

/**
 * Factory method used by the registrations in registerIDs().
 *
 * @param ID      the ID to give the new transliterator
 * @param context registration token; its integer member carries the
 *                UNormalizationMode to use
 * @return a newly allocated NormalizationTransliterator created with
 *         options set to 0
 */
Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
                                                     Token context) {
    const UNormalizationMode mode = static_cast<UNormalizationMode>(context.integer);
    return new NormalizationTransliterator(ID, mode, 0);
}

/**
 * Constructs a transliterator.
 *
 * @param id   the transliterator ID, passed to the Transliterator base
 *             class together with a 0 second argument
 * @param mode the normalization mode, stored in fMode
 * @param opt  the option bits, stored in options
 */
NormalizationTransliterator::NormalizationTransliterator(
                                 const UnicodeString& id,
                                 UNormalizationMode mode, int32_t opt) :
    Transliterator(id, 0),
    fMode(mode),
    options(opt) {
}

/**
 * Destructor. The body is empty: this class performs no cleanup of its own.
 */
NormalizationTransliterator::~NormalizationTransliterator()
{
}

/**
 * Copy constructor.
 *
 * Copies the base-class state through Transliterator's copy constructor
 * and duplicates the normalization mode and option bits of the source.
 */
NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
    Transliterator(o),
    fMode(o.fMode),
    options(o.options) {
}

/**
 * Assignment operator.
 * The implementation below is commented out and is not compiled.
 */
/*NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) {
    Transliterator::operator=(o);
    fMode = o.fMode;
    options = o.options;
    return *this;
}*/

/**
 * Transliterator API.
 *
 * @return a newly allocated copy of this object, made with the copy
 *         constructor
 */
Transliterator* NormalizationTransliterator::clone() const {
    Transliterator* copy = new NormalizationTransliterator(*this);
    return copy;
}

/**
 * Implements {@link Transliterator#handleTransliterate}.
 */
void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                      UBool isIncremental) const {
    // start and limit of the input range
    int32_t start = offsets.start;
    int32_t limit = offsets.limit;
    int32_t length, delta;

    if(start >= limit) {
        return;
    }

    // a C code unit iterator, implemented around the Replaceable
    UCharIterator iter;
    uiter_setReplaceable(&iter, &text);

    // the output string and buffer pointer
    UnicodeString output;
    UChar *buffer;
    UBool neededToNormalize;

    UErrorCode errorCode;

    /*
     * Normalize as short chunks at a time as possible even in
     * bulk mode, so that styled text is minimally disrupted.
     * In incremental mode, a chunk that ends with offsets.limit
     * must not be normalized.
     *
     * If it was known that the input text is not styled, then
     * a bulk mode normalization could look like this:
     *

    UChar staticChars[256];
    UnicodeString input;

    length = limit - start;
    input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias

    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
    input.releaseBuffer(length);

    UErrorCode status = U_ZERO_ERROR;
    Normalizer::normalize(input, fMode, options, output, status);

    text.handleReplaceBetween(start, limit, output);

    int32_t delta = output.length() - length;
    offsets.contextLimit += delta;
    offsets.limit += delta;
    offsets.start = limit + delta;

     *
     */
    while(start < limit) {
        // set the iterator limits for the remaining input range
        // this is a moving target because of the replacements in the text object
        iter.start = iter.index = start;
        iter.limit = limit;

        // incrementally normalize a small chunk of the input
        buffer = output.getBuffer(-1);
        errorCode = U_ZERO_ERROR;
        length = unorm_next(&iter, buffer, output.getCapacity(),
                            fMode, 0,
                            TRUE, &neededToNormalize,
                            &errorCode);
        output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);

        if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
            // use a larger output string buffer and do it again from the start
            iter.index = start;
            buffer = output.getBuffer(length);
            errorCode = U_ZERO_ERROR;
            length = unorm_next(&iter, buffer, output.getCapacity(),
                                fMode, 0,
                                TRUE, &neededToNormalize,
                                &errorCode);
            output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
        }

        if(U_FAILURE(errorCode)) {
            break;
        }

        limit = iter.index;
        if(isIncremental && limit == iter.limit) {
            // stop in incremental mode when we reach the input limit
            // in case there are additional characters that could change the
            // normalization result

            // UNLESS all characters in the result of the normalization of
            // the last run are in the skippable set
            const UChar *s=output.getBuffer();
            int32_t i=0, outLength=output.length();
            UChar32 c;

            while(i<outLength) {
                U16_NEXT(s, i, outLength, c);
                if(!unorm_isNFSkippable(c, fMode)) {
                    outLength=-1; // I wish C++ had labeled loops and break outer; ...
                    break;
                }
            }
            if (outLength<0) {
                break;
            }
        }

        if(neededToNormalize) {
            // replace the input chunk with its normalized form
            text.handleReplaceBetween(start, limit, output);

            // update all necessary indexes accordingly
            delta = length - (limit - start);   // length change in the text object
            start = limit += delta;             // the next chunk starts where this one ends, with adjustment
            limit = offsets.limit += delta;     // set the iteration limit to the adjusted end of the input range
            offsets.contextLimit += delta;
        } else {
            // delta == 0
            start = limit;
            limit = offsets.limit;
        }
    }

    offsets.start = start;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */
Commit	Line	Data
b75a7d8f A	1	/*
b75a7d8f A	2	**********************************************************************
46f4442e	3	* Copyright (C) 2001-2007, International Business Machines
b75a7d8f A	4	* Corporation and others. All Rights Reserved.
	5	**********************************************************************
	6	* Date Name Description
	7	* 07/03/01 aliu Creation.
	8	**********************************************************************
	9	*/
	10
	11	#include "unicode/utypes.h"
	12
	13	#if !UCONFIG_NO_TRANSLITERATION
	14
	15	#include "unicode/uniset.h"
	16	#include "unicode/uiter.h"
	17	#include "nortrans.h"
	18	#include "unormimp.h"
b75a7d8f A	19	#include "ucln_in.h"
	20
	21	U_NAMESPACE_BEGIN
	22
374ca955	23	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
b75a7d8f A	24
	25	/**
	26	* System registration hook.
	27	*/
	28	void NormalizationTransliterator::registerIDs() {
	29	UErrorCode errorCode = U_ZERO_ERROR;
	30	if(!unorm_haveData(&errorCode)) {
	31	return;
	32	}
	33
	34	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
	35	_create, integerToken(UNORM_NFC));
	36	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
	37	_create, integerToken(UNORM_NFKC));
	38	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
	39	_create, integerToken(UNORM_NFD));
	40	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
	41	_create, integerToken(UNORM_NFKD));
	42	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
	43	UNICODE_STRING_SIMPLE("NFD"), TRUE);
	44	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
	45	UNICODE_STRING_SIMPLE("NFKD"), TRUE);
	46	}
	47
	48	/**
	49	* Factory methods
	50	*/
	51	Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
	52	Token context) {
	53	return new NormalizationTransliterator(ID, (UNormalizationMode) context.integer, 0);
	54	}
	55
	56	/**
	57	* Constructs a transliterator.
	58	*/
	59	NormalizationTransliterator::NormalizationTransliterator(
	60	const UnicodeString& id,
	61	UNormalizationMode mode, int32_t opt) :
	62	Transliterator(id, 0) {
	63	fMode = mode;
	64	options = opt;
	65	}
	66
	67	/**
	68	* Destructor.
	69	*/
	70	NormalizationTransliterator::~NormalizationTransliterator() {
	71	}
	72
	73	/**
	74	* Copy constructor.
	75	*/
	76	NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
	77	Transliterator(o) {
	78	fMode = o.fMode;
	79	options = o.options;
	80	}
	81
	82	/**
	83	* Assignment operator.
	84	*/
46f4442e	85	/*NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) {
b75a7d8f A	86	Transliterator::operator=(o);
	87	fMode = o.fMode;
	88	options = o.options;
	89	return *this;
46f4442e	90	}*/
b75a7d8f A	91
	92	/**
	93	* Transliterator API.
	94	*/
	95	Transliterator* NormalizationTransliterator::clone(void) const {
	96	return new NormalizationTransliterator(*this);
	97	}
	98
	99	/**
	100	* Implements {@link Transliterator#handleTransliterate}.
	101	*/
	102	void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
	103	UBool isIncremental) const {
	104	// start and limit of the input range
	105	int32_t start = offsets.start;
	106	int32_t limit = offsets.limit;
	107	int32_t length, delta;
	108
	109	if(start >= limit) {
	110	return;
	111	}
	112
	113	// a C code unit iterator, implemented around the Replaceable
	114	UCharIterator iter;
	115	uiter_setReplaceable(&iter, &text);
	116
	117	// the output string and buffer pointer
	118	UnicodeString output;
	119	UChar *buffer;
	120	UBool neededToNormalize;
	121
	122	UErrorCode errorCode;
	123
	124	/*
	125	* Normalize as short chunks at a time as possible even in
	126	* bulk mode, so that styled text is minimally disrupted.
	127	* In incremental mode, a chunk that ends with offsets.limit
	128	* must not be normalized.
	129	*
	130	* If it was known that the input text is not styled, then
	131	* a bulk mode normalization could look like this:
	132	*
	133
	134	UChar staticChars[256];
	135	UnicodeString input;
	136
	137	length = limit - start;
	138	input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias
	139
	140	_Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
	141	input.releaseBuffer(length);
	142
	143	UErrorCode status = U_ZERO_ERROR;
	144	Normalizer::normalize(input, fMode, options, output, status);
	145
	146	text.handleReplaceBetween(start, limit, output);
	147
	148	int32_t delta = output.length() - length;
	149	offsets.contextLimit += delta;
	150	offsets.limit += delta;
	151	offsets.start = limit + delta;
	152
	153	*
	154	*/
155	while(start < limit) {
156	// set the iterator limits for the remaining input range
157	// this is a moving target because of the replacements in the text object
158	iter.start = iter.index = start;
159	iter.limit = limit;
160
161	// incrementally normalize a small chunk of the input
162	buffer = output.getBuffer(-1);
163	errorCode = U_ZERO_ERROR;
164	length = unorm_next(&iter, buffer, output.getCapacity(),
165	fMode, 0,
166	TRUE, &neededToNormalize,
167	&errorCode);
73c04bcf	168	output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
b75a7d8f A	169
	170	if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
	171	// use a larger output string buffer and do it again from the start
	172	iter.index = start;
	173	buffer = output.getBuffer(length);
	174	errorCode = U_ZERO_ERROR;
	175	length = unorm_next(&iter, buffer, output.getCapacity(),
	176	fMode, 0,
	177	TRUE, &neededToNormalize,
	178	&errorCode);
73c04bcf	179	output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
b75a7d8f A	180	}
	181
	182	if(U_FAILURE(errorCode)) {
	183	break;
	184	}
	185
	186	limit = iter.index;
	187	if(isIncremental && limit == iter.limit) {
	188	// stop in incremental mode when we reach the input limit
	189	// in case there are additional characters that could change the
	190	// normalization result
	191
	192	// UNLESS all characters in the result of the normalization of
	193	// the last run are in the skippable set
	194	const UChar *s=output.getBuffer();
	195	int32_t i=0, outLength=output.length();
	196	UChar32 c;
	197
	198	while(i<outLength) {
	199	U16_NEXT(s, i, outLength, c);
	200	if(!unorm_isNFSkippable(c, fMode)) {
	201	outLength=-1; // I wish C++ had labeled loops and break outer; ...
	202	break;
	203	}
	204	}
	205	if (outLength<0) {
	206	break;
	207	}
	208	}
	209
	210	if(neededToNormalize) {
	211	// replace the input chunk with its normalized form
	212	text.handleReplaceBetween(start, limit, output);
	213
	214	// update all necessary indexes accordingly
	215	delta = length - (limit - start); // length change in the text object
	216	start = limit += delta; // the next chunk starts where this one ends, with adjustment
	217	limit = offsets.limit += delta; // set the iteration limit to the adjusted end of the input range
	218	offsets.contextLimit += delta;
	219	} else {
	220	// delta == 0
	221	start = limit;
	222	limit = offsets.limit;
	223	}
	224	}
	225
	226	offsets.start = start;
	227	}
	228
	229	U_NAMESPACE_END
	230
	231	#endif /* #if !UCONFIG_NO_TRANSLITERATION */