[apple/icu.git] / icuSources / i18n / nortrans.cpp

/*
**********************************************************************
*   Copyright (C) 2001-2005, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   07/03/01    aliu        Creation.
**********************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/uniset.h"
#include "unicode/uiter.h"
#include "nortrans.h"
#include "unormimp.h"
#include "mutex.h"
#include "ucln_in.h"

U_NAMESPACE_BEGIN

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)

/**
 * System registration hook.
 */
void NormalizationTransliterator::registerIDs() {
    UErrorCode errorCode = U_ZERO_ERROR;
    if(!unorm_haveData(&errorCode)) {
        return;
    }

    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
                                     _create, integerToken(UNORM_NFC));
    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
                                     _create, integerToken(UNORM_NFKC));
    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
                                     _create, integerToken(UNORM_NFD));
    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
                                     _create, integerToken(UNORM_NFKD));
    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
                                            UNICODE_STRING_SIMPLE("NFD"), TRUE);
    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
                                            UNICODE_STRING_SIMPLE("NFKD"), TRUE);
}

/**
 * Factory method handed to Transliterator::_registerFactory by registerIDs().
 *
 * @param ID      the ID to give the new transliterator
 * @param context token whose integer field holds the UNormalizationMode
 *                that was supplied at registration time
 * @return a newly allocated NormalizationTransliterator for that mode,
 *         constructed with options set to 0
 */
Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
                                                     Token context) {
    const UNormalizationMode mode = static_cast<UNormalizationMode>(context.integer);
    return new NormalizationTransliterator(ID, mode, 0);
}

/**
 * Constructs a transliterator.
 *
 * @param id   ID forwarded to the Transliterator base class (together
 *             with a 0 second argument)
 * @param mode normalization mode stored in fMode
 * @param opt  normalization options stored in options
 */
NormalizationTransliterator::NormalizationTransliterator(
                                 const UnicodeString& id,
                                 UNormalizationMode mode, int32_t opt) :
    Transliterator(id, 0),
    fMode(mode),
    options(opt) {
}

/**
 * Destructor.
 * Intentionally empty: this class performs no cleanup of its own here.
 */
NormalizationTransliterator::~NormalizationTransliterator() {}

/**
 * Copy constructor.
 * Copies the Transliterator base part, then the normalization mode and
 * options of the source object.
 */
NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
    Transliterator(o),
    fMode(o.fMode),
    options(o.options) {
}

/**
 * Assignment operator.
 * Assigns the Transliterator base part first, then copies the
 * normalization mode and options from the right-hand side.
 *
 * @return a reference to this object
 */
NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) {
    Transliterator::operator=(o);

    // The two members are independent plain values; copy them one by one.
    this->options = o.options;
    this->fMode = o.fMode;

    return *this;
}

/**
 * Transliterator API.
 *
 * @return a newly allocated copy of this object, created through the
 *         copy constructor
 */
Transliterator* NormalizationTransliterator::clone(void) const {
    NormalizationTransliterator* copy = new NormalizationTransliterator(*this);
    return copy;
}

/**
 * Implements {@link Transliterator#handleTransliterate}.
 */
void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                      UBool isIncremental) const {
    // start and limit of the input range
    int32_t start = offsets.start;
    int32_t limit = offsets.limit;
    int32_t length, delta;

    if(start >= limit) {
        return;
    }

    // a C code unit iterator, implemented around the Replaceable
    UCharIterator iter;
    uiter_setReplaceable(&iter, &text);

    // the output string and buffer pointer
    UnicodeString output;
    UChar *buffer;
    UBool neededToNormalize;

    UErrorCode errorCode;

    /*
     * Normalize as short chunks at a time as possible even in
     * bulk mode, so that styled text is minimally disrupted.
     * In incremental mode, a chunk that ends with offsets.limit
     * must not be normalized.
     *
     * If it was known that the input text is not styled, then
     * a bulk mode normalization could look like this:
     *

    UChar staticChars[256];
    UnicodeString input;

    length = limit - start;
    input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias

    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
    input.releaseBuffer(length);

    UErrorCode status = U_ZERO_ERROR;
    Normalizer::normalize(input, fMode, options, output, status);

    text.handleReplaceBetween(start, limit, output);

    int32_t delta = output.length() - length;
    offsets.contextLimit += delta;
    offsets.limit += delta;
    offsets.start = limit + delta;

     *
     */
    while(start < limit) {
        // set the iterator limits for the remaining input range
        // this is a moving target because of the replacements in the text object
        iter.start = iter.index = start;
        iter.limit = limit;

        // incrementally normalize a small chunk of the input
        buffer = output.getBuffer(-1);
        errorCode = U_ZERO_ERROR;
        length = unorm_next(&iter, buffer, output.getCapacity(),
                            fMode, 0,
                            TRUE, &neededToNormalize,
                            &errorCode);
        output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);

        if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
            // use a larger output string buffer and do it again from the start
            iter.index = start;
            buffer = output.getBuffer(length);
            errorCode = U_ZERO_ERROR;
            length = unorm_next(&iter, buffer, output.getCapacity(),
                                fMode, 0,
                                TRUE, &neededToNormalize,
                                &errorCode);
            output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
        }

        if(U_FAILURE(errorCode)) {
            break;
        }

        limit = iter.index;
        if(isIncremental && limit == iter.limit) {
            // stop in incremental mode when we reach the input limit
            // in case there are additional characters that could change the
            // normalization result

            // UNLESS all characters in the result of the normalization of
            // the last run are in the skippable set
            const UChar *s=output.getBuffer();
            int32_t i=0, outLength=output.length();
            UChar32 c;

            while(i<outLength) {
                U16_NEXT(s, i, outLength, c);
                if(!unorm_isNFSkippable(c, fMode)) {
                    outLength=-1; // I wish C++ had labeled loops and break outer; ...
                    break;
                }
            }
            if (outLength<0) {
                break;
            }
        }

        if(neededToNormalize) {
            // replace the input chunk with its normalized form
            text.handleReplaceBetween(start, limit, output);

            // update all necessary indexes accordingly
            delta = length - (limit - start);   // length change in the text object
            start = limit += delta;             // the next chunk starts where this one ends, with adjustment
            limit = offsets.limit += delta;     // set the iteration limit to the adjusted end of the input range
            offsets.contextLimit += delta;
        } else {
            // delta == 0
            start = limit;
            limit = offsets.limit;
        }
    }

    offsets.start = start;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */
Commit	Line	Data
b75a7d8f A	1	/*
b75a7d8f A	2	**********************************************************************
73c04bcf	3	* Copyright (C) 2001-2005, International Business Machines
b75a7d8f A	4	* Corporation and others. All Rights Reserved.
	5	**********************************************************************
	6	* Date Name Description
	7	* 07/03/01 aliu Creation.
	8	**********************************************************************
	9	*/
	10
	11	#include "unicode/utypes.h"
	12
	13	#if !UCONFIG_NO_TRANSLITERATION
	14
	15	#include "unicode/uniset.h"
	16	#include "unicode/uiter.h"
	17	#include "nortrans.h"
	18	#include "unormimp.h"
	19	#include "mutex.h"
	20	#include "ucln_in.h"
	21
	22	U_NAMESPACE_BEGIN
	23
374ca955	24	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
b75a7d8f A	25
	26	/**
	27	* System registration hook.
	28	*/
	29	void NormalizationTransliterator::registerIDs() {
	30	UErrorCode errorCode = U_ZERO_ERROR;
	31	if(!unorm_haveData(&errorCode)) {
	32	return;
	33	}
	34
	35	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
	36	_create, integerToken(UNORM_NFC));
	37	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
	38	_create, integerToken(UNORM_NFKC));
	39	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
	40	_create, integerToken(UNORM_NFD));
	41	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
	42	_create, integerToken(UNORM_NFKD));
	43	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
	44	UNICODE_STRING_SIMPLE("NFD"), TRUE);
	45	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
	46	UNICODE_STRING_SIMPLE("NFKD"), TRUE);
	47	}
	48
	49	/**
	50	* Factory methods
	51	*/
	52	Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
	53	Token context) {
	54	return new NormalizationTransliterator(ID, (UNormalizationMode) context.integer, 0);
	55	}
	56
	57	/**
	58	* Constructs a transliterator.
	59	*/
	60	NormalizationTransliterator::NormalizationTransliterator(
	61	const UnicodeString& id,
	62	UNormalizationMode mode, int32_t opt) :
	63	Transliterator(id, 0) {
	64	fMode = mode;
	65	options = opt;
	66	}
	67
	68	/**
	69	* Destructor.
	70	*/
	71	NormalizationTransliterator::~NormalizationTransliterator() {
	72	}
	73
	74	/**
	75	* Copy constructor.
	76	*/
	77	NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
	78	Transliterator(o) {
	79	fMode = o.fMode;
	80	options = o.options;
	81	}
	82
	83	/**
	84	* Assignment operator.
	85	*/
	86	NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) {
	87	Transliterator::operator=(o);
	88	fMode = o.fMode;
89	options = o.options;
90	return *this;
91	}
92
93	/**
94	* Transliterator API.
95	*/
96	Transliterator* NormalizationTransliterator::clone(void) const {
97	return new NormalizationTransliterator(*this);
98	}
99
100	/**
101	* Implements {@link Transliterator#handleTransliterate}.
102	*/
103	void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
104	UBool isIncremental) const {
105	// start and limit of the input range
106	int32_t start = offsets.start;
107	int32_t limit = offsets.limit;
108	int32_t length, delta;
109
110	if(start >= limit) {
111	return;
112	}
113
114	// a C code unit iterator, implemented around the Replaceable
115	UCharIterator iter;
116	uiter_setReplaceable(&iter, &text);
117
118	// the output string and buffer pointer
119	UnicodeString output;
120	UChar *buffer;
121	UBool neededToNormalize;
122
123	UErrorCode errorCode;
124
125	/*
126	* Normalize as short chunks at a time as possible even in
127	* bulk mode, so that styled text is minimally disrupted.
128	* In incremental mode, a chunk that ends with offsets.limit
129	* must not be normalized.
130	*
131	* If it was known that the input text is not styled, then
132	* a bulk mode normalization could look like this:
133	*
134
135	UChar staticChars[256];
136	UnicodeString input;
137
138	length = limit - start;
139	input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias
140
141	_Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
142	input.releaseBuffer(length);
143
144	UErrorCode status = U_ZERO_ERROR;
145	Normalizer::normalize(input, fMode, options, output, status);
146
147	text.handleReplaceBetween(start, limit, output);
148
149	int32_t delta = output.length() - length;
150	offsets.contextLimit += delta;
151	offsets.limit += delta;
152	offsets.start = limit + delta;
153
154	*
155	*/
156	while(start < limit) {
157	// set the iterator limits for the remaining input range
158	// this is a moving target because of the replacements in the text object
159	iter.start = iter.index = start;
160	iter.limit = limit;
161
162	// incrementally normalize a small chunk of the input
163	buffer = output.getBuffer(-1);
164	errorCode = U_ZERO_ERROR;
165	length = unorm_next(&iter, buffer, output.getCapacity(),
166	fMode, 0,
167	TRUE, &neededToNormalize,
168	&errorCode);
73c04bcf	169	output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
b75a7d8f A	170
	171	if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
	172	// use a larger output string buffer and do it again from the start
	173	iter.index = start;
	174	buffer = output.getBuffer(length);
	175	errorCode = U_ZERO_ERROR;
	176	length = unorm_next(&iter, buffer, output.getCapacity(),
	177	fMode, 0,
	178	TRUE, &neededToNormalize,
	179	&errorCode);
73c04bcf	180	output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
b75a7d8f A	181	}
	182
	183	if(U_FAILURE(errorCode)) {
	184	break;
	185	}
	186
	187	limit = iter.index;
	188	if(isIncremental && limit == iter.limit) {
	189	// stop in incremental mode when we reach the input limit
	190	// in case there are additional characters that could change the
	191	// normalization result
	192
	193	// UNLESS all characters in the result of the normalization of
	194	// the last run are in the skippable set
	195	const UChar *s=output.getBuffer();
	196	int32_t i=0, outLength=output.length();
	197	UChar32 c;
	198
	199	while(i<outLength) {
	200	U16_NEXT(s, i, outLength, c);
	201	if(!unorm_isNFSkippable(c, fMode)) {
	202	outLength=-1; // I wish C++ had labeled loops and break outer; ...
	203	break;
	204	}
	205	}
	206	if (outLength<0) {
	207	break;
	208	}
	209	}
	210
	211	if(neededToNormalize) {
	212	// replace the input chunk with its normalized form
	213	text.handleReplaceBetween(start, limit, output);
	214
	215	// update all necessary indexes accordingly
	216	delta = length - (limit - start); // length change in the text object
	217	start = limit += delta; // the next chunk starts where this one ends, with adjustment
	218	limit = offsets.limit += delta; // set the iteration limit to the adjusted end of the input range
	219	offsets.contextLimit += delta;
	220	} else {
	221	// delta == 0
	222	start = limit;
	223	limit = offsets.limit;
	224	}
	225	}
	226
	227	offsets.start = start;
	228	}
	229
	230	U_NAMESPACE_END
	231
	232	#endif /* #if !UCONFIG_NO_TRANSLITERATION */