git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	**********************************************************************
	3	* Copyright (C) 2001-2003, International Business Machines
	4	* Corporation and others. All Rights Reserved.
	5	**********************************************************************
	6	* Date Name Description
	7	* 07/03/01 aliu Creation.
	8	**********************************************************************
	9	*/
	10
	11	#include "unicode/utypes.h"
	12
	13	#if !UCONFIG_NO_TRANSLITERATION
	14
	15	#include "unicode/uniset.h"
	16	#include "unicode/uiter.h"
	17	#include "nortrans.h"
	18	#include "unormimp.h"
	19	#include "mutex.h"
	20	#include "ucln_in.h"
	21
	22	U_NAMESPACE_BEGIN
	23
	24	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
	25
	26	/**
	27	* System registration hook.
	28	*/
	29	void NormalizationTransliterator::registerIDs() {
	30	UErrorCode errorCode = U_ZERO_ERROR;
	31	if(!unorm_haveData(&errorCode)) {
	32	return;
	33	}
	34
	35	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
	36	_create, integerToken(UNORM_NFC));
	37	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
	38	_create, integerToken(UNORM_NFKC));
	39	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
	40	_create, integerToken(UNORM_NFD));
	41	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
	42	_create, integerToken(UNORM_NFKD));
	43	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
	44	UNICODE_STRING_SIMPLE("NFD"), TRUE);
	45	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
	46	UNICODE_STRING_SIMPLE("NFKD"), TRUE);
	47	}
	48
	49	/**
	50	* Factory methods
	51	*/
	52	Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
	53	Token context) {
	54	return new NormalizationTransliterator(ID, (UNormalizationMode) context.integer, 0);
	55	}
	56
	57	/**
	58	* Constructs a transliterator.
	59	*/
	60	NormalizationTransliterator::NormalizationTransliterator(
	61	const UnicodeString& id,
	62	UNormalizationMode mode, int32_t opt) :
	63	Transliterator(id, 0) {
	64	fMode = mode;
	65	options = opt;
	66	}
	67
	68	/**
	69	* Destructor.
	70	*/
	71	NormalizationTransliterator::~NormalizationTransliterator() {
	72	}
	73
	74	/**
	75	* Copy constructor.
	76	*/
	77	NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
	78	Transliterator(o) {
	79	fMode = o.fMode;
	80	options = o.options;
	81	}
	82
	83	/**
	84	* Assignment operator.
	85	*/
	86	NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) {
	87	Transliterator::operator=(o);
	88	fMode = o.fMode;
	89	options = o.options;
	90	return *this;
	91	}
	92
	93	/**
	94	* Transliterator API.
	95	*/
	96	Transliterator* NormalizationTransliterator::clone(void) const {
	97	return new NormalizationTransliterator(*this);
	98	}
	99
	100	/**
	101	* Implements {@link Transliterator#handleTransliterate}.
	102	*/
	103	void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
	104	UBool isIncremental) const {
	105	// start and limit of the input range
	106	int32_t start = offsets.start;
	107	int32_t limit = offsets.limit;
	108	int32_t length, delta;
	109
	110	if(start >= limit) {
	111	return;
	112	}
	113
	114	// a C code unit iterator, implemented around the Replaceable
	115	UCharIterator iter;
	116	uiter_setReplaceable(&iter, &text);
	117
	118	// the output string and buffer pointer
	119	UnicodeString output;
	120	UChar *buffer;
	121	UBool neededToNormalize;
	122
	123	UErrorCode errorCode;
	124
	125	/*
	126	* Normalize as short chunks at a time as possible even in
	127	* bulk mode, so that styled text is minimally disrupted.
	128	* In incremental mode, a chunk that ends with offsets.limit
	129	* must not be normalized.
	130	*
	131	* If it was known that the input text is not styled, then
	132	* a bulk mode normalization could look like this:
	133	*
	134
	135	UChar staticChars[256];
	136	UnicodeString input;
	137
	138	length = limit - start;
	139	input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias
	140
	141	_Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
	142	input.releaseBuffer(length);
	143
	144	UErrorCode status = U_ZERO_ERROR;
	145	Normalizer::normalize(input, fMode, options, output, status);
	146
	147	text.handleReplaceBetween(start, limit, output);
	148
	149	int32_t delta = output.length() - length;
	150	offsets.contextLimit += delta;
	151	offsets.limit += delta;
	152	offsets.start = limit + delta;
	153
	154	*
	155	*/
	156	while(start < limit) {
	157	// set the iterator limits for the remaining input range
	158	// this is a moving target because of the replacements in the text object
	159	iter.start = iter.index = start;
	160	iter.limit = limit;
	161
	162	// incrementally normalize a small chunk of the input
	163	buffer = output.getBuffer(-1);
	164	errorCode = U_ZERO_ERROR;
	165	length = unorm_next(&iter, buffer, output.getCapacity(),
	166	fMode, 0,
	167	TRUE, &neededToNormalize,
	168	&errorCode);
	169	output.releaseBuffer(length);
	170
	171	if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
	172	// use a larger output string buffer and do it again from the start
	173	iter.index = start;
	174	buffer = output.getBuffer(length);
	175	errorCode = U_ZERO_ERROR;
	176	length = unorm_next(&iter, buffer, output.getCapacity(),
	177	fMode, 0,
	178	TRUE, &neededToNormalize,
	179	&errorCode);
	180	output.releaseBuffer(length);
	181	}
	182
	183	if(U_FAILURE(errorCode)) {
	184	break;
	185	}
	186
	187	limit = iter.index;
	188	if(isIncremental && limit == iter.limit) {
	189	// stop in incremental mode when we reach the input limit
	190	// in case there are additional characters that could change the
	191	// normalization result
	192
	193	// UNLESS all characters in the result of the normalization of
	194	// the last run are in the skippable set
	195	const UChar *s=output.getBuffer();
	196	int32_t i=0, outLength=output.length();
	197	UChar32 c;
	198
	199	while(i<outLength) {
	200	U16_NEXT(s, i, outLength, c);
	201	if(!unorm_isNFSkippable(c, fMode)) {
	202	outLength=-1; // I wish C++ had labeled loops and break outer; ...
	203	break;
	204	}
	205	}
	206	if (outLength<0) {
	207	break;
	208	}
	209	}
	210
	211	if(neededToNormalize) {
	212	// replace the input chunk with its normalized form
	213	text.handleReplaceBetween(start, limit, output);
	214
	215	// update all necessary indexes accordingly
	216	delta = length - (limit - start); // length change in the text object
	217	start = limit += delta; // the next chunk starts where this one ends, with adjustment
	218	limit = offsets.limit += delta; // set the iteration limit to the adjusted end of the input range
	219	offsets.contextLimit += delta;
	220	} else {
	221	// delta == 0
	222	start = limit;
	223	limit = offsets.limit;
	224	}
	225	}
	226
	227	offsets.start = start;
	228	}
	229
	230	U_NAMESPACE_END
	231
	232	#endif /* #if !UCONFIG_NO_TRANSLITERATION */