git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	**********************************************************************
	3	* Copyright (C) 2001-2011, International Business Machines
	4	* Corporation and others. All Rights Reserved.
	5	**********************************************************************
	6	* Date Name Description
	7	* 07/03/01 aliu Creation.
	8	**********************************************************************
	9	*/
	10
	11	#include "unicode/utypes.h"
	12
	13	#if !UCONFIG_NO_TRANSLITERATION
	14
	15	#include "unicode/normalizer2.h"
	16	#include "unicode/utf16.h"
	17	#include "cstring.h"
	18	#include "nortrans.h"
	19
	20	U_NAMESPACE_BEGIN
	21
	22	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
	23
	24	static inline Transliterator::Token cstrToken(const char *s) {
	25	return Transliterator::pointerToken((void *)s);
	26	}
	27
	28	/**
	29	* System registration hook.
	30	*/
	31	void NormalizationTransliterator::registerIDs() {
	32	// In the Token, the byte after the NUL is the UNormalization2Mode.
	33	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
	34	_create, cstrToken("nfc\0\0"));
	35	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
	36	_create, cstrToken("nfkc\0\0"));
	37	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
	38	_create, cstrToken("nfc\0\1"));
	39	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
	40	_create, cstrToken("nfkc\0\1"));
	41	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
	42	_create, cstrToken("nfc\0\2"));
	43	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
	44	_create, cstrToken("nfc\0\3"));
	45	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
	46	UNICODE_STRING_SIMPLE("NFD"), TRUE);
	47	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
	48	UNICODE_STRING_SIMPLE("NFKD"), TRUE);
	49	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
	50	UNICODE_STRING_SIMPLE("NFD"), FALSE);
	51	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
	52	UNICODE_STRING_SIMPLE("FCD"), FALSE);
	53	}
	54
	55	/**
	56	* Factory methods
	57	*/
	58	Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
	59	Token context) {
	60	const char name = (const char )context.pointer;
	61	UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
	62	UErrorCode errorCode = U_ZERO_ERROR;
	63	const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
	64	if(U_SUCCESS(errorCode)) {
	65	return new NormalizationTransliterator(ID, *norm2);
	66	} else {
	67	return NULL;
	68	}
	69	}
	70
	71	/**
	72	* Constructs a transliterator.
	73	*/
	74	NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
	75	const Normalizer2 &norm2) :
	76	Transliterator(id, 0), fNorm2(norm2) {}
	77
	78	/**
	79	* Destructor.
	80	*/
	81	NormalizationTransliterator::~NormalizationTransliterator() {
	82	}
	83
	84	/**
	85	* Copy constructor.
	86	*/
	87	NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
	88	Transliterator(o), fNorm2(o.fNorm2) {}
	89
	90	/**
	91	* Transliterator API.
	92	*/
	93	Transliterator* NormalizationTransliterator::clone(void) const {
	94	return new NormalizationTransliterator(*this);
	95	}
	96
	97	/**
	98	* Implements {@link Transliterator#handleTransliterate}.
	99	*/
	100	void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
	101	UBool isIncremental) const {
	102	// start and limit of the input range
	103	int32_t start = offsets.start;
	104	int32_t limit = offsets.limit;
	105	if(start >= limit) {
	106	return;
	107	}
	108
	109	/*
	110	* Normalize as short chunks at a time as possible even in
	111	* bulk mode, so that styled text is minimally disrupted.
	112	* In incremental mode, a chunk that ends with offsets.limit
	113	* must not be normalized.
	114	*
	115	* If it was known that the input text is not styled, then
	116	* a bulk mode normalization could look like this:
	117
	118	UnicodeString input, normalized;
	119	int32_t length = limit - start;
	120	_Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
	121	input.releaseBuffer(length);
	122
	123	UErrorCode status = U_ZERO_ERROR;
	124	fNorm2.normalize(input, normalized, status);
	125
	126	text.handleReplaceBetween(start, limit, normalized);
	127
	128	int32_t delta = normalized.length() - length;
	129	offsets.contextLimit += delta;
	130	offsets.limit += delta;
	131	offsets.start = limit + delta;
	132
	133	*/
	134	UErrorCode errorCode = U_ZERO_ERROR;
	135	UnicodeString segment;
	136	UnicodeString normalized;
	137	UChar32 c = text.char32At(start);
	138	do {
	139	int32_t prev = start;
	140	// Skip at least one character so we make progress.
	141	// c holds the character at start.
	142	segment.remove();
	143	do {
	144	segment.append(c);
	145	start += U16_LENGTH(c);
	146	} while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
	147	if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
	148	// stop in incremental mode when we reach the input limit
	149	// in case there are additional characters that could change the
	150	// normalization result
	151	start=prev;
	152	break;
	153	}
	154	fNorm2.normalize(segment, normalized, errorCode);
	155	if(U_FAILURE(errorCode)) {
	156	break;
	157	}
	158	if(segment != normalized) {
	159	// replace the input chunk with its normalized form
	160	text.handleReplaceBetween(prev, start, normalized);
	161
	162	// update all necessary indexes accordingly
	163	int32_t delta = normalized.length() - (start - prev);
	164	start += delta;
	165	limit += delta;
	166	}
	167	} while(start < limit);
	168
	169	offsets.start = start;
	170	offsets.contextLimit += limit - offsets.limit;
	171	offsets.limit = limit;
	172	}
	173
	174	U_NAMESPACE_END
	175
	176	#endif /* #if !UCONFIG_NO_TRANSLITERATION */