git.saurik.com Git - apple/icu.git/blame - icuSources/common/normlzr.cpp

Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f A	3	/*
	4	*************************************************************************
	5	* COPYRIGHT:
51004dcb	6	* Copyright (c) 1996-2012, International Business Machines Corporation and
b75a7d8f A	7	* others. All Rights Reserved.
	8	*************************************************************************
	9	*/
	10
	11	#include "unicode/utypes.h"
	12
	13	#if !UCONFIG_NO_NORMALIZATION
	14
729e4ab9	15	#include "unicode/uniset.h"
b75a7d8f A	16	#include "unicode/unistr.h"
	17	#include "unicode/chariter.h"
	18	#include "unicode/schriter.h"
	19	#include "unicode/uchriter.h"
b75a7d8f	20	#include "unicode/normlzr.h"
4388f060	21	#include "unicode/utf16.h"
b75a7d8f	22	#include "cmemory.h"
729e4ab9 A	23	#include "normalizer2impl.h"
729e4ab9 A	24	#include "uprops.h" // for uniset_getUnicode32Instance()
b75a7d8f	25
3d1f044b	26	#if defined(move32)
f3c0d7a5 A	27	// System can define move32 intrinsics, but the char iters define move32 method
	28	// using same undef trick in headers, so undef here to re-enable the method.
	29	#undef move32
	30	#endif
	31
b75a7d8f A	32	U_NAMESPACE_BEGIN
b75a7d8f A	33
374ca955	34	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
b75a7d8f A	35
	36	//-------------------------------------------------------------------------
	37	// Constructors and other boilerplate
	38	//-------------------------------------------------------------------------
	39
	40	Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
729e4ab9 A	41	UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
729e4ab9 A	42	text(new StringCharacterIterator(str)),
b75a7d8f A	43	currentIndex(0), nextIndex(0),
	44	buffer(), bufferPos(0)
	45	{
729e4ab9	46	init();
b75a7d8f A	47	}
b75a7d8f A	48
f3c0d7a5	49	Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode) :
729e4ab9 A	50	UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
729e4ab9 A	51	text(new UCharCharacterIterator(str, length)),
b75a7d8f A	52	currentIndex(0), nextIndex(0),
	53	buffer(), bufferPos(0)
	54	{
729e4ab9	55	init();
b75a7d8f A	56	}
	57
	58	Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
729e4ab9 A	59	UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
729e4ab9 A	60	text(iter.clone()),
b75a7d8f A	61	currentIndex(0), nextIndex(0),
	62	buffer(), bufferPos(0)
	63	{
729e4ab9	64	init();
b75a7d8f A	65	}
	66
	67	Normalizer::Normalizer(const Normalizer &copy) :
729e4ab9 A	68	UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
729e4ab9 A	69	text(copy.text->clone()),
b75a7d8f A	70	currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
	71	buffer(copy.buffer), bufferPos(copy.bufferPos)
	72	{
729e4ab9	73	init();
b75a7d8f A	74	}
b75a7d8f A	75
b75a7d8f	76	void
729e4ab9	77	Normalizer::init() {
b75a7d8f	78	UErrorCode errorCode=U_ZERO_ERROR;
729e4ab9 A	79	fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
	80	if(fOptions&UNORM_UNICODE_3_2) {
	81	delete fFilteredNorm2;
	82	fNorm2=fFilteredNorm2=
	83	new FilteredNormalizer2(fNorm2, uniset_getUnicode32Instance(errorCode));
	84	}
	85	if(U_FAILURE(errorCode)) {
	86	errorCode=U_ZERO_ERROR;
	87	fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
b75a7d8f A	88	}
	89	}
	90
	91	Normalizer::~Normalizer()
	92	{
729e4ab9 A	93	delete fFilteredNorm2;
729e4ab9 A	94	delete text;
b75a7d8f A	95	}
	96
	97	Normalizer*
	98	Normalizer::clone() const
	99	{
729e4ab9	100	return new Normalizer(*this);
b75a7d8f A	101	}
	102
	103	/**
	104	* Generates a hash code for this iterator.
	105	*/
	106	int32_t Normalizer::hashCode() const
	107	{
729e4ab9	108	return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
b75a7d8f A	109	}
	110
	111	UBool Normalizer::operator==(const Normalizer& that) const
	112	{
	113	return
	114	this==&that \|\|
729e4ab9	115	(fUMode==that.fUMode &&
b75a7d8f	116	fOptions==that.fOptions &&
729e4ab9	117	text==that.text &&
b75a7d8f A	118	buffer==that.buffer &&
b75a7d8f A	119	bufferPos==that.bufferPos &&
729e4ab9	120	nextIndex==that.nextIndex);
b75a7d8f A	121	}
	122
	123	//-------------------------------------------------------------------------
	124	// Static utility methods
	125	//-------------------------------------------------------------------------
	126
374ca955	127	void U_EXPORT2
b75a7d8f A	128	Normalizer::normalize(const UnicodeString& source,
	129	UNormalizationMode mode, int32_t options,
	130	UnicodeString& result,
	131	UErrorCode &status) {
	132	if(source.isBogus() \|\| U_FAILURE(status)) {
	133	result.setToBogus();
	134	if(U_SUCCESS(status)) {
	135	status=U_ILLEGAL_ARGUMENT_ERROR;
	136	}
	137	} else {
	138	UnicodeString localDest;
	139	UnicodeString *dest;
	140
	141	if(&source!=&result) {
	142	dest=&result;
	143	} else {
	144	// the source and result strings are the same object, use a temporary one
	145	dest=&localDest;
	146	}
729e4ab9 A	147	const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
	148	if(U_SUCCESS(status)) {
	149	if(options&UNORM_UNICODE_3_2) {
	150	FilteredNormalizer2(n2, uniset_getUnicode32Instance(status)).
	151	normalize(source, *dest, status);
	152	} else {
	153	n2->normalize(source, *dest, status);
	154	}
b75a7d8f	155	}
729e4ab9	156	if(dest==&localDest && U_SUCCESS(status)) {
b75a7d8f A	157	result=*dest;
b75a7d8f A	158	}
b75a7d8f A	159	}
	160	}
	161
374ca955	162	void U_EXPORT2
b75a7d8f A	163	Normalizer::compose(const UnicodeString& source,
	164	UBool compat, int32_t options,
	165	UnicodeString& result,
	166	UErrorCode &status) {
729e4ab9	167	normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
b75a7d8f A	168	}
b75a7d8f A	169
374ca955	170	void U_EXPORT2
b75a7d8f A	171	Normalizer::decompose(const UnicodeString& source,
	172	UBool compat, int32_t options,
	173	UnicodeString& result,
	174	UErrorCode &status) {
729e4ab9 A	175	normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
	176	}
	177
	178	UNormalizationCheckResult
	179	Normalizer::quickCheck(const UnicodeString& source,
	180	UNormalizationMode mode, int32_t options,
	181	UErrorCode &status) {
	182	const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
	183	if(U_SUCCESS(status)) {
	184	if(options&UNORM_UNICODE_3_2) {
	185	return FilteredNormalizer2(n2, uniset_getUnicode32Instance(status)).
	186	quickCheck(source, status);
	187	} else {
	188	return n2->quickCheck(source, status);
b75a7d8f A	189	}
b75a7d8f A	190	} else {
729e4ab9 A	191	return UNORM_MAYBE;
	192	}
	193	}
b75a7d8f	194
729e4ab9 A	195	UBool
	196	Normalizer::isNormalized(const UnicodeString& source,
	197	UNormalizationMode mode, int32_t options,
	198	UErrorCode &status) {
	199	const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
	200	if(U_SUCCESS(status)) {
	201	if(options&UNORM_UNICODE_3_2) {
	202	return FilteredNormalizer2(n2, uniset_getUnicode32Instance(status)).
	203	isNormalized(source, status);
b75a7d8f	204	} else {
729e4ab9	205	return n2->isNormalized(source, status);
b75a7d8f	206	}
729e4ab9 A	207	} else {
729e4ab9 A	208	return FALSE;
b75a7d8f A	209	}
	210	}
	211
374ca955	212	UnicodeString & U_EXPORT2
4388f060	213	Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
b75a7d8f A	214	UnicodeString &result,
	215	UNormalizationMode mode, int32_t options,
	216	UErrorCode &errorCode) {
	217	if(left.isBogus() \|\| right.isBogus() \|\| U_FAILURE(errorCode)) {
	218	result.setToBogus();
	219	if(U_SUCCESS(errorCode)) {
	220	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
	221	}
	222	} else {
	223	UnicodeString localDest;
	224	UnicodeString *dest;
	225
729e4ab9	226	if(&right!=&result) {
b75a7d8f A	227	dest=&result;
b75a7d8f A	228	} else {
729e4ab9	229	// the right and result strings are the same object, use a temporary one
b75a7d8f A	230	dest=&localDest;
b75a7d8f A	231	}
729e4ab9 A	232	*dest=left;
	233	const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
	234	if(U_SUCCESS(errorCode)) {
	235	if(options&UNORM_UNICODE_3_2) {
	236	FilteredNormalizer2(n2, uniset_getUnicode32Instance(errorCode)).
	237	append(*dest, right, errorCode);
	238	} else {
	239	n2->append(*dest, right, errorCode);
	240	}
b75a7d8f	241	}
729e4ab9	242	if(dest==&localDest && U_SUCCESS(errorCode)) {
b75a7d8f A	243	result=*dest;
b75a7d8f A	244	}
b75a7d8f A	245	}
	246	return result;
	247	}
	248
	249	//-------------------------------------------------------------------------
	250	// Iteration API
	251	//-------------------------------------------------------------------------
	252
	253	/**
	254	* Return the current character in the normalized text.
	255	*/
	256	UChar32 Normalizer::current() {
	257	if(bufferPos<buffer.length() \|\| nextNormalize()) {
	258	return buffer.char32At(bufferPos);
	259	} else {
	260	return DONE;
	261	}
	262	}
	263
	264	/**
	265	* Return the next character in the normalized text and advance
	266	* the iteration position by one. If the end
	267	* of the text has already been reached, {@link #DONE} is returned.
	268	*/
	269	UChar32 Normalizer::next() {
	270	if(bufferPos<buffer.length() \|\| nextNormalize()) {
	271	UChar32 c=buffer.char32At(bufferPos);
4388f060	272	bufferPos+=U16_LENGTH(c);
b75a7d8f A	273	return c;
	274	} else {
	275	return DONE;
	276	}
	277	}
	278
	279	/**
	280	* Return the previous character in the normalized text and decrement
	281	* the iteration position by one. If the beginning
	282	* of the text has already been reached, {@link #DONE} is returned.
	283	*/
	284	UChar32 Normalizer::previous() {
	285	if(bufferPos>0 \|\| previousNormalize()) {
	286	UChar32 c=buffer.char32At(bufferPos-1);
4388f060	287	bufferPos-=U16_LENGTH(c);
b75a7d8f A	288	return c;
	289	} else {
	290	return DONE;
	291	}
	292	}
	293
	294	void Normalizer::reset() {
729e4ab9	295	currentIndex=nextIndex=text->setToStart();
b75a7d8f A	296	clearBuffer();
	297	}
	298
	299	void
	300	Normalizer::setIndexOnly(int32_t index) {
729e4ab9 A	301	text->setIndex(index); // pins index
729e4ab9 A	302	currentIndex=nextIndex=text->getIndex();
b75a7d8f A	303	clearBuffer();
	304	}
	305
	306	/**
729e4ab9 A	307	* Return the first character in the normalized text. This resets
729e4ab9 A	308	* the <tt>Normalizer's</tt> position to the beginning of the text.
b75a7d8f A	309	*/
	310	UChar32 Normalizer::first() {
	311	reset();
	312	return next();
	313	}
	314
	315	/**
729e4ab9	316	* Return the last character in the normalized text. This resets
b75a7d8f A	317	* the <tt>Normalizer's</tt> position to be just before the
	318	* the input text corresponding to that normalized character.
	319	*/
	320	UChar32 Normalizer::last() {
729e4ab9	321	currentIndex=nextIndex=text->setToEnd();
b75a7d8f A	322	clearBuffer();
	323	return previous();
	324	}
	325
	326	/**
	327	* Retrieve the current iteration position in the input text that is
	328	* being normalized. This method is useful in applications such as
	329	* searching, where you need to be able to determine the position in
	330	* the input text that corresponds to a given normalized output character.
	331	* <p>
	332	* <b>Note:</b> This method sets the position in the <em>input</em>, while
	333	* {@link #next} and {@link #previous} iterate through characters in the
	334	* <em>output</em>. This means that there is not necessarily a one-to-one
	335	* correspondence between characters returned by <tt>next</tt> and
	336	* <tt>previous</tt> and the indices passed to and returned from
	337	* <tt>setIndex</tt> and {@link #getIndex}.
	338	*
	339	*/
	340	int32_t Normalizer::getIndex() const {
	341	if(bufferPos<buffer.length()) {
	342	return currentIndex;
	343	} else {
	344	return nextIndex;
	345	}
	346	}
	347
	348	/**
729e4ab9	349	* Retrieve the index of the start of the input text. This is the begin index
b75a7d8f A	350	* of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
	351	* over which this <tt>Normalizer</tt> is iterating
	352	*/
	353	int32_t Normalizer::startIndex() const {
729e4ab9	354	return text->startIndex();
b75a7d8f A	355	}
	356
	357	/**
729e4ab9	358	* Retrieve the index of the end of the input text. This is the end index
b75a7d8f A	359	* of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
	360	* over which this <tt>Normalizer</tt> is iterating
	361	*/
	362	int32_t Normalizer::endIndex() const {
729e4ab9	363	return text->endIndex();
b75a7d8f A	364	}
	365
	366	//-------------------------------------------------------------------------
	367	// Property access methods
	368	//-------------------------------------------------------------------------
	369
	370	void
	371	Normalizer::setMode(UNormalizationMode newMode)
	372	{
	373	fUMode = newMode;
729e4ab9	374	init();
b75a7d8f A	375	}
	376
	377	UNormalizationMode
	378	Normalizer::getUMode() const
	379	{
	380	return fUMode;
	381	}
	382
	383	void
	384	Normalizer::setOption(int32_t option,
	385	UBool value)
	386	{
	387	if (value) {
	388	fOptions \|= option;
	389	} else {
	390	fOptions &= (~option);
	391	}
729e4ab9	392	init();
b75a7d8f A	393	}
	394
	395	UBool
	396	Normalizer::getOption(int32_t option) const
	397	{
	398	return (fOptions & option) != 0;
	399	}
	400
	401	/**
	402	* Set the input text over which this <tt>Normalizer</tt> will iterate.
729e4ab9	403	* The iteration position is set to the beginning of the input text.
b75a7d8f A	404	*/
	405	void
	406	Normalizer::setText(const UnicodeString& newText,
	407	UErrorCode &status)
	408	{
	409	if (U_FAILURE(status)) {
	410	return;
	411	}
	412	CharacterIterator *newIter = new StringCharacterIterator(newText);
	413	if (newIter == NULL) {
	414	status = U_MEMORY_ALLOCATION_ERROR;
	415	return;
	416	}
729e4ab9 A	417	delete text;
729e4ab9 A	418	text = newIter;
b75a7d8f A	419	reset();
	420	}
	421
	422	/**
	423	* Set the input text over which this <tt>Normalizer</tt> will iterate.
	424	* The iteration position is set to the beginning of the string.
	425	*/
	426	void
	427	Normalizer::setText(const CharacterIterator& newText,
	428	UErrorCode &status)
	429	{
	430	if (U_FAILURE(status)) {
	431	return;
	432	}
	433	CharacterIterator *newIter = newText.clone();
	434	if (newIter == NULL) {
	435	status = U_MEMORY_ALLOCATION_ERROR;
	436	return;
	437	}
729e4ab9 A	438	delete text;
729e4ab9 A	439	text = newIter;
b75a7d8f A	440	reset();
	441	}
	442
	443	void
f3c0d7a5	444	Normalizer::setText(ConstChar16Ptr newText,
b75a7d8f A	445	int32_t length,
	446	UErrorCode &status)
	447	{
	448	if (U_FAILURE(status)) {
	449	return;
	450	}
	451	CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
	452	if (newIter == NULL) {
	453	status = U_MEMORY_ALLOCATION_ERROR;
	454	return;
	455	}
729e4ab9 A	456	delete text;
729e4ab9 A	457	text = newIter;
b75a7d8f A	458	reset();
	459	}
	460
	461	/**
	462	* Copies the text under iteration into the UnicodeString referred to by "result".
	463	* @param result Receives a copy of the text under iteration.
	464	*/
	465	void
	466	Normalizer::getText(UnicodeString& result)
	467	{
729e4ab9	468	text->getText(result);
b75a7d8f A	469	}
	470
	471	//-------------------------------------------------------------------------
	472	// Private utility methods
	473	//-------------------------------------------------------------------------
	474
	475	void Normalizer::clearBuffer() {
	476	buffer.remove();
	477	bufferPos=0;
	478	}
	479
	480	UBool
	481	Normalizer::nextNormalize() {
b75a7d8f A	482	clearBuffer();
b75a7d8f A	483	currentIndex=nextIndex;
729e4ab9 A	484	text->setIndex(nextIndex);
729e4ab9 A	485	if(!text->hasNext()) {
b75a7d8f A	486	return FALSE;
b75a7d8f A	487	}
729e4ab9 A	488	// Skip at least one character so we make progress.
	489	UnicodeString segment(text->next32PostInc());
	490	while(text->hasNext()) {
	491	UChar32 c;
	492	if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
	493	text->move32(-1, CharacterIterator::kCurrent);
	494	break;
	495	}
	496	segment.append(c);
b75a7d8f	497	}
729e4ab9 A	498	nextIndex=text->getIndex();
	499	UErrorCode errorCode=U_ZERO_ERROR;
	500	fNorm2->normalize(segment, buffer, errorCode);
b75a7d8f A	501	return U_SUCCESS(errorCode) && !buffer.isEmpty();
	502	}
	503
	504	UBool
	505	Normalizer::previousNormalize() {
b75a7d8f A	506	clearBuffer();
b75a7d8f A	507	nextIndex=currentIndex;
729e4ab9 A	508	text->setIndex(currentIndex);
729e4ab9 A	509	if(!text->hasPrevious()) {
b75a7d8f A	510	return FALSE;
b75a7d8f A	511	}
729e4ab9 A	512	UnicodeString segment;
	513	while(text->hasPrevious()) {
	514	UChar32 c=text->previous32();
	515	segment.insert(0, c);
	516	if(fNorm2->hasBoundaryBefore(c)) {
	517	break;
	518	}
b75a7d8f	519	}
729e4ab9 A	520	currentIndex=text->getIndex();
	521	UErrorCode errorCode=U_ZERO_ERROR;
	522	fNorm2->normalize(segment, buffer, errorCode);
b75a7d8f	523	bufferPos=buffer.length();
b75a7d8f A	524	return U_SUCCESS(errorCode) && !buffer.isEmpty();
	525	}
	526
	527	U_NAMESPACE_END
	528
	529	#endif /* #if !UCONFIG_NO_NORMALIZATION */