[apple/icu.git] / icuSources / common / filterednormalizer2.cpp

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2009-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  filterednormalizer2.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2009dec10
*   created by: Markus W. Scherer
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_NORMALIZATION

#include "unicode/normalizer2.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/unorm.h"
#include "cpputils.h"

U_NAMESPACE_BEGIN

// Out-of-line destructor. The body is empty: no explicit cleanup is performed here.
FilteredNormalizer2::~FilteredNormalizer2() {
}

// Public entry point: replaces the contents of dest with the filtered
// normalization of src.
//
// - If errorCode already indicates a failure, or uprv_checkCanGetBuffer()
//   reports one for src, dest is set to bogus.
// - src and dest must be distinct objects; otherwise errorCode is set to
//   U_ILLEGAL_ARGUMENT_ERROR and dest is left untouched.
// - Otherwise dest is emptied and filled by the internal, appending overload,
//   starting with USET_SPAN_SIMPLE.
//
// Always returns dest.
UnicodeString &
FilteredNormalizer2::normalize(const UnicodeString &src,
                               UnicodeString &dest,
                               UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(src, errorCode);
    if(U_FAILURE(errorCode)) {
        dest.setToBogus();
    } else if(&src==&dest) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    } else {
        dest.remove();
        normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
    }
    return dest;
}

// Internal worker: performs no argument checking and APPENDS to dest.
//
// src is walked as alternating spans, as reported by set.span():
//   - spans taken with USET_SPAN_NOT_CONTAINED are copied to dest verbatim;
//   - spans taken with any other condition are normalized with norm2 and the
//     result is appended.
// The condition flips after every span, including empty ones.
//
// The caller passes in the spanCondition that is likely to yield a non-zero
// span length at the start of src. For set=[:age=3.2:], since almost all
// common characters were in Unicode 3.2, USET_SPAN_SIMPLE should be passed
// for the start of src, and USET_SPAN_NOT_CONTAINED when continuing after
// an in-filter prefix.
//
// Stops at the first failure reported by norm2.normalize(). Returns dest.
UnicodeString &
FilteredNormalizer2::normalize(const UnicodeString &src,
                               UnicodeString &dest,
                               USetSpanCondition spanCondition,
                               UErrorCode &errorCode) const {
    // Scratch output reused for every in-filter span so that its buffer
    // is not thrown away between iterations.
    UnicodeString normalizedSpan;
    int32_t start=0;
    while(start<src.length()) {
        const int32_t limit=set.span(src, start, spanCondition);
        const UBool spanIsEmpty= limit==start;
        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
            // Outside the filter: pass the text through unchanged.
            if(!spanIsEmpty) {
                dest.append(src, start, limit-start);
            }
            spanCondition=USET_SPAN_SIMPLE;
        } else {
            // Inside the filter: normalize this span in isolation.
            // norm2.normalizeSecondAndAppend() is deliberately not used here
            // because it could modify the non-filter text already in dest.
            if(!spanIsEmpty) {
                dest.append(norm2.normalize(src.tempSubStringBetween(start, limit),
                                            normalizedSpan, errorCode));
                if(U_FAILURE(errorCode)) {
                    break;
                }
            }
            spanCondition=USET_SPAN_NOT_CONTAINED;
        }
        start=limit;
    }
    return dest;
}

// Public three-argument form: forwards to the four-argument overload with
// doNormalize=TRUE, i.e. `second` is normalized while being appended to `first`.
UnicodeString &
FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
                                              const UnicodeString &second,
                                              UErrorCode &errorCode) const {
    const UBool doNormalize=TRUE;
    return normalizeSecondAndAppend(first, second, doNormalize, errorCode);
}

// Appends `second` to `first` by forwarding to the four-argument
// normalizeSecondAndAppend() with doNormalize=FALSE.
UnicodeString &
FilteredNormalizer2::append(UnicodeString &first,
                            const UnicodeString &second,
                            UErrorCode &errorCode) const {
    const UBool doNormalize=FALSE;
    return normalizeSecondAndAppend(first, second, doNormalize, errorCode);
}

// Shared implementation behind normalizeSecondAndAppend() (doNormalize=TRUE)
// and append() (doNormalize=FALSE).
//
// Steps:
//   1. Argument checks: both strings go through uprv_checkCanGetBuffer(), and
//      first/second must be distinct objects (else U_ILLEGAL_ARGUMENT_ERROR).
//   2. If first is empty, the result is simply normalize(second) or a copy
//      of second.
//   3. Otherwise the in-filter suffix of first is merged with the in-filter
//      prefix of second through norm2, so that only in-filter text of first
//      is handed to norm2.
//   4. Whatever remains of second after its in-filter prefix is appended,
//      either via the internal normalize() (starting out-of-filter) or as-is.
//
// Always returns first.
UnicodeString &
FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
                                              const UnicodeString &second,
                                              UBool doNormalize,
                                              UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(first, errorCode);
    uprv_checkCanGetBuffer(second, errorCode);
    if(U_FAILURE(errorCode)) {
        return first;
    }
    if(&second==&first) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return first;
    }

    // Nothing to merge with: the result depends on second alone.
    if(first.isEmpty()) {
        if(!doNormalize) {
            first=second;
            return first;
        }
        return normalize(second, first, errorCode);
    }

    // Merge the in-filter suffix of first with the in-filter prefix of second.
    const int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
    if(prefixLimit!=0) {
        UnicodeString inFilterPrefix(second.tempSubString(0, prefixLimit));
        const int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
        if(suffixStart!=0) {
            // first begins with text that does not belong to its in-filter
            // suffix: work on a copy of the suffix only, then splice the
            // merged text back in place of that suffix.
            UnicodeString inFilterSuffix(first, suffixStart, INT32_MAX);
            if(doNormalize) {
                norm2.normalizeSecondAndAppend(inFilterSuffix, inFilterPrefix, errorCode);
            } else {
                norm2.append(inFilterSuffix, inFilterPrefix, errorCode);
            }
            first.replace(suffixStart, INT32_MAX, inFilterSuffix);
        } else {
            // All of first is in-filter: norm2 can operate on it directly.
            if(doNormalize) {
                norm2.normalizeSecondAndAppend(first, inFilterPrefix, errorCode);
            } else {
                norm2.append(first, inFilterPrefix, errorCode);
            }
        }
    }

    // Was second entirely consumed by the merge above?
    if(prefixLimit>=second.length()) {
        return first;
    }

    // The remainder of second starts right after its in-filter prefix,
    // hence USET_SPAN_NOT_CONTAINED as the initial span condition.
    UnicodeString remainder(second.tempSubString(prefixLimit, INT32_MAX));
    if(doNormalize) {
        normalize(remainder, first, USET_SPAN_NOT_CONTAINED, errorCode);
    } else {
        first.append(remainder);
    }
    return first;
}

// Returns norm2's decomposition of c, but only when c is in the filter set.
// For a code point outside the set, FALSE is returned and norm2 is not
// consulted, so `decomposition` is not touched by this function.
UBool
FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
    if(!set.contains(c)) {
        return FALSE;
    }
    return norm2.getDecomposition(c, decomposition) ? TRUE : FALSE;
}

// Returns norm2's raw decomposition of c, but only when c is in the filter
// set. For a code point outside the set, FALSE is returned and norm2 is not
// consulted, so `decomposition` is not touched by this function.
UBool
FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
    if(!set.contains(c)) {
        return FALSE;
    }
    return norm2.getRawDecomposition(c, decomposition) ? TRUE : FALSE;
}

// Composes a and b through norm2 only if BOTH code points are in the filter
// set; otherwise reports "no composition" with U_SENTINEL.
UChar32
FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
    if(!set.contains(a) || !set.contains(b)) {
        return U_SENTINEL;
    }
    return norm2.composePair(a, b);
}

// Returns norm2's combining class for c when c is in the filter set,
// and 0 for any code point outside the set.
uint8_t
FilteredNormalizer2::getCombiningClass(UChar32 c) const {
    if(set.contains(c)) {
        return norm2.getCombiningClass(c);
    } else {
        return 0;
    }
}

// Tests whether s is normalized with respect to the filter: every in-filter
// span must be accepted by norm2.isNormalized(); out-of-filter spans are
// skipped without inspection.
//
// Returns FALSE if errorCode indicates a failure (on entry, from the buffer
// check, or from norm2), or as soon as one in-filter span is not normalized.
UBool
FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(s, errorCode);
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    // Spans alternate between in-filter (SIMPLE) and out-of-filter
    // (NOT_CONTAINED), starting with in-filter.
    USetSpanCondition condition=USET_SPAN_SIMPLE;
    int32_t start=0;
    while(start<s.length()) {
        const int32_t limit=set.span(s, start, condition);
        if(condition!=USET_SPAN_NOT_CONTAINED) {
            const UBool spanOk=
                norm2.isNormalized(s.tempSubStringBetween(start, limit), errorCode);
            if(!spanOk || U_FAILURE(errorCode)) {
                return FALSE;
            }
            condition=USET_SPAN_NOT_CONTAINED;
        } else {
            condition=USET_SPAN_SIMPLE;
        }
        start=limit;
    }
    return TRUE;
}

// Quick check of s restricted to the filter: norm2.quickCheck() is applied
// to each in-filter span, out-of-filter spans are skipped.
//
// Returns:
//   - UNORM_MAYBE if errorCode indicates a failure on entry or after the
//     buffer check;
//   - norm2's own result immediately if a span yields UNORM_NO or norm2
//     reports a failure;
//   - UNORM_MAYBE if at least one span yielded UNORM_MAYBE;
//   - UNORM_YES otherwise.
UNormalizationCheckResult
FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(s, errorCode);
    if(U_FAILURE(errorCode)) {
        return UNORM_MAYBE;
    }
    UNormalizationCheckResult overall=UNORM_YES;
    USetSpanCondition condition=USET_SPAN_SIMPLE;
    int32_t start=0;
    while(start<s.length()) {
        const int32_t limit=set.span(s, start, condition);
        if(condition!=USET_SPAN_NOT_CONTAINED) {
            const UNormalizationCheckResult spanResult=
                norm2.quickCheck(s.tempSubStringBetween(start, limit), errorCode);
            if(U_FAILURE(errorCode) || spanResult==UNORM_NO) {
                return spanResult;
            }
            if(spanResult==UNORM_MAYBE) {
                // Remember the weaker answer but keep scanning for a definite NO.
                overall=spanResult;
            }
            condition=USET_SPAN_NOT_CONTAINED;
        } else {
            condition=USET_SPAN_SIMPLE;
        }
        start=limit;
    }
    return overall;
}

// Returns the end index of the prefix of s for which the filtered quick
// check passes: norm2.spanQuickCheckYes() is applied to each in-filter
// span, while out-of-filter spans are accepted as-is.
//
// Returns 0 if errorCode indicates a failure on entry or after the buffer
// check. If norm2 stops short inside an in-filter span (or reports a
// failure), the index reached so far, relative to the start of s, is
// returned. Returns s.length() when every span passes.
int32_t
FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(s, errorCode);
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    USetSpanCondition condition=USET_SPAN_SIMPLE;
    int32_t start=0;
    while(start<s.length()) {
        const int32_t limit=set.span(s, start, condition);
        if(condition!=USET_SPAN_NOT_CONTAINED) {
            // norm2 reports a length relative to the substring; rebase it
            // onto s before comparing it with the span limit.
            const int32_t yesLength=
                norm2.spanQuickCheckYes(s.tempSubStringBetween(start, limit), errorCode);
            const int32_t yesLimit=start+yesLength;
            if(U_FAILURE(errorCode) || yesLimit<limit) {
                return yesLimit;
            }
            condition=USET_SPAN_NOT_CONTAINED;
        } else {
            condition=USET_SPAN_SIMPLE;
        }
        start=limit;
    }
    return s.length();
}

// A code point outside the filter set always has a boundary before it;
// for a code point inside the set, the answer comes from norm2.
UBool
FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
    if(set.contains(c)) {
        return norm2.hasBoundaryBefore(c) ? TRUE : FALSE;
    }
    return TRUE;
}

// A code point outside the filter set always has a boundary after it;
// for a code point inside the set, the answer comes from norm2.
UBool
FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
    if(set.contains(c)) {
        return norm2.hasBoundaryAfter(c) ? TRUE : FALSE;
    }
    return TRUE;
}

// A code point outside the filter set is always reported as inert;
// for a code point inside the set, the answer comes from norm2.
UBool
FilteredNormalizer2::isInert(UChar32 c) const {
    if(set.contains(c)) {
        return norm2.isInert(c) ? TRUE : FALSE;
    }
    return TRUE;
}

U_NAMESPACE_END

// C API ------------------------------------------------------------------- ***

U_NAMESPACE_USE

// C wrapper: creates a FilteredNormalizer2 over the given normalizer and
// filter set and returns it as an opaque UNormalizer2 pointer.
//
// Parameters:
//   norm2       Underlying normalizer; must not be NULL.
//   filterSet   Set of code points to which normalization applies;
//               must not be NULL.
//   pErrorCode  In/out error code; must point to a valid UErrorCode because
//               it is dereferenced unconditionally. If it already indicates
//               a failure, the function returns NULL without doing anything.
//
// Returns the new object, or NULL with *pErrorCode set to
//   U_ILLEGAL_ARGUMENT_ERROR   if norm2 or filterSet is NULL,
//   U_MEMORY_ALLOCATION_ERROR  if the allocation yields NULL.
//
// NOTE(review): the object is created with `new` here; how and by whom it is
// released is not visible in this file.
U_CAPI UNormalizer2 * U_EXPORT2
unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return NULL;
    }
    // Both pointers are dereferenced below to bind constructor arguments;
    // a NULL in either one would be undefined behavior, so reject it up front.
    if(norm2==NULL || filterSet==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
                                             *UnicodeSet::fromUSet(filterSet));
    if(fn2==NULL) {
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    }
    return (UNormalizer2 *)fn2;
}

#endif  // !UCONFIG_NO_NORMALIZATION
Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
729e4ab9 A	3	/*
	4	*******************************************************************************
	5	*
51004dcb	6	* Copyright (C) 2009-2012, International Business Machines
729e4ab9 A	7	* Corporation and others. All Rights Reserved.
	8	*
	9	*******************************************************************************
	10	* file name: filterednormalizer2.cpp
f3c0d7a5	11	* encoding: UTF-8
729e4ab9 A	12	* tab size: 8 (not used)
	13	* indentation:4
	14	*
	15	* created on: 2009dec10
	16	* created by: Markus W. Scherer
	17	*/
	18
	19	#include "unicode/utypes.h"
	20
	21	#if !UCONFIG_NO_NORMALIZATION
	22
	23	#include "unicode/normalizer2.h"
	24	#include "unicode/uniset.h"
	25	#include "unicode/unistr.h"
	26	#include "unicode/unorm.h"
	27	#include "cpputils.h"
	28
	29	U_NAMESPACE_BEGIN
	30
4388f060 A	31	FilteredNormalizer2::~FilteredNormalizer2() {}
4388f060 A	32
729e4ab9 A	33	UnicodeString &
	34	FilteredNormalizer2::normalize(const UnicodeString &src,
	35	UnicodeString &dest,
	36	UErrorCode &errorCode) const {
	37	uprv_checkCanGetBuffer(src, errorCode);
	38	if(U_FAILURE(errorCode)) {
	39	dest.setToBogus();
	40	return dest;
	41	}
	42	if(&dest==&src) {
	43	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
	44	return dest;
	45	}
	46	dest.remove();
	47	return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
	48	}
	49
	50	// Internal: No argument checking, and appends to dest.
	51	// Pass as input spanCondition the one that is likely to yield a non-zero
	52	// span length at the start of src.
	53	// For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
	54	// USET_SPAN_SIMPLE should be passed in for the start of src
	55	// and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
	56	// an in-filter prefix.
	57	UnicodeString &
	58	FilteredNormalizer2::normalize(const UnicodeString &src,
	59	UnicodeString &dest,
	60	USetSpanCondition spanCondition,
	61	UErrorCode &errorCode) const {
	62	UnicodeString tempDest; // Don't throw away destination buffer between iterations.
	63	for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
	64	int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
	65	int32_t spanLength=spanLimit-prevSpanLimit;
	66	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
	67	if(spanLength!=0) {
	68	dest.append(src, prevSpanLimit, spanLength);
	69	}
	70	spanCondition=USET_SPAN_SIMPLE;
	71	} else {
	72	if(spanLength!=0) {
	73	// Not norm2.normalizeSecondAndAppend() because we do not want
	74	// to modify the non-filter part of dest.
	75	dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
	76	tempDest, errorCode));
	77	if(U_FAILURE(errorCode)) {
	78	break;
	79	}
	80	}
	81	spanCondition=USET_SPAN_NOT_CONTAINED;
	82	}
	83	prevSpanLimit=spanLimit;
	84	}
	85	return dest;
	86	}
	87
	88	UnicodeString &
	89	FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
	90	const UnicodeString &second,
	91	UErrorCode &errorCode) const {
	92	return normalizeSecondAndAppend(first, second, TRUE, errorCode);
	93	}
	94
	95	UnicodeString &
	96	FilteredNormalizer2::append(UnicodeString &first,
97	const UnicodeString &second,
98	UErrorCode &errorCode) const {
99	return normalizeSecondAndAppend(first, second, FALSE, errorCode);
100	}
101
102	UnicodeString &
103	FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
104	const UnicodeString &second,
105	UBool doNormalize,
106	UErrorCode &errorCode) const {
107	uprv_checkCanGetBuffer(first, errorCode);
108	uprv_checkCanGetBuffer(second, errorCode);
109	if(U_FAILURE(errorCode)) {
110	return first;
111	}
112	if(&first==&second) {
113	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
114	return first;
115	}
116	if(first.isEmpty()) {
117	if(doNormalize) {
118	return normalize(second, first, errorCode);
119	} else {
120	return first=second;
121	}
122	}
123	// merge the in-filter suffix of the first string with the in-filter prefix of the second
124	int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
125	if(prefixLimit!=0) {
126	UnicodeString prefix(second.tempSubString(0, prefixLimit));
127	int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
128	if(suffixStart==0) {
129	if(doNormalize) {
130	norm2.normalizeSecondAndAppend(first, prefix, errorCode);
131	} else {
132	norm2.append(first, prefix, errorCode);
133	}
134	} else {
135	UnicodeString middle(first, suffixStart, INT32_MAX);
136	if(doNormalize) {
137	norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
138	} else {
139	norm2.append(middle, prefix, errorCode);
140	}
141	first.replace(suffixStart, INT32_MAX, middle);
142	}
143	}
144	if(prefixLimit<second.length()) {
145	UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
146	if(doNormalize) {
147	normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
148	} else {
149	first.append(rest);
150	}
151	}
152	return first;
153	}
154
155	UBool
156	FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
157	return set.contains(c) && norm2.getDecomposition(c, decomposition);
158	}
159
4388f060 A	160	UBool
	161	FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
	162	return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
	163	}
	164
	165	UChar32
	166	FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
	167	return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL;
	168	}
	169
	170	uint8_t
	171	FilteredNormalizer2::getCombiningClass(UChar32 c) const {
	172	return set.contains(c) ? norm2.getCombiningClass(c) : 0;
	173	}
	174
729e4ab9 A	175	UBool
	176	FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
	177	uprv_checkCanGetBuffer(s, errorCode);
	178	if(U_FAILURE(errorCode)) {
	179	return FALSE;
	180	}
	181	USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
	182	for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
	183	int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
	184	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
	185	spanCondition=USET_SPAN_SIMPLE;
	186	} else {
	187	if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
	188	U_FAILURE(errorCode)
	189	) {
	190	return FALSE;
	191	}
	192	spanCondition=USET_SPAN_NOT_CONTAINED;
	193	}
	194	prevSpanLimit=spanLimit;
	195	}
	196	return TRUE;
	197	}
	198
	199	UNormalizationCheckResult
	200	FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
	201	uprv_checkCanGetBuffer(s, errorCode);
	202	if(U_FAILURE(errorCode)) {
	203	return UNORM_MAYBE;
	204	}
	205	UNormalizationCheckResult result=UNORM_YES;
	206	USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
	207	for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
	208	int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
	209	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
	210	spanCondition=USET_SPAN_SIMPLE;
	211	} else {
	212	UNormalizationCheckResult qcResult=
	213	norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
	214	if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
	215	return qcResult;
	216	} else if(qcResult==UNORM_MAYBE) {
	217	result=qcResult;
	218	}
	219	spanCondition=USET_SPAN_NOT_CONTAINED;
	220	}
	221	prevSpanLimit=spanLimit;
	222	}
	223	return result;
	224	}
	225
	226	int32_t
	227	FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
	228	uprv_checkCanGetBuffer(s, errorCode);
	229	if(U_FAILURE(errorCode)) {
	230	return 0;
	231	}
	232	USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
	233	for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
	234	int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
	235	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
	236	spanCondition=USET_SPAN_SIMPLE;
	237	} else {
	238	int32_t yesLimit=
239	prevSpanLimit+
240	norm2.spanQuickCheckYes(
241	s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
242	if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
243	return yesLimit;
244	}
245	spanCondition=USET_SPAN_NOT_CONTAINED;
246	}
247	prevSpanLimit=spanLimit;
248	}
249	return s.length();
250	}
251
252	UBool
253	FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
	254	return !set.contains(c) || norm2.hasBoundaryBefore(c);
255	}
256
257	UBool
258	FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
	259	return !set.contains(c) || norm2.hasBoundaryAfter(c);
260	}
261
262	UBool
263	FilteredNormalizer2::isInert(UChar32 c) const {
	264	return !set.contains(c) || norm2.isInert(c);
265	}
266
267	U_NAMESPACE_END
268
269	// C API ------------------------------------------------------------------- ***
270
271	U_NAMESPACE_USE
272
51004dcb	273	U_CAPI UNormalizer2 * U_EXPORT2
729e4ab9 A	274	unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
	275	if(U_FAILURE(*pErrorCode)) {
	276	return NULL;
	277	}
	278	if(filterSet==NULL) {
	279	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
	280	return NULL;
	281	}
	282	Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
	283	*UnicodeSet::fromUSet(filterSet));
	284	if(fn2==NULL) {
	285	*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
	286	}
	287	return (UNormalizer2 *)fn2;
	288	}
	289
	290	#endif // !UCONFIG_NO_NORMALIZATION