[apple/icu.git] / icuSources / common / filterednormalizer2.cpp

/*
*******************************************************************************
*
*   Copyright (C) 2009-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  filterednormalizer2.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2009dec10
*   created by: Markus W. Scherer
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_NORMALIZATION

#include "unicode/normalizer2.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/unorm.h"
#include "cpputils.h"

U_NAMESPACE_BEGIN

// Writes the filtered normalization of src into dest, replacing dest's previous contents.
// - If errorCode indicates a failure after uprv_checkCanGetBuffer(src, errorCode),
//   dest is set to bogus.
// - If src and dest are the same object, errorCode is set to U_ILLEGAL_ARGUMENT_ERROR
//   and dest is left untouched.
// Always returns dest.
UnicodeString &
FilteredNormalizer2::normalize(const UnicodeString &src,
                               UnicodeString &dest,
                               UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(src, errorCode);
    if(U_FAILURE(errorCode)) {
        dest.setToBogus();
    } else if(&src==&dest) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    } else {
        // The worker overload only appends, so start from an empty destination.
        dest.remove();
        normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
    }
    return dest;
}

// Internal worker: performs no argument checking and appends its output to dest.
// The caller chooses the initial spanCondition; it should be the one that is
// likely to produce a non-empty span at the start of src.
// Example: for set=[:age=3.2:] almost all common characters were already in
// Unicode 3.2, so pass USET_SPAN_SIMPLE when starting at the beginning of src,
// and pass USET_SPAN_NOT_CONTAINED when continuing after an in-filter prefix.
// The loop then alternates between the two conditions: in-filter spans are
// normalized with norm2, all other spans are copied verbatim.
UnicodeString &
FilteredNormalizer2::normalize(const UnicodeString &src,
                               UnicodeString &dest,
                               USetSpanCondition spanCondition,
                               UErrorCode &errorCode) const {
    // Declared outside the loop so that its buffer is reused by every iteration.
    UnicodeString scratch;
    int32_t start=0;
    while(start<src.length()) {
        const int32_t limit=set.span(src, start, spanCondition);
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            // In-filter span: normalize it separately and append the result.
            // norm2.normalizeSecondAndAppend() is deliberately not used here because
            // it could modify the non-filter text that is already at the end of dest.
            if(limit!=start) {
                dest.append(norm2.normalize(src.tempSubStringBetween(start, limit),
                                            scratch, errorCode));
                if(U_FAILURE(errorCode)) {
                    return dest;
                }
            }
            spanCondition=USET_SPAN_NOT_CONTAINED;
        } else {
            // Out-of-filter span: copy the text unchanged.
            if(limit!=start) {
                dest.append(src, start, limit-start);
            }
            spanCondition=USET_SPAN_SIMPLE;
        }
        start=limit;
    }
    return dest;
}

// Public entry point: appends the filtered normalization of second to first.
// Delegates to the shared implementation with doNormalize=TRUE and returns first.
UnicodeString &
FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
                                              const UnicodeString &second,
                                              UErrorCode &errorCode) const {
    // The shared implementation returns a reference to first on every path.
    normalizeSecondAndAppend(first, second, TRUE, errorCode);
    return first;
}

// Public entry point: appends second to first without normalizing second itself.
// Delegates to the shared implementation with doNormalize=FALSE and returns first.
UnicodeString &
FilteredNormalizer2::append(UnicodeString &first,
                            const UnicodeString &second,
                            UErrorCode &errorCode) const {
    // The shared implementation returns a reference to first on every path.
    normalizeSecondAndAppend(first, second, FALSE, errorCode);
    return first;
}

// Shared implementation behind normalizeSecondAndAppend() (doNormalize=TRUE)
// and append() (doNormalize=FALSE).
//
// Steps:
// 1. Argument checks: a failure from uprv_checkCanGetBuffer() on either string,
//    or first and second being the same object (U_ILLEGAL_ARGUMENT_ERROR),
//    returns first unchanged.
// 2. If first is empty, the result is normalize(second) or a plain copy of second.
// 3. Otherwise the in-filter suffix of first and the in-filter prefix of second
//    are merged via norm2, so that only in-filter text is passed to norm2.
// 4. The remainder of second (starting with its first out-of-filter span) is
//    normalized span by span, or appended verbatim when doNormalize is FALSE.
//
// If the merge in step 3 reports a failure, processing stops immediately:
// the merged text is not written back into first and the remainder of second
// is not appended.
//
// Always returns first.
UnicodeString &
FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
                                              const UnicodeString &second,
                                              UBool doNormalize,
                                              UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(first, errorCode);
    uprv_checkCanGetBuffer(second, errorCode);
    if(U_FAILURE(errorCode)) {
        return first;
    }
    if(&first==&second) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return first;
    }
    if(first.isEmpty()) {
        if(doNormalize) {
            return normalize(second, first, errorCode);
        } else {
            return first=second;
        }
    }
    // merge the in-filter suffix of the first string with the in-filter prefix of the second
    int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
    if(prefixLimit!=0) {
        UnicodeString prefix(second.tempSubString(0, prefixLimit));
        int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
        if(suffixStart==0) {
            // All of first is in-filter text: norm2 may work on first directly.
            if(doNormalize) {
                norm2.normalizeSecondAndAppend(first, prefix, errorCode);
            } else {
                norm2.append(first, prefix, errorCode);
            }
        } else {
            // Only the in-filter tail of first may be touched by norm2:
            // merge in a separate string, then splice the result back.
            UnicodeString middle(first, suffixStart, INT32_MAX);
            if(doNormalize) {
                norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
            } else {
                norm2.append(middle, prefix, errorCode);
            }
            if(!U_FAILURE(errorCode)) {
                first.replace(suffixStart, INT32_MAX, middle);
            }
        }
        if(U_FAILURE(errorCode)) {
            // Do not keep appending to first once the merge has failed;
            // the span-wise normalize() below does not check errorCode on entry.
            return first;
        }
    }
    if(prefixLimit<second.length()) {
        UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
        if(doNormalize) {
            // rest begins right after the in-filter prefix, hence NOT_CONTAINED first.
            normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
        } else {
            first.append(rest);
        }
    }
    return first;
}

// Reports the decomposition of c via norm2, but only for code points in the filter set.
// For a code point outside the set, returns FALSE without consulting norm2
// and without touching decomposition.
UBool
FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
    if(!set.contains(c)) {
        return FALSE;
    }
    return norm2.getDecomposition(c, decomposition);
}

// Returns TRUE if norm2 reports every in-filter span of s as normalized.
// Spans of characters outside the filter set are skipped without being checked.
// Returns FALSE if errorCode indicates a failure, either after the initial
// uprv_checkCanGetBuffer() call or after a norm2.isNormalized() call.
UBool
FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(s, errorCode);
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    // Alternate between in-filter spans (SIMPLE) and out-of-filter spans (NOT_CONTAINED),
    // starting with an in-filter span.
    USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    int32_t start=0;
    while(start<s.length()) {
        const int32_t limit=set.span(s, start, spanCondition);
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            const UBool spanIsNormalized=
                norm2.isNormalized(s.tempSubStringBetween(start, limit), errorCode);
            if(U_FAILURE(errorCode) || !spanIsNormalized) {
                return FALSE;
            }
            spanCondition=USET_SPAN_NOT_CONTAINED;
        } else {
            // Out-of-filter text is not subject to normalization: nothing to check.
            spanCondition=USET_SPAN_SIMPLE;
        }
        start=limit;
    }
    return TRUE;
}

// Combines norm2.quickCheck() over all in-filter spans of s:
// - returns immediately with the span's result if a span yields UNORM_NO
//   or errorCode indicates a failure,
// - returns UNORM_MAYBE if at least one span yielded UNORM_MAYBE,
// - returns UNORM_YES otherwise.
// Spans outside the filter set are skipped. Returns UNORM_MAYBE if errorCode
// indicates a failure after the initial uprv_checkCanGetBuffer() call.
UNormalizationCheckResult
FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(s, errorCode);
    if(U_FAILURE(errorCode)) {
        return UNORM_MAYBE;
    }
    UNormalizationCheckResult overall=UNORM_YES;
    // Alternate between in-filter spans (SIMPLE) and out-of-filter spans (NOT_CONTAINED).
    USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    int32_t start=0;
    while(start<s.length()) {
        const int32_t limit=set.span(s, start, spanCondition);
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            const UNormalizationCheckResult spanResult=
                norm2.quickCheck(s.tempSubStringBetween(start, limit), errorCode);
            if(U_FAILURE(errorCode) || spanResult==UNORM_NO) {
                return spanResult;
            }
            if(spanResult==UNORM_MAYBE) {
                // Remember the weaker answer but keep looking for a definite UNORM_NO.
                overall=UNORM_MAYBE;
            }
            spanCondition=USET_SPAN_NOT_CONTAINED;
        } else {
            spanCondition=USET_SPAN_SIMPLE;
        }
        start=limit;
    }
    return overall;
}

// Returns the end index in s of the prefix for which every in-filter span passed
// norm2.spanQuickCheckYes() completely; out-of-filter spans are skipped over.
// Stops at the first in-filter span that norm2 accepts only partially (or when
// errorCode indicates a failure) and returns the index reached inside that span.
// Returns 0 if errorCode indicates a failure after the initial
// uprv_checkCanGetBuffer() call, and s.length() if nothing stopped the scan.
int32_t
FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(s, errorCode);
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    // Alternate between in-filter spans (SIMPLE) and out-of-filter spans (NOT_CONTAINED).
    USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    int32_t start=0;
    while(start<s.length()) {
        const int32_t limit=set.span(s, start, spanCondition);
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            // norm2 sees only the substring, so its answer is relative to start.
            const int32_t yesLength=
                norm2.spanQuickCheckYes(s.tempSubStringBetween(start, limit), errorCode);
            const int32_t yesLimit=start+yesLength;
            if(U_FAILURE(errorCode) || yesLimit<limit) {
                return yesLimit;
            }
            spanCondition=USET_SPAN_NOT_CONTAINED;
        } else {
            spanCondition=USET_SPAN_SIMPLE;
        }
        start=limit;
    }
    return s.length();
}

// A code point outside the filter set always has a boundary before it;
// for a code point inside the set the question is forwarded to norm2.
UBool
FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
    if(!set.contains(c)) {
        return TRUE;
    }
    return norm2.hasBoundaryBefore(c);
}

// A code point outside the filter set always has a boundary after it;
// for a code point inside the set the question is forwarded to norm2.
UBool
FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
    if(!set.contains(c)) {
        return TRUE;
    }
    return norm2.hasBoundaryAfter(c);
}

// A code point outside the filter set is always reported as inert;
// for a code point inside the set the question is forwarded to norm2.
UBool
FilteredNormalizer2::isInert(UChar32 c) const {
    if(!set.contains(c)) {
        return TRUE;
    }
    return norm2.isInert(c);
}

U_NAMESPACE_END

// C API ------------------------------------------------------------------- ***

U_NAMESPACE_USE

// C API: allocates a FilteredNormalizer2 constructed from *norm2 and from the
// UnicodeSet obtained via UnicodeSet::fromUSet(filterSet), and returns it as
// an opaque UNormalizer2 pointer.
//
// Returns NULL when
// - *pErrorCode already indicates a failure on entry (nothing is done),
// - norm2 or filterSet is NULL (*pErrorCode is set to U_ILLEGAL_ARGUMENT_ERROR),
// - the allocation fails (*pErrorCode is set to U_MEMORY_ALLOCATION_ERROR).
//
// NOTE(review): the constructor receives norm2 and the set by reference, so
// presumably both must outlive the returned object -- confirm against the
// FilteredNormalizer2 declaration.
U_DRAFT UNormalizer2 * U_EXPORT2
unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return NULL;
    }
    // Both pointers are dereferenced below; reject NULL rather than
    // binding a reference to a null object.
    if(norm2==NULL || filterSet==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
                                             *UnicodeSet::fromUSet(filterSet));
    if(fn2==NULL) {
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    }
    return (UNormalizer2 *)fn2;
}

#endif  // !UCONFIG_NO_NORMALIZATION
Commit	Line	Data
729e4ab9 A	1	/*
	2	*******************************************************************************
	3	*
	4	* Copyright (C) 2009-2010, International Business Machines
	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	* file name: filterednormalizer2.cpp
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 2009dec10
	14	* created by: Markus W. Scherer
	15	*/
	16
	17	#include "unicode/utypes.h"
	18
	19	#if !UCONFIG_NO_NORMALIZATION
	20
	21	#include "unicode/normalizer2.h"
	22	#include "unicode/uniset.h"
	23	#include "unicode/unistr.h"
	24	#include "unicode/unorm.h"
	25	#include "cpputils.h"
	26
	27	U_NAMESPACE_BEGIN
	28
	29	UnicodeString &
	30	FilteredNormalizer2::normalize(const UnicodeString &src,
	31	UnicodeString &dest,
	32	UErrorCode &errorCode) const {
	33	uprv_checkCanGetBuffer(src, errorCode);
	34	if(U_FAILURE(errorCode)) {
	35	dest.setToBogus();
	36	return dest;
	37	}
	38	if(&dest==&src) {
	39	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
	40	return dest;
	41	}
	42	dest.remove();
	43	return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
	44	}
	45
	46	// Internal: No argument checking, and appends to dest.
	47	// Pass as input spanCondition the one that is likely to yield a non-zero
	48	// span length at the start of src.
	49	// For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
	50	// USET_SPAN_SIMPLE should be passed in for the start of src
	51	// and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
	52	// an in-filter prefix.
	53	UnicodeString &
	54	FilteredNormalizer2::normalize(const UnicodeString &src,
	55	UnicodeString &dest,
	56	USetSpanCondition spanCondition,
	57	UErrorCode &errorCode) const {
	58	UnicodeString tempDest; // Don't throw away destination buffer between iterations.
	59	for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
	60	int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
	61	int32_t spanLength=spanLimit-prevSpanLimit;
	62	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
	63	if(spanLength!=0) {
	64	dest.append(src, prevSpanLimit, spanLength);
65	}
66	spanCondition=USET_SPAN_SIMPLE;
67	} else {
68	if(spanLength!=0) {
69	// Not norm2.normalizeSecondAndAppend() because we do not want
70	// to modify the non-filter part of dest.
71	dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
72	tempDest, errorCode));
73	if(U_FAILURE(errorCode)) {
74	break;
75	}
76	}
77	spanCondition=USET_SPAN_NOT_CONTAINED;
78	}
79	prevSpanLimit=spanLimit;
80	}
81	return dest;
82	}
83
84	UnicodeString &
85	FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
86	const UnicodeString &second,
87	UErrorCode &errorCode) const {
88	return normalizeSecondAndAppend(first, second, TRUE, errorCode);
89	}
90
91	UnicodeString &
92	FilteredNormalizer2::append(UnicodeString &first,
93	const UnicodeString &second,
94	UErrorCode &errorCode) const {
95	return normalizeSecondAndAppend(first, second, FALSE, errorCode);
96	}
97
98	UnicodeString &
99	FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
100	const UnicodeString &second,
101	UBool doNormalize,
102	UErrorCode &errorCode) const {
103	uprv_checkCanGetBuffer(first, errorCode);
104	uprv_checkCanGetBuffer(second, errorCode);
105	if(U_FAILURE(errorCode)) {
106	return first;
107	}
108	if(&first==&second) {
109	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
110	return first;
111	}
112	if(first.isEmpty()) {
113	if(doNormalize) {
114	return normalize(second, first, errorCode);
115	} else {
116	return first=second;
117	}
118	}
119	// merge the in-filter suffix of the first string with the in-filter prefix of the second
120	int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
121	if(prefixLimit!=0) {
122	UnicodeString prefix(second.tempSubString(0, prefixLimit));
123	int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
124	if(suffixStart==0) {
125	if(doNormalize) {
126	norm2.normalizeSecondAndAppend(first, prefix, errorCode);
127	} else {
128	norm2.append(first, prefix, errorCode);
129	}
130	} else {
131	UnicodeString middle(first, suffixStart, INT32_MAX);
132	if(doNormalize) {
133	norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
134	} else {
135	norm2.append(middle, prefix, errorCode);
136	}
137	first.replace(suffixStart, INT32_MAX, middle);
138	}
139	}
140	if(prefixLimit<second.length()) {
141	UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
142	if(doNormalize) {
143	normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
144	} else {
145	first.append(rest);
146	}
147	}
148	return first;
149	}
150
151	UBool
152	FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
153	return set.contains(c) && norm2.getDecomposition(c, decomposition);
154	}
155
156	UBool
157	FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
158	uprv_checkCanGetBuffer(s, errorCode);
159	if(U_FAILURE(errorCode)) {
160	return FALSE;
161	}
162	USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
163	for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
164	int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
165	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
166	spanCondition=USET_SPAN_SIMPLE;
167	} else {
168	if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) \|\|
169	U_FAILURE(errorCode)
170	) {
171	return FALSE;
172	}
173	spanCondition=USET_SPAN_NOT_CONTAINED;
174	}
175	prevSpanLimit=spanLimit;
176	}
177	return TRUE;
178	}
179
180	UNormalizationCheckResult
181	FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
182	uprv_checkCanGetBuffer(s, errorCode);
183	if(U_FAILURE(errorCode)) {
184	return UNORM_MAYBE;
185	}
186	UNormalizationCheckResult result=UNORM_YES;
187	USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
188	for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
189	int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
190	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
191	spanCondition=USET_SPAN_SIMPLE;
192	} else {
193	UNormalizationCheckResult qcResult=
194	norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
195	if(U_FAILURE(errorCode) \|\| qcResult==UNORM_NO) {
196	return qcResult;
197	} else if(qcResult==UNORM_MAYBE) {
198	result=qcResult;
199	}
200	spanCondition=USET_SPAN_NOT_CONTAINED;
201	}
202	prevSpanLimit=spanLimit;
203	}
204	return result;
205	}
206
207	int32_t
208	FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
209	uprv_checkCanGetBuffer(s, errorCode);
210	if(U_FAILURE(errorCode)) {
211	return 0;
212	}
213	USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
214	for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
215	int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
216	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
217	spanCondition=USET_SPAN_SIMPLE;
218	} else {
219	int32_t yesLimit=
220	prevSpanLimit+
221	norm2.spanQuickCheckYes(
222	s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
223	if(U_FAILURE(errorCode) \|\| yesLimit<spanLimit) {
224	return yesLimit;
225	}
226	spanCondition=USET_SPAN_NOT_CONTAINED;
227	}
228	prevSpanLimit=spanLimit;
229	}
230	return s.length();
231	}
232
233	UBool
234	FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
235	return !set.contains(c) \|\| norm2.hasBoundaryBefore(c);
236	}
237
238	UBool
239	FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
240	return !set.contains(c) \|\| norm2.hasBoundaryAfter(c);
241	}
242
243	UBool
244	FilteredNormalizer2::isInert(UChar32 c) const {
245	return !set.contains(c) \|\| norm2.isInert(c);
246	}
247
248	U_NAMESPACE_END
249
250	// C API ------------------------------------------------------------------- ***
251
252	U_NAMESPACE_USE
253
254	U_DRAFT UNormalizer2 * U_EXPORT2
255	unorm2_openFiltered(const UNormalizer2 norm2, const USet filterSet, UErrorCode *pErrorCode) {
256	if(U_FAILURE(*pErrorCode)) {
257	return NULL;
258	}
259	if(filterSet==NULL) {
260	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
261	return NULL;
262	}
263	Normalizer2 fn2=new FilteredNormalizer2((Normalizer2 *)norm2,
264	*UnicodeSet::fromUSet(filterSet));
265	if(fn2==NULL) {
266	*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
267	}
268	return (UNormalizer2 *)fn2;
269	}
270
271	#endif // !UCONFIG_NO_NORMALIZATION