X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/729e4ab9bc6618bc3d8a898e575df7f4019e29ca..HEAD:/icuSources/i18n/unicode/usearch.h diff --git a/icuSources/i18n/unicode/usearch.h b/icuSources/i18n/unicode/usearch.h index 2f3d5639..080528e3 100644 --- a/icuSources/i18n/unicode/usearch.h +++ b/icuSources/i18n/unicode/usearch.h @@ -1,6 +1,8 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** -* Copyright (C) 2001-2010 IBM and others. All rights reserved. +* Copyright (C) 2001-2011,2014 IBM and others. All rights reserved. ********************************************************************** * Date Name Description * 06/28/2001 synwee Creation. @@ -22,7 +24,7 @@ * \file * \brief C API: StringSearch * - * C Apis for an engine that provides language-sensitive text searching based + * C APIs for an engine that provides language-sensitive text searching based * on the comparison rules defined in a UCollator data struct, * see ucol.h. This ensures that language eccentricity can be * handled, e.g. for the German collator, characters ß and SS will be matched @@ -30,11 +32,11 @@ * See the * "ICU Collation Design Document" for more information. *
- * The algorithm implemented is a modified form of the Boyer Moore's search. - * For more information see + * The implementation may use a linear search or a modified form of the Boyer-Moore + * search; for more information on the latter see * * "Efficient Text Searching in Java", published in Java Report - * in February, 1999, for further information on the algorithm. + * in February, 1999. *
* There are 2 match options for selection:
* Let S' be the sub-string of a text string S between the offsets start and
@@ -53,7 +55,7 @@
*
* This search has APIs similar to that of other text iteration mechanisms * such as the break iterators in ubrk.h. Using these - * APIs, it is easy to scan through text looking for all occurances of + * APIs, it is easy to scan through text looking for all occurrences of * a given pattern. This search iterator allows changing of direction by * calling a reset followed by a next or previous. * Though a direction change can occur without calling reset first, @@ -91,6 +93,11 @@ * for the pattern "abab" in the text "ababab", where else mutually * exclusive matches only produce the result of 0. *
+ * Options are also provided to implement "asymmetric search" as described in + * + * UTS #10 Unicode Collation Algorithm, specifically the USearchAttribute + * USEARCH_ELEMENT_COMPARISON and its values. + *
* Though collator attributes will be taken into consideration while * performing matches, there are no APIs here for setting and getting the * attributes. These attributes can be set by getting the collator @@ -123,7 +130,7 @@ * pos = usearch_next(search, &status)) * { * printf("Found match at %d pos, length is %d\n", pos, - * usearch_getMatchLength(search)); + * usearch_getMatchedLength(search)); * } * } * @@ -154,32 +161,62 @@ typedef struct UStringSearch UStringSearch; * @stable ICU 2.4 */ typedef enum { - /** Option for overlapping matches */ - USEARCH_OVERLAP, + /** + * Option for overlapping matches + * @stable ICU 2.4 + */ + USEARCH_OVERLAP = 0, +#ifndef U_HIDE_DEPRECATED_API /** - * Option for canonical matches. option 1 in header documentation. - * The default value will be USEARCH_OFF + * Option for canonical matches; option 1 in header documentation. + * The default value will be USEARCH_OFF. + * Note: Setting this option to USEARCH_ON currently has no effect on + * search behavior, and this option is deprecated. Instead, to control + * canonical match behavior, you must set UCOL_NORMALIZATION_MODE + * appropriately (to UCOL_OFF or UCOL_ON) in the UCollator used by + * the UStringSearch object. + * @see usearch_openFromCollator + * @see usearch_getCollator + * @see usearch_setCollator + * @see ucol_getAttribute + * @deprecated ICU 53 */ - USEARCH_CANONICAL_MATCH, + USEARCH_CANONICAL_MATCH = 1, +#endif /* U_HIDE_DEPRECATED_API */ /** * Option to control how collation elements are compared. * The default value will be USEARCH_STANDARD_ELEMENT_COMPARISON. * @stable ICU 4.4 */ - USEARCH_ELEMENT_COMPARISON, + USEARCH_ELEMENT_COMPARISON = 2, - USEARCH_ATTRIBUTE_COUNT +#ifndef U_HIDE_DEPRECATED_API + /** + * One more than the highest normal USearchAttribute value. + * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. 
+ */ + USEARCH_ATTRIBUTE_COUNT = 3 +#endif /* U_HIDE_DEPRECATED_API */ } USearchAttribute; /** * @stable ICU 2.4 */ typedef enum { - /** Default value for any USearchAttribute */ + /** + * Default value for any USearchAttribute + * @stable ICU 2.4 + */ USEARCH_DEFAULT = -1, - /** Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH */ + /** + * Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH + * @stable ICU 2.4 + */ USEARCH_OFF, - /** Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH */ + /** + * Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH + * @stable ICU 2.4 + */ USEARCH_ON, /** * Value (default) for USEARCH_ELEMENT_COMPARISON; @@ -199,6 +236,11 @@ typedef enum { * the pattern will match a plain e or an e with any diacritic in the * searched text, but an e with diacritic in the pattern will only * match an e with the same diacritic in the searched text. + * + * This supports "asymmetric search" as described in + * + * UTS #10 Unicode Collation Algorithm. + * * @stable ICU 4.4 */ USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD, @@ -213,11 +255,23 @@ typedef enum { * in the pattern will match a plain e or an e with any diacritic in the * searched text, but an e with diacritic in the pattern will only * match an e with the same diacritic or a plain e in the searched text. + * + * This option is similar to "asymmetric search" as described in + * [UTS #10 Unicode Collation Algorithm](http://www.unicode.org/reports/tr10/#Asymmetric_Search), + * but also allows unmarked characters in the searched text to match + * marked or unmarked versions of that character in the pattern. + * * @stable ICU 4.4 */ USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD, +#ifndef U_HIDE_DEPRECATED_API + /** + * One more than the highest normal USearchAttributeValue value. + * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. 
+ */ USEARCH_ATTRIBUTE_VALUE_COUNT +#endif /* U_HIDE_DEPRECATED_API */ } USearchAttributeValue; /* open and close ------------------------------------------------------ */ @@ -425,7 +479,7 @@ U_STABLE int32_t U_EXPORT2 usearch_getMatchedLength( * possible. If the buffer fits the matched text exactly, a null-termination * is not possible, then a U_STRING_NOT_TERMINATED_ERROR set in status. * Pre-flighting can be either done with length = 0 or the API -* usearch_getMatchLength. +* usearch_getMatchedLength. * @param strsrch search iterator data struct * @param result UChar buffer to store the matched string * @param resultCapacity length of the result buffer @@ -586,8 +640,8 @@ U_STABLE int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch, UErrorCode *status); /** -* Returns the first index greater than position at which the string -* text +* Returns the first index equal or greater than position at which +* the string text * matches the search pattern. The iterator is adjusted so that its current * index (as returned by usearch_getOffset) is the match position if * one was found. @@ -638,7 +692,12 @@ U_STABLE int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch, *
* Search positions that may render incorrect results are highlighted in the * header comments. If position is less than or greater than the text range -* for searching, an U_INDEX_OUTOFBOUNDS_ERROR will be returned +* for searching, an U_INDEX_OUTOFBOUNDS_ERROR will be returned. +*
+* When USEARCH_OVERLAP option is off, the last index of the +* result match is always less than position. +* When USEARCH_OVERLAP is on, the result match may span across +* position. * @param strsrch search iterator data struct * @param position index position the search is to begin at * @param status for errors if it occurs @@ -704,9 +763,10 @@ U_STABLE int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch, */ U_STABLE void U_EXPORT2 usearch_reset(UStringSearch *strsrch); +#ifndef U_HIDE_INTERNAL_API /** * Simple forward search for the pattern, starting at a specified index, - * and using using a default set search options. + * and using a default set of search options. * * This is an experimental function, and is not an official part of the * ICU API. @@ -723,7 +783,7 @@ U_STABLE void U_EXPORT2 usearch_reset(UStringSearch *strsrch); * are part of a combining sequence, as described below. * * A match will not include a partial combining sequence. Combining - * character sequences are considered to be inseperable units, + * character sequences are considered to be inseparable units, * and either match the pattern completely, or are considered to not match * at all. Thus, for example, an A followed a combining accent mark will * not be found when searching for a plain (unaccented) A. (unless @@ -732,7 +792,7 @@ U_STABLE void U_EXPORT2 usearch_reset(UStringSearch *strsrch); * When beginning a search, the initial starting position, startIdx, * is assumed to be an acceptable match boundary with respect to * combining characters. A combining sequence that spans across the - * starting point will not supress a match beginning at startIdx. + * starting point will not suppress a match beginning at startIdx. * * Characters that expand to multiple collation elements * (German sharp-S becoming 'ss', or the composed forms of accented @@ -783,7 +843,7 @@ U_INTERNAL UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, * are part of a combining sequence, as described below. 
* * A match will not include a partial combining sequence. Combining - * character sequences are considered to be inseperable units, + * character sequences are considered to be inseparable units, * and either match the pattern completely, or are considered to not match * at all. Thus, for example, an A followed a combining accent mark will * not be found when searching for a plain (unaccented) A. (unless @@ -792,7 +852,7 @@ U_INTERNAL UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, * When beginning a search, the initial starting position, startIdx, * is assumed to be an acceptable match boundary with respect to * combining characters. A combining sequence that spans across the - * starting point will not supress a match beginning at startIdx. + * starting point will not suppress a match beginning at startIdx. * * Characters that expand to multiple collation elements * (German sharp-S becoming 'ss', or the composed forms of accented @@ -823,6 +883,7 @@ U_INTERNAL UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, int32_t *matchStart, int32_t *matchLimit, UErrorCode *status); +#endif /* U_HIDE_INTERNAL_API */ #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */