+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
-* Copyright (C) 2001-2010 IBM and others. All rights reserved.
+* Copyright (C) 2001-2011,2014 IBM and others. All rights reserved.
**********************************************************************
* Date Name Description
* 06/28/2001 synwee Creation.
* \file
* \brief C API: StringSearch
*
- * C Apis for an engine that provides language-sensitive text searching based
+ * C APIs for an engine that provides language-sensitive text searching based
* on the comparison rules defined in a <tt>UCollator</tt> data struct,
* see <tt>ucol.h</tt>. This ensures that language eccentricity can be
* handled, e.g. for the German collator, characters ß and SS will be matched
* See the <a href="http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm">
* "ICU Collation Design Document"</a> for more information.
* <p>
- * The algorithm implemented is a modified form of the Boyer Moore's search.
- * For more information see
+ * The implementation may use a linear search or a modified form of the Boyer-Moore
+ * search; for more information on the latter see
* <a href="http://icu-project.org/docs/papers/efficient_text_searching_in_java.html">
* "Efficient Text Searching in Java"</a>, published in <i>Java Report</i>
- * in February, 1999, for further information on the algorithm.
+ * in February, 1999.
* <p>
* There are 2 match options for selection:<br>
* Let S' be the sub-string of a text string S between the offsets start and
* <p>
* This search has APIs similar to that of other text iteration mechanisms
* such as the break iterators in <tt>ubrk.h</tt>. Using these
- * APIs, it is easy to scan through text looking for all occurances of
+ * APIs, it is easy to scan through text looking for all occurrences of
* a given pattern. This search iterator allows changing of direction by
* calling a <tt>reset</tt> followed by a <tt>next</tt> or <tt>previous</tt>.
* Though a direction change can occur without calling <tt>reset</tt> first,
* for the pattern "abab" in the text "ababab", where else mutually
* exclusive matches only produce the result of 0.
* <p>
+ * Options are also provided to implement "asymmetric search" as described in
+ * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search">
+ * UTS #10 Unicode Collation Algorithm</a>, specifically the USearchAttribute
+ * USEARCH_ELEMENT_COMPARISON and its values.
+ * <p>
* Though collator attributes will be taken into consideration while
* performing matches, there are no APIs here for setting and getting the
* attributes. These attributes can be set by getting the collator
* pos = usearch_next(search, &status))
* {
* printf("Found match at %d pos, length is %d\n", pos,
- * usearch_getMatchLength(search));
+ * usearch_getMatchedLength(search));
* }
* }
*
* @stable ICU 2.4
*/
typedef enum {
- /** Option for overlapping matches */
- USEARCH_OVERLAP,
+ /**
+ * Option for overlapping matches
+ * @stable ICU 2.4
+ */
+ USEARCH_OVERLAP = 0,
+#ifndef U_HIDE_DEPRECATED_API
/**
- * Option for canonical matches. option 1 in header documentation.
- * The default value will be USEARCH_OFF
+ * Option for canonical matches; option 1 in header documentation.
+ * The default value will be USEARCH_OFF.
+ * Note: Setting this option to USEARCH_ON currently has no effect on
+ * search behavior, and this option is deprecated. Instead, to control
+ * canonical match behavior, you must set UCOL_NORMALIZATION_MODE
+ * appropriately (to UCOL_OFF or UCOL_ON) in the UCollator used by
+ * the UStringSearch object.
+ * @see usearch_openFromCollator
+ * @see usearch_getCollator
+ * @see usearch_setCollator
+ * @see ucol_getAttribute
+ * @deprecated ICU 53
*/
- USEARCH_CANONICAL_MATCH,
+ USEARCH_CANONICAL_MATCH = 1,
+#endif /* U_HIDE_DEPRECATED_API */
/**
* Option to control how collation elements are compared.
* The default value will be USEARCH_STANDARD_ELEMENT_COMPARISON.
* @stable ICU 4.4
*/
- USEARCH_ELEMENT_COMPARISON,
+ USEARCH_ELEMENT_COMPARISON = 2,
- USEARCH_ATTRIBUTE_COUNT
+#ifndef U_HIDE_DEPRECATED_API
+ /**
+ * One more than the highest normal USearchAttribute value.
+ * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
+ */
+ USEARCH_ATTRIBUTE_COUNT = 3
+#endif /* U_HIDE_DEPRECATED_API */
} USearchAttribute;
/**
* @stable ICU 2.4
*/
typedef enum {
- /** Default value for any USearchAttribute */
+ /**
+ * Default value for any USearchAttribute
+ * @stable ICU 2.4
+ */
USEARCH_DEFAULT = -1,
- /** Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH */
+ /**
+ * Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH
+ * @stable ICU 2.4
+ */
USEARCH_OFF,
- /** Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH */
+ /**
+ * Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH
+ * @stable ICU 2.4
+ */
USEARCH_ON,
/**
* Value (default) for USEARCH_ELEMENT_COMPARISON;
* the pattern will match a plain e or an e with any diacritic in the
* searched text, but an e with diacritic in the pattern will only
* match an e with the same diacritic in the searched text.
+ *
+ * This supports "asymmetric search" as described in
+ * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search">
+ * UTS #10 Unicode Collation Algorithm</a>.
+ *
* @stable ICU 4.4
*/
USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD,
* in the pattern will match a plain e or an e with any diacritic in the
* searched text, but an e with diacritic in the pattern will only
* match an e with the same diacritic or a plain e in the searched text.
+ *
+ * This option is similar to "asymmetric search" as described in
+ * [UTS #10 Unicode Collation Algorithm](http://www.unicode.org/reports/tr10/#Asymmetric_Search),
+ * but also allows unmarked characters in the searched text to match
+ * marked or unmarked versions of that character in the pattern.
+ *
* @stable ICU 4.4
*/
USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD,
+#ifndef U_HIDE_DEPRECATED_API
+ /**
+ * One more than the highest normal USearchAttributeValue value.
+ * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
+ */
USEARCH_ATTRIBUTE_VALUE_COUNT
+#endif /* U_HIDE_DEPRECATED_API */
} USearchAttributeValue;
/* open and close ------------------------------------------------------ */
* possible. If the buffer fits the matched text exactly, a null-termination
* is not possible, then a U_STRING_NOT_TERMINATED_ERROR set in status.
* Pre-flighting can be either done with length = 0 or the API
-* <tt>usearch_getMatchLength</tt>.
+* <tt>usearch_getMatchedLength</tt>.
* @param strsrch search iterator data struct
* @param result UChar buffer to store the matched string
* @param resultCapacity length of the result buffer
UErrorCode *status);
/**
-* Returns the first index greater than <tt>position</tt> at which the string
-* text
+* Returns the first index equal to or greater than <tt>position</tt> at which
+* the string text
* matches the search pattern. The iterator is adjusted so that its current
* index (as returned by <tt>usearch_getOffset</tt>) is the match position if
* one was found.
* <p>
* Search positions that may render incorrect results are highlighted in the
* header comments. If position is less than or greater than the text range
-* for searching, an U_INDEX_OUTOFBOUNDS_ERROR will be returned
+* for searching, a U_INDEX_OUTOFBOUNDS_ERROR will be returned.
+* <p>
+* When the <tt>USEARCH_OVERLAP</tt> option is off, the last index of the
+* result match is always less than <tt>position</tt>.
+* When <tt>USEARCH_OVERLAP</tt> is on, the result match may span across
+* <tt>position</tt>.
* @param strsrch search iterator data struct
* @param position index position the search is to begin at
* @param status for errors if it occurs
*/
U_STABLE void U_EXPORT2 usearch_reset(UStringSearch *strsrch);
+#ifndef U_HIDE_INTERNAL_API
/**
* Simple forward search for the pattern, starting at a specified index,
- * and using using a default set search options.
+ * and using a default set of search options.
*
* This is an experimental function, and is not an official part of the
* ICU API.
* are part of a combining sequence, as described below.
*
* A match will not include a partial combining sequence. Combining
- * character sequences are considered to be inseperable units,
+ * character sequences are considered to be inseparable units,
* and either match the pattern completely, or are considered to not match
* at all. Thus, for example, an A followed a combining accent mark will
* not be found when searching for a plain (unaccented) A. (unless
* When beginning a search, the initial starting position, startIdx,
* is assumed to be an acceptable match boundary with respect to
* combining characters. A combining sequence that spans across the
- * starting point will not supress a match beginning at startIdx.
+ * starting point will not suppress a match beginning at startIdx.
*
* Characters that expand to multiple collation elements
* (German sharp-S becoming 'ss', or the composed forms of accented
* are part of a combining sequence, as described below.
*
* A match will not include a partial combining sequence. Combining
- * character sequences are considered to be inseperable units,
+ * character sequences are considered to be inseparable units,
* and either match the pattern completely, or are considered to not match
* at all. Thus, for example, an A followed a combining accent mark will
* not be found when searching for a plain (unaccented) A. (unless
* When beginning a search, the initial starting position, startIdx,
* is assumed to be an acceptable match boundary with respect to
* combining characters. A combining sequence that spans across the
- * starting point will not supress a match beginning at startIdx.
+ * starting point will not suppress a match beginning at startIdx.
*
* Characters that expand to multiple collation elements
* (German sharp-S becoming 'ss', or the composed forms of accented
int32_t *matchStart,
int32_t *matchLimit,
UErrorCode *status);
+#endif /* U_HIDE_INTERNAL_API */
#endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */