1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2001-2014 IBM and others. All rights reserved.
6 **********************************************************************
7 * Date Name Description
8 * 03/22/2000 helena Creation.
9 **********************************************************************
15 #include "unicode/utypes.h"
19 * \brief C++ API: Service for searching text based on RuleBasedCollator.
22 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
24 #include "unicode/tblcoll.h"
25 #include "unicode/coleitr.h"
26 #include "unicode/search.h"
28 #if U_SHOW_CPLUSPLUS_API
33 * <tt>StringSearch</tt> is a <tt>SearchIterator</tt> that provides
34 * language-sensitive text searching based on the comparison rules defined
35 * in a {@link RuleBasedCollator} object.
36 * StringSearch ensures that language eccentricity can be
37 * handled, e.g. for the German collator, characters ß and SS will be matched
38 * if case is chosen to be ignored.
39 * See the <a href="http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm">
40 * "ICU Collation Design Document"</a> for more information.
42 * There are 2 match options for selection:<br>
43 * Let S' be the sub-string of a text string S between the offsets start and
46 * A pattern string P matches a text string S at the offsets [start, end]
49 * option 1. Some canonical equivalent of P matches some canonical equivalent
51 * option 2. P matches S' and if P starts or ends with a combining mark,
52 * there exists no non-ignorable combining mark before or after S' in S, respectively.
55 * Option 2. will be the default.
57 * This search has APIs similar to that of other text iteration mechanisms
58 * such as the break iterators in <tt>BreakIterator</tt>. Using these
59 * APIs, it is easy to scan through text looking for all occurrences of
60 * a given pattern. This search iterator allows changing of direction by
61 * calling a <tt>reset</tt> followed by a <tt>next</tt> or <tt>previous</tt>.
62 * Though a direction change can occur without calling <tt>reset</tt> first,
63 * this operation comes with some speed penalty.
64 * Matches found in the forward direction are the same as those found in
65 * the backward direction, but in reverse order.
67 * <tt>SearchIterator</tt> provides APIs to specify the starting position
68 * within the text string to be searched, e.g. <tt>setOffset</tt>,
69 * <tt>preceding</tt> and <tt>following</tt>. Since the
70 * starting position will be set as it is specified, please take note that
71 * there are some danger points at which the search may render incorrect results:
74 * <li> The midst of a substring that requires normalization.
75 * <li> If the following match is to be found, the position should not be the
76 * second character which requires to be swapped with the preceding
77 * character. Vice versa, if the preceding match is to be found,
78 * position to search from should not be the first character which
79 * requires to be swapped with the next character. E.g. certain Thai and
80 * Lao characters require swapping.
81 * <li> If a following pattern match is to be found, any position within a
82 * contracting sequence except the first will fail. Vice versa if a
83 * preceding pattern match is to be found, an invalid starting point
84 * would be any character within a contracting sequence except the last.
87 * A <tt>BreakIterator</tt> can be used if only matches at logical breaks are desired.
88 * Using a <tt>BreakIterator</tt> will only give you results that exactly match the
89 * boundaries given by the breakiterator. For instance the pattern "e" will
90 * not be found in the string "\u00e9" if a character break iterator is used.
92 * Options are provided to handle overlapping matches.
93 * E.g. In English, overlapping matches produce the results 0 and 2
94 * for the pattern "abab" in the text "ababab", whereas mutually
95 * exclusive matches only produce the result 0.
97 * Though collator attributes will be taken into consideration while
98 * performing matches, there are no APIs here for setting and getting the
99 * attributes. These attributes can be set by getting the collator
100 * from <tt>getCollator</tt> and using the APIs in <tt>coll.h</tt>.
101 * Lastly to update <tt>StringSearch</tt> to the new collator attributes,
102 * <tt>reset</tt> has to be called.
105 * Currently there are no composite characters that consist of a
106 * character with combining class > 0 before a character with combining
107 * class == 0. However, if such a character exists in the future,
108 * <tt>StringSearch</tt> does not guarantee the results for option 1.
110 * Consult the <tt>SearchIterator</tt> documentation for information on
111 * and examples of how to use instances of this class to implement text searching.
114 * UnicodeString target("The quick brown fox jumps over the lazy dog.");
115 * UnicodeString pattern("fox");
117 * UErrorCode error = U_ZERO_ERROR;
118 * StringSearch iter(pattern, target, Locale::getUS(), NULL, error);
119 * for (int pos = iter.first(error);
120 * pos != USEARCH_DONE;
121 * pos = iter.next(error))
123 * printf("Found match at %d pos, length is %d\n", pos,
124 * iter.getMatchLength());
128 * Note, <tt>StringSearch</tt> is not to be subclassed.
130 * @see SearchIterator
131 * @see RuleBasedCollator
135 class U_I18N_API StringSearch U_FINAL
: public SearchIterator
139 // public constructors and destructors --------------------------------
142 * Creating a <tt>StringSearch</tt> instance using the argument locale
143 * language rule set. A collator will be created in the process, which
144 * will be owned by this instance and will be deleted during destruction.
146 * @param pattern The text for which this object will search.
147 * @param text The text in which to search for the pattern.
148 * @param locale A locale which defines the language-sensitive
149 * comparison rules used to determine whether text in the
150 * pattern and target matches.
151 * @param breakiter A <tt>BreakIterator</tt> object used to constrain
152 * the matches that are found. Matches whose start and end
153 * indices in the target text are not boundaries as
154 * determined by the <tt>BreakIterator</tt> are
155 * ignored. If this behavior is not desired,
156 * <tt>NULL</tt> can be passed in instead.
157 * @param status for errors if any. If pattern or text is NULL, or if
158 * either the length of pattern or text is 0 then an
159 * U_ILLEGAL_ARGUMENT_ERROR is returned.
162 StringSearch(const UnicodeString
&pattern
, const UnicodeString
&text
,
163 const Locale
&locale
,
164 BreakIterator
*breakiter
,
168 * Creating a <tt>StringSearch</tt> instance using the argument collator
169 * language rule set. Note, user retains the ownership of this collator,
170 * it does not get destroyed during this instance's destruction.
171 * @param pattern The text for which this object will search.
172 * @param text The text in which to search for the pattern.
173 * @param coll A <tt>RuleBasedCollator</tt> object which defines
174 * the language-sensitive comparison rules used to
175 * determine whether text in the pattern and target
176 * matches. User is responsible for the clearing of this object.
178 * @param breakiter A <tt>BreakIterator</tt> object used to constrain
179 * the matches that are found. Matches whose start and end
180 * indices in the target text are not boundaries as
181 * determined by the <tt>BreakIterator</tt> are
182 * ignored. If this behavior is not desired,
183 * <tt>NULL</tt> can be passed in instead.
184 * @param status for errors if any. If either the length of pattern or
185 * text is 0 then an U_ILLEGAL_ARGUMENT_ERROR is returned.
188 StringSearch(const UnicodeString
&pattern
,
189 const UnicodeString
&text
,
190 RuleBasedCollator
*coll
,
191 BreakIterator
*breakiter
,
195 * Creating a <tt>StringSearch</tt> instance using the argument locale
196 * language rule set. A collator will be created in the process, which
197 * will be owned by this instance and will be deleted during destruction.
200 * Note: No parsing of the text within the <tt>CharacterIterator</tt>
201 * will be done during searching for this version. The block of text
202 * in <tt>CharacterIterator</tt> will be used as it is.
203 * @param pattern The text for which this object will search.
204 * @param text The text iterator in which to search for the pattern.
205 * @param locale A locale which defines the language-sensitive
206 * comparison rules used to determine whether text in the
207 * pattern and target matches. User is responsible for
208 * the clearing of this object.
209 * @param breakiter A <tt>BreakIterator</tt> object used to constrain
210 * the matches that are found. Matches whose start and end
211 * indices in the target text are not boundaries as
212 * determined by the <tt>BreakIterator</tt> are
213 * ignored. If this behavior is not desired,
214 * <tt>NULL</tt> can be passed in instead.
215 * @param status for errors if any. If either the length of pattern or
216 * text is 0 then an U_ILLEGAL_ARGUMENT_ERROR is returned.
219 StringSearch(const UnicodeString
&pattern
, CharacterIterator
&text
,
220 const Locale
&locale
,
221 BreakIterator
*breakiter
,
225 * Creating a <tt>StringSearch</tt> instance using the argument collator
226 * language rule set. Note, user retains the ownership of this collator,
227 * it does not get destroyed during this instance's destruction.
229 * Note: No parsing of the text within the <tt>CharacterIterator</tt>
230 * will be done during searching for this version. The block of text
231 * in <tt>CharacterIterator</tt> will be used as it is.
232 * @param pattern The text for which this object will search.
233 * @param text The text in which to search for the pattern.
234 * @param coll A <tt>RuleBasedCollator</tt> object which defines
235 * the language-sensitive comparison rules used to
236 * determine whether text in the pattern and target
237 * matches. User is responsible for the clearing of this object.
239 * @param breakiter A <tt>BreakIterator</tt> object used to constrain
240 * the matches that are found. Matches whose start and end
241 * indices in the target text are not boundaries as
242 * determined by the <tt>BreakIterator</tt> are
243 * ignored. If this behavior is not desired,
244 * <tt>NULL</tt> can be passed in instead.
245 * @param status for errors if any. If either the length of pattern or
246 * text is 0 then an U_ILLEGAL_ARGUMENT_ERROR is returned.
249 StringSearch(const UnicodeString
&pattern
, CharacterIterator
&text
,
250 RuleBasedCollator
*coll
,
251 BreakIterator
*breakiter
,
255 * Copy constructor that creates a StringSearch instance with the same
256 * behavior, and iterating over the same text.
257 * @param that StringSearch instance to be copied.
260 StringSearch(const StringSearch
&that
);
263 * Destructor. Cleans up the search iterator data struct.
264 * If a collator is created in the constructor, it will be destroyed here.
267 virtual ~StringSearch(void);
271 * Clones can be used concurrently in multiple threads.
272 * If an error occurs, then NULL is returned.
273 * The caller must delete the clone.
275 * @return a clone of this object
277 * @see getDynamicClassID
280 StringSearch
*clone() const;
282 // operator overloading ---------------------------------------------
285 * Assignment operator. Sets this iterator to have the same behavior,
286 * and iterate over the same text, as the one passed in.
287 * @param that instance to be copied.
290 StringSearch
& operator=(const StringSearch
&that
);
294 * @param that instance to be compared.
295 * @return TRUE if both instances have the same attributes,
296 * breakiterators, collators and iterate over the same text
297 * while looking for the same pattern.
300 virtual UBool
operator==(const SearchIterator
&that
) const;
302 // public get and set methods ----------------------------------------
305 * Sets the index to point to the given position, and clears any state
308 * This method takes the argument index and sets the position in the text
309 * string accordingly without checking if the index is pointing to a
310 * valid starting point to begin searching.
311 * @param position within the text to be set. If position is less
312 * than or greater than the text range for searching,
313 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned
314 * @param status for errors if it occurs
317 virtual void setOffset(int32_t position
, UErrorCode
&status
);
320 * Return the current index in the text being searched.
321 * If the iteration has gone past the end of the text
322 * (or past the beginning for a backwards search), USEARCH_DONE is returned.
324 * @return current index in the text being searched.
327 virtual int32_t getOffset(void) const;
330 * Set the target text to be searched.
331 * Text iteration will hence begin at the start of the text string.
333 * This method is useful if you want to re-use an iterator to search for the same
334 * pattern within a different body of text.
335 * @param text text string to be searched
336 * @param status for errors if any. If the text length is 0 then an
337 * U_ILLEGAL_ARGUMENT_ERROR is returned.
340 virtual void setText(const UnicodeString
&text
, UErrorCode
&status
);
343 * Set the target text to be searched.
344 * Text iteration will hence begin at the start of the text string.
346 * This method is useful if you want to re-use an iterator to search for the same
347 * pattern within a different body of text.
348 * Note: No parsing of the text within the <tt>CharacterIterator</tt>
349 * will be done during searching for this version. The block of text
350 * in <tt>CharacterIterator</tt> will be used as it is.
351 * @param text text string to be searched
352 * @param status for errors if any. If the text length is 0 then an
353 * U_ILLEGAL_ARGUMENT_ERROR is returned.
356 virtual void setText(CharacterIterator
&text
, UErrorCode
&status
);
359 * Gets the collator used for the language rules.
361 * Caller may modify but <b>must not</b> delete the <tt>RuleBasedCollator</tt>!
362 * Modifications to this collator will affect the original collator passed in to
363 * the <tt>StringSearch</tt> constructor or to setCollator, if any.
364 * @return collator used for string search
367 RuleBasedCollator
* getCollator() const;
370 * Sets the collator used for the language rules. User retains the
371 * ownership of this collator, thus the responsibility of deletion lies
372 * with the user. The iterator's position will not be changed by this method.
373 * @param coll collator
374 * @param status for errors if any
377 void setCollator(RuleBasedCollator
*coll
, UErrorCode
&status
);
380 * Sets the pattern used for matching.
381 * The iterator's position will not be changed by this method.
382 * @param pattern search pattern to be found
383 * @param status for errors if any. If the pattern length is 0 then an
384 * U_ILLEGAL_ARGUMENT_ERROR is returned.
387 void setPattern(const UnicodeString
&pattern
, UErrorCode
&status
);
390 * Gets the search pattern.
391 * @return pattern used for matching
394 const UnicodeString
& getPattern() const;
396 // public methods ----------------------------------------------------
399 * Reset the iteration.
400 * Search will begin at the start of the text string if a forward
401 * iteration is initiated before a backwards iteration. Otherwise if
402 * a backwards iteration is initiated before a forwards iteration, the
403 * search will begin at the end of the text string.
406 virtual void reset();
409 * Returns a copy of StringSearch with the same behavior, and
410 * iterating over the same text, as this one. Note that all data will be
411 * replicated, except for the user-specified collator and the break iterator.
413 * @return cloned object
416 virtual SearchIterator
* safeClone(void) const;
419 * ICU "poor man's RTTI", returns a UClassID for the actual class.
423 virtual UClassID
getDynamicClassID() const;
426 * ICU "poor man's RTTI", returns a UClassID for this class.
430 static UClassID U_EXPORT2
getStaticClassID();
434 // protected method -------------------------------------------------
437 * Search forward for matching text, starting at a given location.
438 * Clients should not call this method directly; instead they should
439 * call {@link SearchIterator#next }.
441 * If a match is found, this method returns the index at which the match
442 * starts and calls {@link SearchIterator#setMatchLength } with the number
443 * of characters in the target text that make up the match. If no match
444 * is found, the method returns <tt>USEARCH_DONE</tt>.
446 * The <tt>StringSearch</tt> is adjusted so that its current index
447 * (as returned by {@link #getOffset }) is the match position if one was found.
449 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
450 * the <tt>StringSearch</tt> will be adjusted to the index USEARCH_DONE.
451 * @param position The index in the target text at which the search starts.
453 * @param status for errors if any occurs
454 * @return The index at which the matched text in the target starts, or
455 * USEARCH_DONE if no match was found.
458 virtual int32_t handleNext(int32_t position
, UErrorCode
&status
);
461 * Search backward for matching text, starting at a given location.
462 * Clients should not call this method directly; instead they should call
463 * <tt>SearchIterator.previous()</tt>, which this method overrides.
465 * If a match is found, this method returns the index at which the match
466 * starts and calls {@link SearchIterator#setMatchLength } with the number
467 * of characters in the target text that make up the match. If no match
468 * is found, the method returns <tt>USEARCH_DONE</tt>.
470 * The <tt>StringSearch</tt> is adjusted so that its current index
471 * (as returned by {@link #getOffset }) is the match position if one was found.
473 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
474 * the <tt>StringSearch</tt> will be adjusted to the index USEARCH_DONE.
475 * @param position The index in the target text at which the search starts.
477 * @param status for errors if any occurs
478 * @return The index at which the matched text in the target starts, or
479 * USEARCH_DONE if no match was found.
482 virtual int32_t handlePrev(int32_t position
, UErrorCode
&status
);
485 StringSearch(); // default constructor not implemented
487 // private data members ----------------------------------------------
493 UnicodeString m_pattern_
;
495 * String search struct data
498 UStringSearch
*m_strsrch_
;
503 #endif // U_SHOW_CPLUSPLUS_API
505 #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */