1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2001-2011 IBM and others. All rights reserved.
6 **********************************************************************
7 * Date Name Description
8 * 03/22/2000 helena Creation.
9 **********************************************************************
15 #include "unicode/utypes.h"
19 * \brief C++ API: SearchIterator object.
22 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
24 #include "unicode/uobject.h"
25 #include "unicode/unistr.h"
26 #include "unicode/chariter.h"
27 #include "unicode/brkiter.h"
28 #include "unicode/usearch.h"
// Forward declaration: introduces the struct tag USearch and a typedef of the
// same name, without defining the struct's contents at this point.
37 typedef struct USearch USearch
;
39 #if U_SHOW_CPLUSPLUS_API
44 * <tt>SearchIterator</tt> is an abstract base class that provides
45 * methods to search for a pattern within a text string. Instances of
46 * <tt>SearchIterator</tt> maintain a current position and scan over the
47 * target text, returning the indices at which the pattern is matched and the length
50 * <tt>SearchIterator</tt> defines a protocol for text searching.
51 * Subclasses provide concrete implementations of various search algorithms.
52 * For example, <tt>StringSearch</tt> implements language-sensitive pattern
53 * matching based on the comparison rules defined in a
54 * <tt>RuleBasedCollator</tt> object.
56 * Other options for searching include using a BreakIterator to restrict
57 * the points at which matches are detected.
59 * <tt>SearchIterator</tt> provides an API that is similar to that of
60 * other text iteration classes such as <tt>BreakIterator</tt>. Using
61 * this class, it is easy to scan through text looking for all occurrences of
62 * a given pattern. The following example uses a <tt>StringSearch</tt>
63 * object to find all instances of "fox" in the target string. Any other
64 * subclass of <tt>SearchIterator</tt> can be used in an identical
67 * UnicodeString target("The quick brown fox jumped over the lazy fox");
68 * UnicodeString pattern("fox");
70 * SearchIterator *iter = new StringSearch(pattern, target);
71 * UErrorCode error = U_ZERO_ERROR;
72 * for (int pos = iter->first(error); pos != USEARCH_DONE;
73 * pos = iter->next(error)) {
74 * printf("Found match at %d pos, length is %d\n", pos,
75 * iter->getMatchedLength());
80 * @see RuleBasedCollator
82 class U_I18N_API SearchIterator
: public UObject
{
86 // public constructors and destructors -------------------------------
89 * Copy constructor that creates a SearchIterator instance with the same
90 * behavior, and iterating over the same text.
91 * @param other the SearchIterator instance to be copied.
94 SearchIterator(const SearchIterator
&other
);
97 * Destructor. Cleans up the search iterator data struct.
100 virtual ~SearchIterator();
102 // public get and set methods ----------------------------------------
105 * Sets the index to point to the given position, and clears any state
108 * This method takes the argument index and sets the position in the text
109 * string accordingly without checking if the index is pointing to a
110 * valid starting point to begin searching.
111 * @param position within the text to be set. If position is less
112 * than or greater than the text range for searching,
113 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned
114 * @param status for errors if it occurs
117 virtual void setOffset(int32_t position
, UErrorCode
&status
) = 0;
120 * Return the current index in the text being searched.
121 * If the iteration has gone past the end of the text
122 * (or past the beginning for a backwards search), USEARCH_DONE
124 * @return current index in the text being searched.
127 virtual int32_t getOffset(void) const = 0;
130 * Sets the text searching attributes located in the enum
131 * USearchAttribute with values from the enum USearchAttributeValue.
132 * USEARCH_DEFAULT can be used for all attributes for resetting.
133 * @param attribute text attribute (enum USearchAttribute) to be set
134 * @param value text attribute value
135 * @param status for errors if it occurs
138 void setAttribute(USearchAttribute attribute
,
139 USearchAttributeValue value
,
143 * Gets the text searching attributes
144 * @param attribute text attribute (enum USearchAttribute) to be retrieved
145 * @return text attribute value
148 USearchAttributeValue
getAttribute(USearchAttribute attribute
) const;
151 * Returns the index to the match in the text string that was searched.
152 * This call returns a valid result only after a successful call to
153 * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
154 * Just after construction, or after a searching method returns
155 * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>.
157 * Use getMatchedLength to get the matched string length.
158 * @return index of a substring within the text string that is being
166 int32_t getMatchedStart(void) const;
169 * Returns the length of text in the string which matches the search
170 * pattern. This call returns a valid result only after a successful call
171 * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
172 * Just after construction, or after a searching method returns
173 * <tt>USEARCH_DONE</tt>, this method will return 0.
174 * @return The length of the match in the target text, or 0 if there
175 * is no match currently.
182 int32_t getMatchedLength(void) const;
185 * Returns the text that was matched by the most recent call to
186 * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
187 * If the iterator is not pointing at a valid match (e.g. just after
188 * construction or after <tt>USEARCH_DONE</tt> has been returned),
189 * returns an empty string.
190 * @param result stores the matched string or an empty string if a match
198 void getMatchedText(UnicodeString
&result
) const;
201 * Set the BreakIterator that will be used to restrict the points
202 * at which matches are detected. The user is responsible for deleting
204 * @param breakiter A BreakIterator that will be used to restrict the
205 * points at which matches are detected. If a match is
206 * found, but the match's start or end index is not a
207 * boundary as determined by the <tt>BreakIterator</tt>,
208 * the match will be rejected and another will be searched
209 * for. If this parameter is <tt>NULL</tt>, no break
210 * detection is attempted.
211 * @param status for errors if it occurs
215 void setBreakIterator(BreakIterator
*breakiter
, UErrorCode
&status
);
218 * Returns the BreakIterator that is used to restrict the points at
219 * which matches are detected. This will be the same object that was
220 * passed to the constructor or to <tt>setBreakIterator</tt>.
221 * Note that <tt>NULL</tt> is a legal value; it means that break
222 * detection should not be attempted.
223 * @return BreakIterator used to restrict matchings.
224 * @see #setBreakIterator
227 const BreakIterator
* getBreakIterator(void) const;
230 * Set the string text to be searched. Text iteration will hence begin at
231 * the start of the text string. This method is useful if you want to
232 * re-use an iterator to search for the same pattern within a different
233 * body of text. The user is responsible for deleting the text.
234 * @param text string to be searched.
235 * @param status for errors. If the text length is 0,
236 * an U_ILLEGAL_ARGUMENT_ERROR is returned.
239 virtual void setText(const UnicodeString
&text
, UErrorCode
&status
);
242 * Set the string text to be searched. Text iteration will hence begin at
243 * the start of the text string. This method is useful if you want to
244 * re-use an iterator to search for the same pattern within a different
247 * Note: No parsing of the text within the <tt>CharacterIterator</tt>
248 * will be done during searching for this version. The block of text
249 * in <tt>CharacterIterator</tt> will be used as it is.
250 * The user is responsible for deleting the text.
251 * @param text string iterator to be searched.
252 * @param status for errors if any. If the text length is 0 then an
253 * U_ILLEGAL_ARGUMENT_ERROR is returned.
256 virtual void setText(CharacterIterator
&text
, UErrorCode
&status
);
259 * Return the string text to be searched.
260 * @return text string to be searched.
263 const UnicodeString
& getText(void) const;
265 // operator overloading ----------------------------------------------
269 * @param that SearchIterator instance to be compared.
270 * @return TRUE if both SearchIterators are of the same class, have the
271 * same behavior, iterate over the same text and have the same
272 * attributes. FALSE otherwise.
275 virtual UBool
operator==(const SearchIterator
&that
) const;
278 * Not-equal operator.
279 * @param that SearchIterator instance to be compared.
280 * @return FALSE if operator== returns TRUE, and vice versa.
283 UBool
operator!=(const SearchIterator
&that
) const;
285 // public methods ----------------------------------------------------
288 * Returns a copy of SearchIterator with the same behavior, and
289 * iterating over the same text, as this one. Note that all data will be
290 * replicated, except for the text string to be searched.
291 * @return cloned object
294 virtual SearchIterator
* safeClone(void) const = 0;
297 * Returns the first index at which the string text matches the search
298 * pattern. The iterator is adjusted so that its current index (as
299 * returned by <tt>getOffset</tt>) is the match position if one
301 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
302 * the iterator will be adjusted to the index USEARCH_DONE
303 * @param status for errors if it occurs
304 * @return The character index of the first match, or
305 * <tt>USEARCH_DONE</tt> if there are no matches.
309 int32_t first(UErrorCode
&status
);
312 * Returns the first index equal or greater than <tt>position</tt> at which the
313 * string text matches the search pattern. The iterator is adjusted so
314 * that its current index (as returned by <tt>getOffset</tt>) is the
315 * match position if one was found.
316 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the
317 * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
318 * @param position where search is to start from. If position is less
319 * than or greater than the text range for searching,
320 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned
321 * @param status for errors if it occurs
322 * @return The character index of the first match following
323 * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no
328 int32_t following(int32_t position
, UErrorCode
&status
);
331 * Returns the last index in the target text at which it matches the
332 * search pattern. The iterator is adjusted so that its current index
333 * (as returned by <tt>getOffset</tt>) is the match position if one was
335 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
336 * the iterator will be adjusted to the index USEARCH_DONE.
337 * @param status for errors if it occurs
338 * @return The index of the last match, or <tt>USEARCH_DONE</tt> if
339 * there are no matches.
343 int32_t last(UErrorCode
&status
);
346 * Returns the first index less than <tt>position</tt> at which the string
347 * text matches the search pattern. The iterator is adjusted so that its
348 * current index (as returned by <tt>getOffset</tt>) is the match
349 * position if one was found. If a match is not found,
350 * <tt>USEARCH_DONE</tt> will be returned and the iterator will be
351 * adjusted to the index USEARCH_DONE
353 * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the
354 * result match is always less than <tt>position</tt>.
355 * When <tt>USEARCH_OVERLAP</tt> is on, the result match may span across
358 * @param position where search is to start from. If position is less
359 * than or greater than the text range for searching,
360 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned
361 * @param status for errors if it occurs
362 * @return The character index of the first match preceding
363 * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are
368 int32_t preceding(int32_t position
, UErrorCode
&status
);
371 * Returns the index of the next point at which the text matches the
372 * search pattern, starting from the current position.
373 * The iterator is adjusted so that its current index (as returned by
374 * <tt>getOffset</tt>) is the match position if one was found.
375 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
376 * the iterator will be adjusted to a position after the end of the text
378 * @param status for errors if it occurs
379 * @return The index of the next match after the current position,
380 * or <tt>USEARCH_DONE</tt> if there are no more matches.
384 int32_t next(UErrorCode
&status
);
387 * Returns the index of the previous point at which the string text
388 * matches the search pattern, starting at the current position.
389 * The iterator is adjusted so that its current index (as returned by
390 * <tt>getOffset</tt>) is the match position if one was found.
391 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
392 * the iterator will be adjusted to the index USEARCH_DONE
393 * @param status for errors if it occurs
394 * @return The index of the previous match before the current position,
395 * or <tt>USEARCH_DONE</tt> if there are no more matches.
399 int32_t previous(UErrorCode
&status
);
402 * Resets the iteration.
403 * Search will begin at the start of the text string if a forward
404 * iteration is initiated before a backwards iteration. Otherwise if a
405 * backwards iteration is initiated before a forwards iteration, the
406 * search will begin at the end of the text string.
409 virtual void reset();
412 // protected data members ---------------------------------------------
415 * C search data struct
422 * Currently the C++ breakiterator does not have getRules etc to reproduce
423 * another in C. Hence we keep the original around and do the verification
424 * at the end of the match. The user is responsible for deleting this
428 BreakIterator
*m_breakiterator_
;
431 * Unicode string version of the search text
434 UnicodeString m_text_
;
436 // protected constructors and destructors -----------------------------
439 * Default constructor.
440 * Initializes data to the default values.
446 * Constructor for use by subclasses.
447 * @param text The target text to be searched.
448 * @param breakiter A {@link BreakIterator} that is used to restrict the
449 * points at which matches are detected. If
450 * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
451 * match, but the match's start or end index is not a
452 * boundary as determined by the <tt>BreakIterator</tt>,
453 * the match is rejected and <tt>handleNext</tt> or
454 * <tt>handlePrev</tt> is called again. If this parameter
455 * is <tt>NULL</tt>, no break detection is attempted.
460 SearchIterator(const UnicodeString
&text
,
461 BreakIterator
*breakiter
= NULL
);
464 * Constructor for use by subclasses.
466 * Note: No parsing of the text within the <tt>CharacterIterator</tt>
467 * will be done during searching for this version. The block of text
468 * in <tt>CharacterIterator</tt> will be used as it is.
469 * @param text The target text to be searched.
470 * @param breakiter A {@link BreakIterator} that is used to restrict the
471 * points at which matches are detected. If
472 * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
473 * match, but the match's start or end index is not a
474 * boundary as determined by the <tt>BreakIterator</tt>,
475 * the match is rejected and <tt>handleNext</tt> or
476 * <tt>handlePrev</tt> is called again. If this parameter
477 * is <tt>NULL</tt>, no break detection is attempted.
482 SearchIterator(CharacterIterator
&text
, BreakIterator
*breakiter
= NULL
);
484 // protected methods --------------------------------------------------
487 * Assignment operator. Sets this iterator to have the same behavior,
488 * and iterate over the same text, as the one passed in.
489 * @param that instance to be copied.
492 SearchIterator
& operator=(const SearchIterator
&that
);
495 * Abstract method which subclasses override to provide the mechanism
496 * for finding the next match in the target text. This allows different
497 * subclasses to provide different search algorithms.
499 * If a match is found, the implementation should return the index at
500 * which the match starts and should call
501 * <tt>setMatchLength</tt> with the number of characters
502 * in the target text that make up the match. If no match is found, the
503 * method should return USEARCH_DONE.
505 * @param position The index in the target text at which the search
507 * @param status for error codes if it occurs.
508 * @return index at which the match starts, else if match is not found
509 * USEARCH_DONE is returned
510 * @see #setMatchLength
513 virtual int32_t handleNext(int32_t position
, UErrorCode
&status
)
517 * Abstract method which subclasses override to provide the mechanism for
518 * finding the previous match in the target text. This allows different
519 * subclasses to provide different search algorithms.
521 * If a match is found, the implementation should return the index at
522 * which the match starts and should call
523 * <tt>setMatchLength</tt> with the number of characters
524 * in the target text that make up the match. If no match is found, the
525 * method should return USEARCH_DONE.
527 * @param position The index in the target text at which the search
529 * @param status for error codes if it occurs.
530 * @return index at which the match starts, else if match is not found
531 * USEARCH_DONE is returned
532 * @see #setMatchLength
535 virtual int32_t handlePrev(int32_t position
, UErrorCode
&status
)
539 * Sets the length of the currently matched string in the text string to
541 * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
542 * methods should call this when they find a match in the target text.
543 * @param length length of the matched text.
548 virtual void setMatchLength(int32_t length
);
551 * Sets the offset of the currently matched string in the text string to
553 * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
554 * methods should call this when they find a match in the target text.
555 * @param position start offset of the matched text.
560 virtual void setMatchStart(int32_t position
);
563 * sets match not found
566 void setMatchNotFound();
// Inline definition of the inequality operator: the result is the logical
// negation of operator== called on this object with the same argument.
569 inline UBool
SearchIterator::operator!=(const SearchIterator
&that
) const
571 return !operator==(that
);
574 #endif // U_SHOW_CPLUSPLUS_API
576 #endif /* #if !UCONFIG_NO_COLLATION */