2 **********************************************************************
3 * Copyright (C) 2001-2003 IBM and others. All rights reserved.
4 **********************************************************************
5 * Date Name Description
6 * 03/22/2000 helena Creation.
7 **********************************************************************
13 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_COLLATION
17 #include "unicode/uobject.h"
18 #include "unicode/unistr.h"
19 #include "unicode/chariter.h"
20 #include "unicode/brkiter.h"
21 #include "unicode/usearch.h"
// Declares USearch as a plain type name for struct USearch, so later
// declarations can refer to the C search data struct without the struct
// keyword. The struct itself is not defined in this part of the file.
30 typedef struct USearch USearch
;
35 * <tt>SearchIterator</tt> is an abstract base class that provides
36 * methods to search for a pattern within a text string. Instances of
37 * <tt>SearchIterator</tt> maintain a current position and scan over the
38 * target text, returning the indices at which the pattern is matched and the length
41 * <tt>SearchIterator</tt> defines a protocol for text searching.
42 * Subclasses provide concrete implementations of various search algorithms.
43 * For example, <tt>StringSearch</tt> implements language-sensitive pattern
44 * matching based on the comparison rules defined in a
45 * <tt>RuleBasedCollator</tt> object.
47 * Other options for searching include using a BreakIterator to restrict
48 * the points at which matches are detected.
50 * <tt>SearchIterator</tt> provides an API that is similar to that of
51 * other text iteration classes such as <tt>BreakIterator</tt>. Using
52 * this class, it is easy to scan through text looking for all occurrences of
53 * a given pattern. The following example uses a <tt>StringSearch</tt>
54 * object to find all instances of "fox" in the target string. Any other
55 * subclass of <tt>SearchIterator</tt> can be used in an identical
58 * UnicodeString target("The quick brown fox jumped over the lazy fox");
59 * UnicodeString pattern("fox");
61 * SearchIterator *iter = new StringSearch(pattern, target);
62 * UErrorCode error = U_ZERO_ERROR;
63 * for (int pos = iter->first(error); pos != USEARCH_DONE;
64 * pos = iter->next(error)) {
65 * printf("Found match at %d pos, length is %d\n", pos,
66 * iter->getMatchedLength());
71 * @see RuleBasedCollator
73 class U_I18N_API SearchIterator
: public UObject
{
77 // public constructors and destructors -------------------------------
80 * Copy constructor that creates a SearchIterator instance with the same
81 * behavior, and iterating over the same text.
82 * @param other the SearchIterator instance to be copied.
85 SearchIterator(const SearchIterator
&other
);
88 * Destructor. Cleans up the search iterator data struct.
91 virtual ~SearchIterator();
93 // public get and set methods ----------------------------------------
96 * Sets the index to point to the given position, and clears any state
99 * This method takes the argument index and sets the position in the text
100 * string accordingly without checking if the index is pointing to a
101 * valid starting point to begin searching.
102 * @param position within the text to be set. If position falls
103 * outside the text range for searching,
104 * a U_INDEX_OUTOFBOUNDS_ERROR will be returned
105 * @param status for errors if any occur
108 virtual void setOffset(int32_t position
, UErrorCode
&status
) = 0;
111 * Return the current index in the text being searched.
112 * If the iteration has gone past the end of the text
113 * (or past the beginning for a backwards search), USEARCH_DONE
115 * @return current index in the text being searched.
118 virtual int32_t getOffset(void) const = 0;
121 * Sets the text searching attributes located in the enum
122 * USearchAttribute with values from the enum USearchAttributeValue.
123 * USEARCH_DEFAULT can be used for all attributes for resetting.
124 * @param attribute text attribute (enum USearchAttribute) to be set
125 * @param value text attribute value
126 * @param status for errors if it occurs
129 void setAttribute(USearchAttribute attribute
,
130 USearchAttributeValue value
,
134 * Gets the text searching attributes
135 * @param attribute text attribute (enum USearchAttribute) to be retrieved
136 * @return text attribute value
139 USearchAttributeValue
getAttribute(USearchAttribute attribute
) const;
142 * Returns the index to the match in the text string that was searched.
143 * This call returns a valid result only after a successful call to
144 * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
145 * Just after construction, or after a searching method returns
146 * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>.
148 * Use getMatchedLength to get the matched string length.
149 * @return index of a substring within the text string that is being
157 int32_t getMatchedStart(void) const;
160 * Returns the length of text in the string which matches the search
161 * pattern. This call returns a valid result only after a successful call
162 * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
163 * Just after construction, or after a searching method returns
164 * <tt>USEARCH_DONE</tt>, this method will return 0.
165 * @return The length of the match in the target text, or 0 if there
166 * is no match currently.
173 int32_t getMatchedLength(void) const;
176 * Returns the text that was matched by the most recent call to
177 * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
178 * If the iterator is not pointing at a valid match (e.g. just after
179 * construction or after <tt>USEARCH_DONE</tt> has been returned),
180 * returns an empty string.
181 * @param result stores the matched string or an empty string if a match
189 void getMatchedText(UnicodeString
&result
) const;
192 * Set the BreakIterator that will be used to restrict the points
193 * at which matches are detected. The user is responsible for deleting
195 * @param breakiter A BreakIterator that will be used to restrict the
196 * points at which matches are detected. If a match is
197 * found, but the match's start or end index is not a
198 * boundary as determined by the <tt>BreakIterator</tt>,
199 * the match will be rejected and another will be searched
200 * for. If this parameter is <tt>NULL</tt>, no break
201 * detection is attempted.
202 * @param status for errors if it occurs
206 void setBreakIterator(BreakIterator
*breakiter
, UErrorCode
&status
);
209 * Returns the BreakIterator that is used to restrict the points at
210 * which matches are detected. This will be the same object that was
211 * passed to the constructor or to <tt>setBreakIterator</tt>.
212 * Note that <tt>NULL</tt> is a legal value; it means that break
213 * detection should not be attempted.
214 * @return BreakIterator used to restrict matchings.
215 * @see #setBreakIterator
218 const BreakIterator
* getBreakIterator(void) const;
221 * Set the string text to be searched. Text iteration will hence begin at
222 * the start of the text string. This method is useful if you want to
223 * re-use an iterator to search for the same pattern within a different
224 * body of text. The user is responsible for deleting the text.
225 * @param text string to be searched.
226 * @param status for errors. If the text length is 0,
227 * a U_ILLEGAL_ARGUMENT_ERROR is returned.
230 virtual void setText(const UnicodeString
&text
, UErrorCode
&status
);
233 * Set the string text to be searched. Text iteration will hence begin at
234 * the start of the text string. This method is useful if you want to
235 * re-use an iterator to search for the same pattern within a different
238 * Note: No parsing of the text within the <tt>CharacterIterator</tt>
239 * will be done during searching for this version. The block of text
240 * in <tt>CharacterIterator</tt> will be used as it is.
241 * The user is responsible for deleting the text.
242 * @param text string iterator to be searched.
243 * @param status for errors if any. If the text length is 0 then a
244 * U_ILLEGAL_ARGUMENT_ERROR is returned.
247 virtual void setText(CharacterIterator
&text
, UErrorCode
&status
);
250 * Return the string text to be searched.
251 * @return text string to be searched.
254 const UnicodeString
& getText(void) const;
256 // operator overloading ----------------------------------------------
260 * @param that SearchIterator instance to be compared.
261 * @return TRUE if both SearchIterators are of the same class, have the
262 * same behavior, iterate over the same text and have the same
263 * attributes. FALSE otherwise.
266 virtual UBool
operator==(const SearchIterator
&that
) const;
269 * Not-equal operator.
270 * @param that SearchIterator instance to be compared.
271 * @return FALSE if operator== returns TRUE, and vice versa.
274 UBool
operator!=(const SearchIterator
&that
) const;
276 // public methods ----------------------------------------------------
279 * Returns a copy of SearchIterator with the same behavior, and
280 * iterating over the same text, as this one. Note that all data will be
281 * replicated, except for the text string to be searched.
282 * @return cloned object
285 virtual SearchIterator
* safeClone(void) const = 0;
288 * Returns the first index at which the string text matches the search
289 * pattern. The iterator is adjusted so that its current index (as
290 * returned by <tt>getOffset</tt>) is the match position if one
292 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
293 * the iterator will be adjusted to the index USEARCH_DONE
294 * @param status for errors if it occurs
295 * @return The character index of the first match, or
296 * <tt>USEARCH_DONE</tt> if there are no matches.
300 int32_t first(UErrorCode
&status
);
303 * Returns the first index greater than <tt>position</tt> at which the
304 * string text matches the search pattern. The iterator is adjusted so
305 * that its current index (as returned by <tt>getOffset</tt>) is the
306 * match position if one was found. If a match is not found,
307 * <tt>USEARCH_DONE</tt> will be returned and the iterator will be
308 * adjusted to the index USEARCH_DONE
309 * @param position where search is to start from. If position falls
310 * outside the text range for searching,
311 * a U_INDEX_OUTOFBOUNDS_ERROR will be returned
312 * @param status for errors if any occur
313 * @return The character index of the first match following
314 * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no
319 int32_t following(int32_t position
, UErrorCode
&status
);
322 * Returns the last index in the target text at which it matches the
323 * search pattern. The iterator is adjusted so that its current index
324 * (as returned by <tt>getOffset</tt>) is the match position if one was
326 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
327 * the iterator will be adjusted to the index USEARCH_DONE.
328 * @param status for errors if it occurs
329 * @return The index of the last match, or <tt>USEARCH_DONE</tt> if
330 * there are no matches.
334 int32_t last(UErrorCode
&status
);
337 * Returns the first index less than <tt>position</tt> at which the string
338 * text matches the search pattern. The iterator is adjusted so that its
339 * current index (as returned by <tt>getOffset</tt>) is the match
340 * position if one was found. If a match is not found,
341 * <tt>USEARCH_DONE</tt> will be returned and the iterator will be
342 * adjusted to the index USEARCH_DONE
343 * @param position where search is to start from. If position falls
344 * outside the text range for searching,
345 * a U_INDEX_OUTOFBOUNDS_ERROR will be returned
346 * @param status for errors if any occur
347 * @return The character index of the first match preceding
348 * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are
353 int32_t preceding(int32_t position
, UErrorCode
&status
);
356 * Returns the index of the next point at which the text matches the
357 * search pattern, starting from the current position.
358 * The iterator is adjusted so that its current index (as returned by
359 * <tt>getOffset</tt>) is the match position if one was found.
360 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
361 * the iterator will be adjusted to a position after the end of the text
363 * @param status for errors if it occurs
364 * @return The index of the next match after the current position,
365 * or <tt>USEARCH_DONE</tt> if there are no more matches.
369 int32_t next(UErrorCode
&status
);
372 * Returns the index of the previous point at which the string text
373 * matches the search pattern, starting at the current position.
374 * The iterator is adjusted so that its current index (as returned by
375 * <tt>getOffset</tt>) is the match position if one was found.
376 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
377 * the iterator will be adjusted to the index USEARCH_DONE
378 * @param status for errors if it occurs
379 * @return The index of the previous match before the current position,
380 * or <tt>USEARCH_DONE</tt> if there are no more matches.
384 int32_t previous(UErrorCode
&status
);
387 * Resets the iteration.
388 * Search will begin at the start of the text string if a forward
389 * iteration is initiated before a backwards iteration. Otherwise if a
390 * backwards iteration is initiated before a forwards iteration, the
391 * search will begin at the end of the text string.
394 virtual void reset();
397 // protected data members ---------------------------------------------
400 * C search data struct
407 * Currently the C++ breakiterator does not have getRules etc to reproduce
408 * another in C. Hence we keep the original around and do the verification
409 * at the end of the match. The user is responsible for deleting this
413 BreakIterator
*m_breakiterator_
;
416 * Unicode string version of the search text
419 UnicodeString m_text_
;
421 // protected constructors and destructors -----------------------------
424 * Default constructor.
425 * Initializes data to the default values.
431 * Constructor for use by subclasses.
432 * @param text The target text to be searched.
433 * @param breakiter A {@link BreakIterator} that is used to restrict the
434 * points at which matches are detected. If
435 * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
436 * match, but the match's start or end index is not a
437 * boundary as determined by the <tt>BreakIterator</tt>,
438 * the match is rejected and <tt>handleNext</tt> or
439 * <tt>handlePrev</tt> is called again. If this parameter
440 * is <tt>NULL</tt>, no break detection is attempted.
445 SearchIterator(const UnicodeString
&text
,
446 BreakIterator
*breakiter
= NULL
);
449 * Constructor for use by subclasses.
451 * Note: No parsing of the text within the <tt>CharacterIterator</tt>
452 * will be done during searching for this version. The block of text
453 * in <tt>CharacterIterator</tt> will be used as it is.
454 * @param text The target text to be searched.
455 * @param breakiter A {@link BreakIterator} that is used to restrict the
456 * points at which matches are detected. If
457 * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
458 * match, but the match's start or end index is not a
459 * boundary as determined by the <tt>BreakIterator</tt>,
460 * the match is rejected and <tt>handleNext</tt> or
461 * <tt>handlePrev</tt> is called again. If this parameter
462 * is <tt>NULL</tt>, no break detection is attempted.
467 SearchIterator(CharacterIterator
&text
, BreakIterator
*breakiter
= NULL
);
469 // protected methods --------------------------------------------------
472 * Assignment operator. Sets this iterator to have the same behavior,
473 * and iterate over the same text, as the one passed in.
474 * @param that instance to be copied.
477 SearchIterator
& operator=(const SearchIterator
&that
);
480 * Abstract method which subclasses override to provide the mechanism
481 * for finding the next match in the target text. This allows different
482 * subclasses to provide different search algorithms.
484 * If a match is found, the implementation should return the index at
485 * which the match starts and should call
486 * <tt>setMatchLength</tt> with the number of characters
487 * in the target text that make up the match. If no match is found, the
488 * method should return USEARCH_DONE.
490 * @param position The index in the target text at which the search
492 * @param status for error codes if it occurs.
493 * @return index at which the match starts, else if match is not found
494 * USEARCH_DONE is returned
495 * @see #setMatchLength
498 virtual int32_t handleNext(int32_t position
, UErrorCode
&status
)
502 * Abstract method which subclasses override to provide the mechanism for
503 * finding the previous match in the target text. This allows different
504 * subclasses to provide different search algorithms.
506 * If a match is found, the implementation should return the index at
507 * which the match starts and should call
508 * <tt>setMatchLength</tt> with the number of characters
509 * in the target text that make up the match. If no match is found, the
510 * method should return USEARCH_DONE.
512 * @param position The index in the target text at which the search
514 * @param status for error codes if it occurs.
515 * @return index at which the match starts, else if match is not found
516 * USEARCH_DONE is returned
517 * @see #setMatchLength
520 virtual int32_t handlePrev(int32_t position
, UErrorCode
&status
)
524 * Sets the length of the currently matched string in the text string to
526 * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
527 * methods should call this when they find a match in the target text.
528 * @param length length of the matched text.
533 virtual void setMatchLength(int32_t length
);
536 * Sets the offset of the currently matched string in the text string to
538 * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
539 * methods should call this when they find a match in the target text.
540 * @param position start offset of the matched text.
545 virtual void setMatchStart(int32_t position
);
548 * sets match not found
551 void setMatchNotFound();
554 inline UBool
SearchIterator::operator!=(const SearchIterator
&that
) const
556 return !operator==(that
);
560 #endif /* #if !UCONFIG_NO_COLLATION */