]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/unicode/search.h
ICU-6.2.22.tar.gz
[apple/icu.git] / icuSources / i18n / unicode / search.h
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
374ca955 3* Copyright (C) 2001-2003 IBM and others. All rights reserved.
b75a7d8f
A
4**********************************************************************
5* Date Name Description
6* 03/22/2000 helena Creation.
7**********************************************************************
8*/
9
10#ifndef SEARCH_H
11#define SEARCH_H
12
13#include "unicode/utypes.h"
14
15#if !UCONFIG_NO_COLLATION
16
17#include "unicode/uobject.h"
18#include "unicode/unistr.h"
19#include "unicode/chariter.h"
20#include "unicode/brkiter.h"
21#include "unicode/usearch.h"
22
23/**
* Forward declaration of the C search data struct. In this header it is only
* referred to through a pointer (the m_search_ member of SearchIterator).
24* @stable ICU 2.0
25*/
26struct USearch;
27/**
* Typedef that lets the struct be named as plain <tt>USearch</tt>.
28* @stable ICU 2.0
29*/
30typedef struct USearch USearch;
31
32U_NAMESPACE_BEGIN
33
34/**
35 * <tt>SearchIterator</tt> is an abstract base class that provides
36 * methods to search for a pattern within a text string. Instances of
37 * <tt>SearchIterator</tt> maintain a current position and scan over the
38 * target text, returning the indices at which the pattern is matched and the length
39 * of each match.
40 * <p>
41 * <tt>SearchIterator</tt> defines a protocol for text searching.
42 * Subclasses provide concrete implementations of various search algorithms.
43 * For example, <tt>StringSearch</tt> implements language-sensitive pattern
44 * matching based on the comparison rules defined in a
45 * <tt>RuleBasedCollator</tt> object.
46 * <p>
47 * Other options for searching include using a BreakIterator to restrict
48 * the points at which matches are detected.
49 * <p>
50 * <tt>SearchIterator</tt> provides an API that is similar to that of
51 * other text iteration classes such as <tt>BreakIterator</tt>. Using
52 * this class, it is easy to scan through text looking for all occurrences of
53 * a given pattern. The following example uses a <tt>StringSearch</tt>
54 * object to find all instances of "fox" in the target string. Any other
55 * subclass of <tt>SearchIterator</tt> can be used in an identical
56 * manner.
57 * <pre><code>
58 * UnicodeString target("The quick brown fox jumped over the lazy fox");
59 * UnicodeString pattern("fox");
60 *
61 * SearchIterator *iter = new StringSearch(pattern, target);
62 * UErrorCode error = U_ZERO_ERROR;
63 * for (int pos = iter->first(error); pos != USEARCH_DONE;
64 * pos = iter->next(error)) {
65 * printf("Found match at %d pos, length is %d\n", pos,
66 * iter->getMatchedLength());
67 * }
68 * </code></pre>
69 *
70 * @see StringSearch
71 * @see RuleBasedCollator
72 */
73class U_I18N_API SearchIterator : public UObject {
74
75public:
76
77 // public constructors and destructors -------------------------------
78
79 /**
80 * Copy constructor that creates a SearchIterator instance with the same
81 * behavior, and iterating over the same text.
82 * @param other the SearchIterator instance to be copied.
83 * @stable ICU 2.0
84 */
85 SearchIterator(const SearchIterator &other);
86
87 /**
88 * Destructor. Cleans up the search iterator data struct.
89 * @stable ICU 2.0
90 */
91 virtual ~SearchIterator();
92
93 // public get and set methods ----------------------------------------
94
95 /**
96 * Sets the index to point to the given position, and clears any state
97 * that's affected.
98 * <p>
99 * This method takes the argument index and sets the position in the text
100 * string accordingly without checking if the index is pointing to a
101 * valid starting point to begin searching.
102 * @param position within the text to be set. If position is less
374ca955 103 * than or greater than the text range for searching,
b75a7d8f
A
104 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned
105 * @param status for errors if it occurs
106 * @stable ICU 2.0
107 */
108 virtual void setOffset(int32_t position, UErrorCode &status) = 0;
109
110 /**
111 * Return the current index in the text being searched.
112 * If the iteration has gone past the end of the text
113 * (or past the beginning for a backwards search), USEARCH_DONE
114 * is returned.
115 * @return current index in the text being searched.
116 * @stable ICU 2.0
117 */
118 virtual int32_t getOffset(void) const = 0;
119
120 /**
121 * Sets the text searching attributes located in the enum
122 * USearchAttribute with values from the enum USearchAttributeValue.
123 * USEARCH_DEFAULT can be used for all attributes for resetting.
124 * @param attribute text attribute (enum USearchAttribute) to be set
125 * @param value text attribute value
126 * @param status for errors if it occurs
127 * @stable ICU 2.0
128 */
129 void setAttribute(USearchAttribute attribute,
130 USearchAttributeValue value,
131 UErrorCode &status);
132
133 /**
134 * Gets the text searching attributes
135 * @param attribute text attribute (enum USearchAttribute) to be retrieved
136 * @return text attribute value
137 * @stable ICU 2.0
138 */
139 USearchAttributeValue getAttribute(USearchAttribute attribute) const;
140
141 /**
142 * Returns the index to the match in the text string that was searched.
143 * This call returns a valid result only after a successful call to
144 * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
145 * Just after construction, or after a searching method returns
146 * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>.
147 * <p>
148 * Use getMatchedLength to get the matched string length.
149 * @return index of a substring within the text string that is being
150 * searched.
151 * @see #first
152 * @see #next
153 * @see #previous
154 * @see #last
155 * @stable ICU 2.0
156 */
157 int32_t getMatchedStart(void) const;
158
159 /**
160 * Returns the length of text in the string which matches the search
161 * pattern. This call returns a valid result only after a successful call
162 * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
163 * Just after construction, or after a searching method returns
164 * <tt>USEARCH_DONE</tt>, this method will return 0.
165 * @return The length of the match in the target text, or 0 if there
166 * is no match currently.
167 * @see #first
168 * @see #next
169 * @see #previous
170 * @see #last
171 * @stable ICU 2.0
172 */
173 int32_t getMatchedLength(void) const;
174
175 /**
176 * Returns the text that was matched by the most recent call to
177 * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
178 * If the iterator is not pointing at a valid match (e.g. just after
179 * construction or after <tt>USEARCH_DONE</tt> has been returned),
180 * returns an empty string.
181 * @param result stores the matched string or an empty string if a match
182 * is not found.
183 * @see #first
184 * @see #next
185 * @see #previous
186 * @see #last
187 * @stable ICU 2.0
188 */
189 void getMatchedText(UnicodeString &result) const;
190
191 /**
192 * Set the BreakIterator that will be used to restrict the points
193 * at which matches are detected. The user is responsible for deleting
194 * the breakiterator.
195 * @param breakiter A BreakIterator that will be used to restrict the
196 * points at which matches are detected. If a match is
197 * found, but the match's start or end index is not a
198 * boundary as determined by the <tt>BreakIterator</tt>,
199 * the match will be rejected and another will be searched
200 * for. If this parameter is <tt>NULL</tt>, no break
201 * detection is attempted.
202 * @param status for errors if it occurs
203 * @see BreakIterator
204 * @stable ICU 2.0
205 */
206 void setBreakIterator(BreakIterator *breakiter, UErrorCode &status);
207
208 /**
209 * Returns the BreakIterator that is used to restrict the points at
210 * which matches are detected. This will be the same object that was
211 * passed to the constructor or to <tt>setBreakIterator</tt>.
212 * Note that <tt>NULL</tt> is a legal value; it means that break
213 * detection should not be attempted.
214 * @return BreakIterator used to restrict matchings.
215 * @see #setBreakIterator
216 * @stable ICU 2.0
217 */
218 const BreakIterator * getBreakIterator(void) const;
219
220 /**
221 * Set the string text to be searched. Text iteration will hence begin at
222 * the start of the text string. This method is useful if you want to
223 * re-use an iterator to search for the same pattern within a different
224 * body of text. The user is responsible for deleting the text.
225 * @param text string to be searched.
226 * @param status for errors. If the text length is 0,
227 * an U_ILLEGAL_ARGUMENT_ERROR is returned.
228 * @stable ICU 2.0
229 */
230 virtual void setText(const UnicodeString &text, UErrorCode &status);
231
232 /**
233 * Set the string text to be searched. Text iteration will hence begin at
234 * the start of the text string. This method is useful if you want to
235 * re-use an iterator to search for the same pattern within a different
236 * body of text.
237 * <p>
238 * Note: No parsing of the text within the <tt>CharacterIterator</tt>
239 * will be done during searching for this version. The block of text
240 * in <tt>CharacterIterator</tt> will be used as it is.
241 * The user is responsible for deleting the text.
242 * @param text string iterator to be searched.
243 * @param status for errors if any. If the text length is 0 then an
244 * U_ILLEGAL_ARGUMENT_ERROR is returned.
245 * @stable ICU 2.0
246 */
247 virtual void setText(CharacterIterator &text, UErrorCode &status);
248
249 /**
250 * Return the string text to be searched.
251 * @return text string to be searched.
252 * @stable ICU 2.0
253 */
254 const UnicodeString & getText(void) const;
255
256 // operator overloading ----------------------------------------------
257
258 /**
259 * Equality operator.
260 * @param that SearchIterator instance to be compared.
261 * @return TRUE if both SearchIterators are of the same class, have the
262 * same behavior, iterate over the same text and have the same
263 * attributes. FALSE otherwise.
264 * @stable ICU 2.0
265 */
266 virtual UBool operator==(const SearchIterator &that) const;
267
268 /**
269 * Not-equal operator.
270 * @param that SearchIterator instance to be compared.
271 * @return FALSE if operator== returns TRUE, and vice versa.
272 * @stable ICU 2.0
273 */
274 UBool operator!=(const SearchIterator &that) const;
275
276 // public methods ----------------------------------------------------
277
278 /**
279 * Returns a copy of SearchIterator with the same behavior, and
280 * iterating over the same text, as this one. Note that all data will be
281 * replicated, except for the text string to be searched.
282 * @return cloned object
283 * @stable ICU 2.0
284 */
285 virtual SearchIterator* safeClone(void) const = 0;
286
287 /**
288 * Returns the first index at which the string text matches the search
289 * pattern. The iterator is adjusted so that its current index (as
290 * returned by <tt>getOffset</tt>) is the match position if one
291 * was found.
292 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
293 * the iterator will be adjusted to the index USEARCH_DONE
294 * @param status for errors if it occurs
295 * @return The character index of the first match, or
296 * <tt>USEARCH_DONE</tt> if there are no matches.
297 * @see #getOffset
298 * @stable ICU 2.0
299 */
300 int32_t first(UErrorCode &status);
301
302 /**
303 * Returns the first index greater than <tt>position</tt> at which the
304 * string text matches the search pattern. The iterator is adjusted so
305 * that its current index (as returned by <tt>getOffset</tt>) is the
306 * match position if one was found. If a match is not found,
307 * <tt>USEARCH_DONE</tt> will be returned and the iterator will be
308 * adjusted to the index USEARCH_DONE
309 * @param position where search is to start from. If position is less
374ca955 310 * than or greater than the text range for searching,
b75a7d8f
A
311 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned
312 * @param status for errors if it occurs
313 * @return The character index of the first match following
314 * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no
315 * matches.
316 * @see #getOffset
317 * @stable ICU 2.0
318 */
319 int32_t following(int32_t position, UErrorCode &status);
320
321 /**
322 * Returns the last index in the target text at which it matches the
323 * search pattern. The iterator is adjusted so that its current index
324 * (as returned by <tt>getOffset</tt>) is the match position if one was
325 * found.
326 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
327 * the iterator will be adjusted to the index USEARCH_DONE.
328 * @param status for errors if it occurs
329 * @return The index of the last match, or <tt>USEARCH_DONE</tt> if
330 * there are no matches.
331 * @see #getOffset
332 * @stable ICU 2.0
333 */
334 int32_t last(UErrorCode &status);
335
336 /**
337 * Returns the first index less than <tt>position</tt> at which the string
338 * text matches the search pattern. The iterator is adjusted so that its
339 * current index (as returned by <tt>getOffset</tt>) is the match
340 * position if one was found. If a match is not found,
341 * <tt>USEARCH_DONE</tt> will be returned and the iterator will be
342 * adjusted to the index USEARCH_DONE
343 * @param position where search is to start from. If position is less
374ca955 344 * than or greater than the text range for searching,
b75a7d8f
A
345 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned
346 * @param status for errors if it occurs
347 * @return The character index of the first match preceding
348 * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are
349 * no matches.
350 * @see #getOffset
351 * @stable ICU 2.0
352 */
353 int32_t preceding(int32_t position, UErrorCode &status);
354
355 /**
356 * Returns the index of the next point at which the text matches the
357 * search pattern, starting from the current position.
358 * The iterator is adjusted so that its current index (as returned by
359 * <tt>getOffset</tt>) is the match position if one was found.
360 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
361 * the iterator will be adjusted to a position after the end of the text
362 * string.
363 * @param status for errors if it occurs
364 * @return The index of the next match after the current position,
365 * or <tt>USEARCH_DONE</tt> if there are no more matches.
366 * @see #getOffset
367 * @stable ICU 2.0
368 */
369 int32_t next(UErrorCode &status);
370
371 /**
372 * Returns the index of the previous point at which the string text
373 * matches the search pattern, starting at the current position.
374 * The iterator is adjusted so that its current index (as returned by
375 * <tt>getOffset</tt>) is the match position if one was found.
376 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
377 * the iterator will be adjusted to the index USEARCH_DONE
378 * @param status for errors if it occurs
379 * @return The index of the previous match before the current position,
380 * or <tt>USEARCH_DONE</tt> if there are no more matches.
381 * @see #getOffset
382 * @stable ICU 2.0
383 */
384 int32_t previous(UErrorCode &status);
385
386 /**
387 * Resets the iteration.
388 * Search will begin at the start of the text string if a forward
389 * iteration is initiated before a backwards iteration. Otherwise if a
390 * backwards iteration is initiated before a forwards iteration, the
391 * search will begin at the end of the text string.
392 * @stable ICU 2.0
393 */
394 virtual void reset();
395
396protected:
397 // protected data members ---------------------------------------------
398
399 /**
400 * C search data struct
401 * @stable ICU 2.0
402 */
403 USearch *m_search_;
404
405 /**
406 * Break iterator.
407 * Currently the C++ breakiterator does not have getRules etc to reproduce
408 * another in C. Hence we keep the original around and do the verification
409 * at the end of the match. The user is responsible for deleting this
410 * break iterator.
411 * @stable ICU 2.0
412 */
413 BreakIterator *m_breakiterator_;
414
415 /**
416 * Unicode string version of the search text
417 * @stable ICU 2.0
418 */
419 UnicodeString m_text_;
420
421 // protected constructors and destructors -----------------------------
422
423 /**
424 * Default constructor.
425 * Initializes data to the default values.
426 * @stable ICU 2.0
427 */
428 SearchIterator();
429
430 /**
431 * Constructor for use by subclasses.
432 * @param text The target text to be searched.
433 * @param breakiter A {@link BreakIterator} that is used to restrict the
434 * points at which matches are detected. If
435 * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
436 * match, but the match's start or end index is not a
437 * boundary as determined by the <tt>BreakIterator</tt>,
438 * the match is rejected and <tt>handleNext</tt> or
439 * <tt>handlePrev</tt> is called again. If this parameter
440 * is <tt>NULL</tt>, no break detection is attempted.
441 * @see #handleNext
442 * @see #handlePrev
374ca955 443 * @stable ICU 2.0
b75a7d8f
A
444 */
445 SearchIterator(const UnicodeString &text,
446 BreakIterator *breakiter = NULL);
447
448 /**
449 * Constructor for use by subclasses.
450 * <p>
451 * Note: No parsing of the text within the <tt>CharacterIterator</tt>
452 * will be done during searching for this version. The block of text
453 * in <tt>CharacterIterator</tt> will be used as it is.
454 * @param text The target text to be searched.
455 * @param breakiter A {@link BreakIterator} that is used to restrict the
456 * points at which matches are detected. If
457 * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
458 * match, but the match's start or end index is not a
459 * boundary as determined by the <tt>BreakIterator</tt>,
460 * the match is rejected and <tt>handleNext</tt> or
461 * <tt>handlePrev</tt> is called again. If this parameter
462 * is <tt>NULL</tt>, no break detection is attempted.
463 * @see #handleNext
464 * @see #handlePrev
374ca955 465 * @stable ICU 2.0
b75a7d8f
A
466 */
467 SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL);
468
469 // protected methods --------------------------------------------------
470
471 /**
472 * Assignment operator. Sets this iterator to have the same behavior,
473 * and iterate over the same text, as the one passed in.
474 * @param that instance to be copied.
374ca955 475 * @stable ICU 2.0
b75a7d8f
A
476 */
477 SearchIterator & operator=(const SearchIterator &that);
478
479 /**
480 * Abstract method which subclasses override to provide the mechanism
481 * for finding the next match in the target text. This allows different
482 * subclasses to provide different search algorithms.
483 * <p>
484 * If a match is found, the implementation should return the index at
485 * which the match starts and should call
486 * <tt>setMatchLength</tt> with the number of characters
487 * in the target text that make up the match. If no match is found, the
488 * method should return USEARCH_DONE.
489 * <p>
490 * @param position The index in the target text at which the search
491 * should start.
492 * @param status for error codes if it occurs.
493 * @return index at which the match starts, else if match is not found
494 * USEARCH_DONE is returned
495 * @see #setMatchLength
374ca955 496 * @stable ICU 2.0
b75a7d8f
A
497 */
498 virtual int32_t handleNext(int32_t position, UErrorCode &status)
499 = 0;
500
501 /**
502 * Abstract method which subclasses override to provide the mechanism for
503 * finding the previous match in the target text. This allows different
504 * subclasses to provide different search algorithms.
505 * <p>
506 * If a match is found, the implementation should return the index at
507 * which the match starts and should call
508 * <tt>setMatchLength</tt> with the number of characters
509 * in the target text that make up the match. If no match is found, the
510 * method should return USEARCH_DONE.
511 * <p>
512 * @param position The index in the target text at which the search
513 * should start.
514 * @param status for error codes if it occurs.
515 * @return index at which the match starts, else if match is not found
516 * USEARCH_DONE is returned
517 * @see #setMatchLength
374ca955 518 * @stable ICU 2.0
b75a7d8f
A
519 */
520 virtual int32_t handlePrev(int32_t position, UErrorCode &status)
521 = 0;
522
523 /**
524 * Sets the length of the currently matched string in the text string to
525 * be searched.
526 * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
527 * methods should call this when they find a match in the target text.
528 * @param length length of the matched text.
529 * @see #handleNext
530 * @see #handlePrev
374ca955 531 * @stable ICU 2.0
b75a7d8f
A
532 */
533 virtual void setMatchLength(int32_t length);
534
535 /**
536 * Sets the offset of the currently matched string in the text string to
537 * be searched.
538 * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
539 * methods should call this when they find a match in the target text.
540 * @param position start offset of the matched text.
541 * @see #handleNext
542 * @see #handlePrev
374ca955 543 * @stable ICU 2.0
b75a7d8f
A
544 */
545 virtual void setMatchStart(int32_t position);
546
547 /**
548 * sets match not found
549 * @stable ICU 2.0
550 */
551 void setMatchNotFound();
552};
553
/**
 * Not-equal operator, defined inline as the logical negation of operator==
 * applied to the same argument.
 * @param that SearchIterator instance to be compared.
 * @return FALSE if operator== returns TRUE, and vice versa.
 */
554inline UBool SearchIterator::operator!=(const SearchIterator &that) const
555{
// operator== is declared virtual in the class, so this call dispatches to
// whatever equality a subclass provides; only operator== needs overriding.
556 return !operator==(that);
557}
558U_NAMESPACE_END
559
560#endif /* #if !UCONFIG_NO_COLLATION */
561
562#endif
563