]>
Commit | Line | Data |
---|---|---|
729e4ab9 A |
1 | /* |
2 | ****************************************************************************** | |
4388f060 | 3 | * Copyright (C) 1996-2011, International Business Machines * |
729e4ab9 A |
4 | * Corporation and others. All Rights Reserved. * |
5 | ****************************************************************************** | |
6 | */ | |
7 | ||
8 | /** | |
9 | * \file | |
10 | * \brief C++ API: Boyer-Moore StringSearch technology preview | |
11 | * \internal ICU 4.0.1 technology preview | |
12 | */ | |
13 | ||
14 | #ifndef B_M_SEARCH_H | |
15 | #define B_M_SEARCH_H | |
16 | ||
17 | #include "unicode/utypes.h" | |
18 | ||
19 | #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION | |
20 | ||
21 | #include "unicode/uobject.h" | |
22 | #include "unicode/ucol.h" | |
23 | ||
24 | #include "unicode/colldata.h" | |
25 | ||
26 | U_NAMESPACE_BEGIN | |
27 | ||
28 | class BadCharacterTable; | |
29 | class GoodSuffixTable; | |
30 | class Target; | |
31 | ||
4388f060 | 32 | #ifndef U_HIDE_INTERNAL_API |
729e4ab9 A |
33 | /** |
34 | * BoyerMooreSearch | |
35 | * | |
36 | * This object holds the information needed to do a Collation sensitive Boyer-Moore search. It encapsulates | |
37 | * the pattern, the "bad character" and "good suffix" tables, the Collator-based data needed to compute them, | |
38 | * and a reference to the text being searched. | |
39 | * | |
40 | * To do a search, you first need to get a <code>CollData</code> object by calling <code>CollData::open</code>. | |
41 | * Then you construct a <code>BoyerMooreSearch</code> object from the <code>CollData</code> object, the pattern | |
42 | * string and the target string. Then you call the <code>search</code> method. Here's a code sample: | |
43 | * | |
44 | * <pre> | |
45 | * void boyerMooreExample(UCollator *collator, UnicodeString *pattern, UnicodeString *target) | |
46 | * { | |
47 | * UErrorCode status = U_ZERO_ERROR; | |
48 | * CollData *collData = CollData::open(collator, status); | |
49 | * | |
50 | * if (U_FAILURE(status)) { | |
51 | * // could not create a CollData object | |
52 | * return; | |
53 | * } | |
54 | * | |
55 | * BoyerMooreSearch *search = new BoyerMooreSearch(collData, *pattern, target, status); | |
56 | * | |
57 | * if (U_FAILURE(status)) { | |
58 | * // could not create a BoyerMooreSearch object | |
59 | * CollData::close(collData); | |
60 | * return; | |
61 | * } | |
62 | * | |
63 | * int32_t offset = 0, start = -1, end = -1; | |
64 | * | |
65 | * // Find all matches | |
66 | * while (search->search(offset, start, end)) { | |
67 | * // process the match between start and end | |
68 | * ... | |
69 | * // advance past the match | |
70 | * offset = end; | |
71 | * } | |
72 | * | |
73 | * // at this point, if offset == 0, there were no matches | |
74 | * if (offset == 0) { | |
75 | * // handle the case of no matches | |
76 | * } | |
77 | * | |
78 | * delete search; | |
79 | * CollData::close(collData); | |
80 | * | |
81 | * // CollData objects are cached, so the call to | |
82 | * // CollData::close doesn't delete the object. | |
83 | * // Call this if you don't need the object any more. | |
84 | * CollData::flushCollDataCache(); | |
85 | * } | |
86 | * </pre> | |
87 | * | |
88 | * NOTE: This is a technology preview. The final version of this API may not bear any resemblance to this API. | |
89 | * | |
90 | * Known limitations: | |
91 | * 1) Backwards searching has not been implemented. | |
92 | * | |
93 | * 2) For Han and Hangul characters, this code ignores any Collation tailorings. In general, | |
94 | * this isn't a problem, but in Korean locales, at strength 1, Hangul characters are tailored | |
95 | * to be equal to Han characters with the same pronunciation. Because this code ignores | |
96 | * tailorings, searching for a Hangul character will not find a Han character and vice versa. | |
97 | * | |
98 | * 3) In some cases, searching for a pattern that needs to be normalized and ends | |
99 | * in a discontiguous contraction may fail. The only known cases of this are with | |
100 | * the Tibetan script. For example searching for the pattern | |
101 | * "\u0F7F\u0F80\u0F81\u0F82\u0F83\u0F84\u0F85" will fail. (This case is artificial. We've | |
102 | * been unable to find a practical, real-world example of this failure.) | |
103 | * | |
104 | * @internal ICU 4.0.1 technology preview | |
105 | * | |
106 | * @see CollData | |
107 | */ | |
108 | class U_I18N_API BoyerMooreSearch : public UObject | |
109 | { | |
110 | public: | |
111 | /** | |
112 | * Construct a <code>BoyerMooreSearch</code> object. | |
113 | * | |
114 | * @param theData - A <code>CollData</code> object holding the Collator-sensitive data | |
115 | * @param patternString - the string for which to search | |
116 | * @param targetString - the string in which to search or <code>NULL</code> if you will | |
117 | * set it later by calling <code>setTargetString</code>. | |
118 | * @param status - will be set if any errors occur. | |
119 | * | |
120 | * Note: if on return, status is set to an error code, | |
121 | * the only safe thing to do with this object is to call | |
122 | * the destructor. | |
123 | * | |
124 | * @internal ICU 4.0.1 technology preview | |
125 | */ | |
126 | BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString, UErrorCode &status); | |
127 | ||
128 | /** | |
129 | * The destructor | |
130 | * | |
131 | * @internal ICU 4.0.1 technology preview | |
132 | */ | |
133 | ~BoyerMooreSearch(); | |
134 | ||
135 | /** | |
136 | * Test the pattern to see if it generates any CEs. | |
137 | * | |
138 | * @return <code>TRUE</code> if the pattern string did not generate any CEs | |
139 | * | |
140 | * @internal ICU 4.0.1 technology preview | |
141 | */ | |
142 | UBool empty(); | |
143 | ||
144 | /** | |
145 | * Search for the pattern string in the target string. | |
146 | * | |
147 | * @param offset - the offset in the target string at which to begin the search | |
148 | * @param start - will be set to the starting offset of the match, or -1 if there's no match | |
149 | * @param end - will be set to the ending offset of the match, or -1 if there's no match | |
150 | * | |
151 | * @return <code>TRUE</code> if the match succeeds, <code>FALSE</code> otherwise. | |
152 | * | |
153 | * @internal ICU 4.0.1 technology preview | |
154 | */ | |
155 | UBool search(int32_t offset, int32_t &start, int32_t &end); | |
156 | ||
157 | /** | |
158 | * Set the target string for the match. | |
159 | * | |
160 | * @param targetString - the new target string | |
161 | * @param status - will be set if any errors occur. | |
162 | * | |
163 | * @internal ICU 4.0.1 technology preview | |
164 | */ | |
165 | void setTargetString(const UnicodeString *targetString, UErrorCode &status); | |
166 | ||
167 | // **** no longer need these? **** | |
168 | /** | |
169 | * Return the <code>CollData</code> object used for searching | |
170 | * | |
171 | * @return the <code>CollData</code> object used for searching | |
172 | * | |
173 | * @internal ICU 4.0.1 technology preview | |
174 | */ | |
175 | CollData *getData(); | |
176 | ||
177 | /** | |
178 | * Return the CEs generated by the pattern string. | |
179 | * | |
180 | * @return a <code>CEList</code> object holding the CEs generated by the pattern string. | |
181 | * | |
182 | * @internal ICU 4.0.1 technology preview | |
183 | */ | |
184 | CEList *getPatternCEs(); | |
185 | ||
186 | /** | |
187 | * Return the <code>BadCharacterTable</code> object computed for the pattern string. | |
188 | * | |
189 | * @return the <code>BadCharacterTable</code> object. | |
190 | * | |
191 | * @internal ICU 4.0.1 technology preview | |
192 | */ | |
193 | BadCharacterTable *getBadCharacterTable(); | |
194 | ||
195 | /** | |
196 | * Return the <code>GoodSuffixTable</code> object computed for the pattern string. | |
197 | * | |
198 | * @return the <code>GoodSuffixTable</code> object computed for the pattern string. | |
199 | * | |
200 | * @internal ICU 4.0.1 technology preview | |
201 | */ | |
202 | GoodSuffixTable *getGoodSuffixTable(); | |
203 | ||
204 | /** | |
205 | * UObject glue... | |
206 | * @internal ICU 4.0.1 technology preview | |
207 | */ | |
208 | virtual UClassID getDynamicClassID() const; | |
209 | /** | |
210 | * UObject glue... | |
211 | * @internal ICU 4.0.1 technology preview | |
212 | */ | |
213 | static UClassID getStaticClassID(); | |
214 | ||
215 | private: | |
216 | CollData *data; | |
217 | CEList *patCEs; | |
218 | BadCharacterTable *badCharacterTable; | |
219 | GoodSuffixTable *goodSuffixTable; | |
220 | UnicodeString pattern; | |
221 | Target *target; | |
222 | }; | |
4388f060 | 223 | #endif /* U_HIDE_INTERNAL_API */ |
729e4ab9 A |
224 | |
225 | U_NAMESPACE_END | |
226 | ||
227 | #endif // #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION | |
228 | #endif // #ifndef B_M_SEARCH_H |