]>
Commit | Line | Data |
---|---|---|
729e4ab9 A |
1 | /* |
2 | ****************************************************************************** | |
3 | * Copyright (C) 1996-2010, International Business Machines * | |
4 | * Corporation and others. All Rights Reserved. * | |
5 | ****************************************************************************** | |
6 | */ | |
7 | ||
8 | /** | |
9 | * \file | |
10 | * \brief C++ API: Boyer-Moore StringSearch technology preview | |
11 | * \internal ICU 4.0.1 technology preview | |
12 | */ | |
13 | ||
14 | #ifndef B_M_SEARCH_H | |
15 | #define B_M_SEARCH_H | |
16 | ||
17 | #include "unicode/utypes.h" | |
18 | ||
19 | #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION | |
20 | ||
21 | #include "unicode/uobject.h" | |
22 | #include "unicode/ucol.h" | |
23 | ||
24 | #include "unicode/colldata.h" | |
25 | ||
26 | U_NAMESPACE_BEGIN | |
27 | ||
28 | class BadCharacterTable; /* forward declaration: the BoyerMooreSearch declaration only uses pointers to this type */ | |
29 | class GoodSuffixTable; /* forward declaration: likewise used only through pointers */ | |
30 | class Target; /* forward declaration: likewise used only through a pointer */ | |
31 | ||
32 | /** | |
33 | * BoyerMooreSearch | |
34 | * | |
35 | * This object holds the information needed to do a Collation sensitive Boyer-Moore search. It encapsulates
36 | * the pattern, the "bad character" and "good suffix" tables, the Collator-based data needed to compute them, | |
37 | * and a reference to the text being searched. | |
38 | * | |
39 | * To do a search, you first need to get a <code>CollData</code> object by calling <code>CollData::open</code>.
40 | * Then you construct a <code>BoyerMooreSearch</code> object from the <code>CollData</code> object, the pattern | |
41 | * string and the target string. Then you call the <code>search</code> method. Here's a code sample: | |
42 | * | |
43 | * <pre> | |
44 | * void boyerMooreExample(UCollator *collator, UnicodeString *pattern, UnicodeString *target) | |
45 | * { | |
46 | * UErrorCode status = U_ZERO_ERROR; | |
47 | * CollData *collData = CollData::open(collator, status); | |
48 | * | |
49 | * if (U_FAILURE(status)) { | |
50 | * // could not create a CollData object | |
51 | * return; | |
52 | * } | |
53 | * | |
54 | * BoyerMooreSearch *search = new BoyerMooreSearch(collData, *pattern, target, status); | |
55 | * | |
56 | * if (U_FAILURE(status)) { | |
57 | * // could not create a BoyerMooreSearch object; the failed object must still be destroyed | |
58 | * delete search; CollData::close(collData); | |
59 | * return; | |
60 | * } | |
61 | * | |
62 | * int32_t offset = 0, start = -1, end = -1; | |
63 | * | |
64 | * // Find all matches | |
65 | * while (search->search(offset, start, end)) { | |
66 | * // process the match between start and end | |
67 | * ... | |
68 | * // advance past the match | |
69 | * offset = end; | |
70 | * } | |
71 | * | |
72 | * // at this point, if offset == 0, there were no matches | |
73 | * if (offset == 0) { | |
74 | * // handle the case of no matches | |
75 | * } | |
76 | * | |
77 | * delete search; | |
78 | * CollData::close(collData); | |
79 | * | |
80 | * // CollData objects are cached, so the call to | |
81 | * // CollData::close doesn't delete the object. | |
82 | * // Call this if you don't need the object any more. | |
83 | * CollData::flushCollDataCache(); | |
84 | * } | |
85 | * </pre> | |
86 | * | |
87 | * NOTE: This is a technology preview. The final version of this API may not bear any resemblance to this API. | |
88 | * | |
89 | * Known limitations: | |
90 | * 1) Backwards searching has not been implemented. | |
91 | * | |
92 | * 2) For Han and Hangul characters, this code ignores any Collation tailorings. In general, | |
93 | * this isn't a problem, but in Korean locales, at strength 1, Hangul characters are tailored | |
94 | * to be equal to Han characters with the same pronunciation. Because this code ignores | |
95 | * tailorings, searching for a Hangul character will not find a Han character and vice versa. | |
96 | * | |
97 | * 3) In some cases, searching for a pattern that needs to be normalized and ends | |
98 | * in a discontiguous contraction may fail. The only known cases of this are with | |
99 | * the Tibetan script. For example searching for the pattern | |
100 | * "\u0F7F\u0F80\u0F81\u0F82\u0F83\u0F84\u0F85" will fail. (This case is artificial. We've | |
101 | * been unable to find a practical, real-world example of this failure.) | |
102 | * | |
103 | * @internal ICU 4.0.1 technology preview | |
104 | * | |
105 | * @see CollData | |
106 | */ | |
107 | class U_I18N_API BoyerMooreSearch : public UObject | |
108 | { | |
109 | public: | |
110 | /** | |
111 | * Construct a <code>BoyerMooreSearch</code> object. | |
112 | * | |
113 | * @param theData - A <code>CollData</code> object holding the Collator-sensitive data | |
114 | * @param patternString - the string for which to search | |
115 | * @param targetString - the string in which to search or <code>NULL</code> if you will | |
116 | * set it later by calling <code>setTargetString</code>. | |
117 | * @param status - will be set if any errors occur. | |
118 | * | |
119 | * Note: if on return, status is set to an error code, | |
120 | * the only safe thing to do with this object is to call | |
121 | * the destructor. | |
122 | * | |
123 | * @internal ICU 4.0.1 technology preview | |
124 | */ | |
125 | BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString, UErrorCode &status); | |
126 | ||
127 | /** | |
128 | * The destructor | |
129 | * | |
130 | * @internal ICU 4.0.1 technology preview | |
131 | */ | |
132 | ~BoyerMooreSearch(); | |
133 | ||
134 | /** | |
135 | * Test the pattern to see if it generates any CEs. | |
136 | * | |
137 | * @return <code>TRUE</code> if the pattern string did not generate any CEs | |
138 | * | |
139 | * @internal ICU 4.0.1 technology preview | |
140 | */ | |
141 | UBool empty(); | |
142 | ||
143 | /** | |
144 | * Search for the pattern string in the target string. | |
145 | * | |
146 | * @param offset - the offset in the target string at which to begin the search | |
147 | * @param start - will be set to the starting offset of the match, or -1 if there's no match | |
148 | * @param end - will be set to the ending offset of the match, or -1 if there's no match | |
149 | * | |
150 | * @return <code>TRUE</code> if the match succeeds, <code>FALSE</code> otherwise. | |
151 | * | |
152 | * @internal ICU 4.0.1 technology preview | |
153 | */ | |
154 | UBool search(int32_t offset, int32_t &start, int32_t &end); | |
155 | ||
156 | /** | |
157 | * Set the target string for the match. | |
158 | * | |
159 | * @param targetString - the new target string | |
160 | * @param status - will be set if any errors occur. | |
161 | * | |
162 | * @internal ICU 4.0.1 technology preview | |
163 | */ | |
164 | void setTargetString(const UnicodeString *targetString, UErrorCode &status); | |
165 | ||
166 | // **** no longer need these? **** | |
167 | /** | |
168 | * Return the <code>CollData</code> object used for searching | |
169 | * | |
170 | * @return the <code>CollData</code> object used for searching | |
171 | * | |
172 | * @internal ICU 4.0.1 technology preview | |
173 | */ | |
174 | CollData *getData(); | |
175 | ||
176 | /** | |
177 | * Return the CEs generated by the pattern string. | |
178 | * | |
179 | * @return a <code>CEList</code> object holding the CEs generated by the pattern string. | |
180 | * | |
181 | * @internal ICU 4.0.1 technology preview | |
182 | */ | |
183 | CEList *getPatternCEs(); | |
184 | ||
185 | /** | |
186 | * Return the <code>BadCharacterTable</code> object computed for the pattern string. | |
187 | * | |
188 | * @return the <code>BadCharacterTable</code> object. | |
189 | * | |
190 | * @internal ICU 4.0.1 technology preview | |
191 | */ | |
192 | BadCharacterTable *getBadCharacterTable(); | |
193 | ||
194 | /** | |
195 | * Return the <code>GoodSuffixTable</code> object computed for the pattern string. | |
196 | * | |
197 | * @return the <code>GoodSuffixTable</code> object computed for the pattern string. | |
198 | * | |
199 | * @internal ICU 4.0.1 technology preview | |
200 | */ | |
201 | GoodSuffixTable *getGoodSuffixTable(); | |
202 | ||
203 | /** | |
204 | * UObject glue: returns the <code>UClassID</code> for this object's dynamic class. | |
205 | * @internal ICU 4.0.1 technology preview | |
206 | */ | |
207 | virtual UClassID getDynamicClassID() const; | |
208 | /** | |
209 | * UObject glue: returns the <code>UClassID</code> for the <code>BoyerMooreSearch</code> class. | |
210 | * @internal ICU 4.0.1 technology preview | |
211 | */ | |
212 | static UClassID getStaticClassID(); | |
213 | ||
214 | private: /* NOTE(review): ownership of the pointer members is not visible in this header - confirm in the implementation */ | |
215 | CollData *data; /* Collator-derived data; presumably the constructor's theData - confirm */ | |
216 | CEList *patCEs; /* presumably the CEs generated by the pattern (see getPatternCEs) - confirm */ | |
217 | BadCharacterTable *badCharacterTable; /* presumably the table returned by getBadCharacterTable - confirm */ | |
218 | GoodSuffixTable *goodSuffixTable; /* presumably the table returned by getGoodSuffixTable - confirm */ | |
219 | UnicodeString pattern; /* pattern string, stored by value */ | |
220 | Target *target; /* presumably wraps the text being searched; Target is only forward-declared here */ | |
221 | }; | |
222 | ||
223 | U_NAMESPACE_END | |
224 | ||
225 | #endif // #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION | |
226 | #endif // #ifndef B_M_SEARCH_H |