]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/strmatch.h
ICU-8.11.tar.gz
[apple/icu.git] / icuSources / i18n / strmatch.h
1 /*
2 * Copyright (C) 2001-2004, International Business Machines Corporation
3 * and others. All Rights Reserved.
4 **********************************************************************
5 * Date Name Description
6 * 07/23/01 aliu Creation.
7 **********************************************************************
8 */
9 #ifndef STRMATCH_H
10 #define STRMATCH_H
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_TRANSLITERATION
15
16 #include "unicode/unistr.h"
17 #include "unicode/unifunct.h"
18 #include "unicode/unimatch.h"
19 #include "unicode/unirepl.h"
20
21 U_NAMESPACE_BEGIN
22
23 class TransliterationRuleData;
24
25 /**
26 * An object that matches a fixed input string, implementing the
27 * UnicodeMatcher API. This object also implements the
28 * UnicodeReplacer API, allowing it to emit the matched text as
29 * output. Since the match text may contain flexible match elements,
30 * such as UnicodeSets, the emitted text is not the match pattern, but
31 * instead a substring of the actual matched text. Following
32 * convention, the output text is the leftmost match seen up to this
33 * point. NOTE(review): the private matchStart/matchLimit members are documented as offsets of the rightmost match; confirm which is correct.
34 *
35 * A StringMatcher may represent a segment, in which case it has a
36 * positive segment number. This affects how the matcher converts
37 * itself to a pattern but does not otherwise affect its function.
38 *
39 * A StringMatcher that is not a segment should not be used as a
40 * UnicodeReplacer.
41 */
42 class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
43
44 public:
45
46 /**
47 * Construct a matcher that matches the given pattern string.
48 * @param string the pattern to be matched, possibly containing
49 * stand-ins that represent nested UnicodeMatcher objects.
50 * @param start inclusive start index; presumably an offset into 'string' rather than into text being replaced -- TODO confirm against the implementation
51 * @param limit exclusive end index, interpreted like start;
52 * must be greater than or equal to start
53 * @param segmentNum the segment number from 1..n, or 0 if this is
54 * not a segment.
55 * @param data context object mapping stand-ins to
56 * UnicodeMatcher objects.
57 */
58 StringMatcher(const UnicodeString& string,
59 int32_t start,
60 int32_t limit,
61 int32_t segmentNum,
62 const TransliterationRuleData& data);
63
64 /**
65 * Copy constructor.
66 * @param o the object to be copied.
67 */
68 StringMatcher(const StringMatcher& o);
69
70 /**
71 * Destructor.
72 */
73 virtual ~StringMatcher();
74
75 /**
76 * Implement UnicodeFunctor.
77 * @return a copy of this object, as a UnicodeFunctor pointer.
78 */
79 virtual UnicodeFunctor* clone() const;
80
81 /**
82 * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
83 * and return the pointer.
84 * @return the UnicodeMatcher pointer.
85 */
86 virtual UnicodeMatcher* toMatcher() const;
87
88 /**
89 * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
90 * and return the pointer.
91 * @return the UnicodeReplacer pointer.
92 */
93 virtual UnicodeReplacer* toReplacer() const;
94
95 /**
96 * Implement UnicodeMatcher.
97 * @param text the text to be matched
98 * @param offset on input, the index into text at which to begin
99 * matching. On output, the limit of the matched text. The
100 * number of matched characters is the output value of offset
101 * minus the input value. Offset should always point to the
102 * HIGH SURROGATE (leading code unit) of a pair of surrogates,
103 * both on entry and upon return.
104 * @param limit the limit index of text to be matched. Greater
105 * than offset for a forward direction match, less than offset for
106 * a backward direction match. The last character to be
107 * considered for matching will be text.charAt(limit-1) in the
108 * forward direction or text.charAt(limit+1) in the backward
109 * direction.
110 * @param incremental if TRUE, then assume further characters may
111 * be inserted at limit and check for partial matching. Otherwise
112 * assume the text as given is complete.
113 * @return a match degree value indicating a full match, a partial
114 * match, or a mismatch. If incremental is FALSE then
115 * U_PARTIAL_MATCH should never be returned.
116 */
117 virtual UMatchDegree matches(const Replaceable& text,
118 int32_t& offset,
119 int32_t limit,
120 UBool incremental);
121
122 /**
123 * Implement UnicodeMatcher.
124 * @param result Output param to receive the pattern.
125 * @param escapeUnprintable if TRUE, escape unprintable characters; defaults to FALSE.
126 * @return A reference to 'result'.
127 */
128 virtual UnicodeString& toPattern(UnicodeString& result,
129 UBool escapeUnprintable = FALSE) const;
130
131 /**
132 * Implement UnicodeMatcher.
133 * Returns TRUE if this matcher will match a character c, where c
134 * & 0xFF == v, at offset, in the forward direction (with limit >
135 * offset). This is used by <tt>RuleBasedTransliterator</tt> for
136 * indexing.
137 * @param v the given value
138 * @return TRUE if this matcher will match a character c,
139 * where c & 0xFF == v
140 */
141 virtual UBool matchesIndexValue(uint8_t v) const;
142
143 /**
144 * Implement UnicodeMatcher. By analogy with addReplacementSetTo(), presumably unions the characters this object can match into toUnionTo -- confirm against the implementation.
145 */
146 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
147
148 /**
149 * Implement UnicodeFunctor. Takes a TransliterationRuleData pointer; presumably it replaces the 'data' context member -- confirm against the implementation.
150 */
151 virtual void setData(const TransliterationRuleData*);
152
153 /**
154 * Replace characters in 'text' from 'start' to 'limit' with the
155 * output text of this object. Update the 'cursor' parameter to
156 * give the cursor position and return the length of the
157 * replacement text.
158 *
159 * @param text the text in which the replacement is performed
160 * @param start inclusive start index of text to be replaced
161 * @param limit exclusive end index of text to be replaced;
162 * must be greater than or equal to start
163 * @param cursor output parameter for the cursor position.
164 * Not all replacer objects will update this, but in a complete
165 * tree of replacer objects, representing the entire output side
166 * of a transliteration rule, at least one must update it.
167 * @return the number of 16-bit code units in the text replacing
168 * the characters at offsets start..(limit-1) in text
169 */
170 virtual int32_t replace(Replaceable& text,
171 int32_t start,
172 int32_t limit,
173 int32_t& cursor);
174
175 /**
176 * Returns a string representation of this replacer. If the
177 * result of calling this function is passed to the appropriate
178 * parser, typically TransliteratorParser, it will produce another
179 * replacer that is equal to this one.
180 * @param result the string to receive the pattern. Previous
181 * contents will be deleted.
182 * @param escapeUnprintable if TRUE then convert unprintable
183 * characters to their hex escape representations, \\uxxxx or
184 * \\Uxxxxxxxx. Unprintable characters are defined by
185 * Utility.isUnprintable().
186 * @return a reference to 'result'.
187 */
188 virtual UnicodeString& toReplacerPattern(UnicodeString& result,
189 UBool escapeUnprintable) const;
190
191 /**
192 * Remove any match data. This must be called before performing a
193 * set of matches with this segment.
194 */
195 void resetMatch();
196
197 /**
198 * ICU "poor man's RTTI", returns a UClassID for the actual class.
199 *
200 * @draft ICU 2.2
201 */
202 virtual UClassID getDynamicClassID() const;
203
204 /**
205 * ICU "poor man's RTTI", returns a UClassID for this class.
206 *
207 * @draft ICU 2.2
208 */
209 static UClassID U_EXPORT2 getStaticClassID();
210
211 /**
212 * Union the set of all characters that may be output by this object
213 * into the given set.
214 * @param toUnionTo the set into which to union the output characters
215 */
216 virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
217
218 private:
219
220 /**
221 * The text to be matched.
222 */
223 UnicodeString pattern;
224
225 /**
226 * Context object that maps stand-ins to matcher and replacer
227 * objects.
228 */
229 const TransliterationRuleData* data;
230
231 /**
232 * The segment number, 1-based, or 0 if not a segment.
233 */
234 int32_t segmentNumber;
235
236 /**
237 * Start offset, in the match text, of the <em>rightmost</em>
238 * match. NOTE(review): the class comment says the output is the leftmost match; confirm which is intended.
239 */
240 int32_t matchStart;
241
242 /**
243 * Limit offset, in the match text, of the <em>rightmost</em>
244 * match. NOTE(review): same leftmost/rightmost question as for matchStart.
245 */
246 int32_t matchLimit;
247
248 };
249
250 U_NAMESPACE_END
251
252 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
253
254 #endif