]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/strmatch.h
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / i18n / strmatch.h
CommitLineData
b75a7d8f
A
1/*
2* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
3**********************************************************************
4* Date Name Description
5* 07/23/01 aliu Creation.
6**********************************************************************
7*/
8#ifndef STRMATCH_H
9#define STRMATCH_H
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "unicode/unistr.h"
16#include "unicode/unifunct.h"
17#include "unicode/unimatch.h"
18#include "unicode/unirepl.h"
19
20U_NAMESPACE_BEGIN
21
22class TransliterationRuleData;
23
24/**
25 * An object that matches a fixed input string, implementing the
26 * UnicodeMatcher API. This object also implements the
27 * UnicodeReplacer API, allowing it to emit the matched text as
28 * output. Since the match text may contain flexible match elements,
29 * such as UnicodeSets, the emitted text is not the match pattern, but
30 * instead a substring of the actual matched text. Following
31 * convention, the output text is the leftmost match seen up to this
32 * point. NOTE(review): the matchStart/matchLimit member comments below say "rightmost" match -- confirm which is intended.
33 *
34 * A StringMatcher may represent a segment, in which case it has a
35 * positive segment number. This affects how the matcher converts
36 * itself to a pattern but does not otherwise affect its function.
37 *
38 * A StringMatcher that is not a segment should not be used as a
39 * UnicodeReplacer.
40 */
41class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
42
43 public:
44
45 /**
46 * Construct a matcher that matches the given pattern string.
47 * @param string the pattern to be matched, possibly containing
48 * stand-ins that represent nested UnicodeMatcher objects.
49 * @param start inclusive start index of text to be replaced (NOTE(review): wording mirrors replace(); presumably an index into 'string' -- confirm)
50 * @param limit exclusive end index of text to be replaced;
51 * must be greater than or equal to start
52 * @param segmentNum the segment number from 1..n, or 0 if this is
53 * not a segment.
54 * @param data context object mapping stand-ins to
55 * UnicodeMatcher objects.
56 */
57 StringMatcher(const UnicodeString& string,
58 int32_t start,
59 int32_t limit,
60 int32_t segmentNum,
61 const TransliterationRuleData& data);
62
63 /**
64 * Copy constructor
65 * @param o the object to be copied.
66 */
67 StringMatcher(const StringMatcher& o);
68
69 /**
70 * Destructor
71 */
72 virtual ~StringMatcher();
73
74 /**
75 * Implement UnicodeFunctor
76 * @return a copy of the object.
77 */
78 virtual UnicodeFunctor* clone() const;
79
80 /**
81 * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
82 * and return the pointer.
83 * @return the UnicodeMatcher pointer.
84 */
85 virtual UnicodeMatcher* toMatcher() const;
86
87 /**
88 * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
89 * and return the pointer.
90 * @return the UnicodeReplacer pointer.
91 */
92 virtual UnicodeReplacer* toReplacer() const;
93
94 /**
95 * Implement UnicodeMatcher
96 * @param text the text to be matched
97 * @param offset on input, the index into text at which to begin
98 * matching. On output, the limit of the matched text. The
99 * number of matched characters is the output value of offset
100 * minus the input value. Offset should always point to the
101 * HIGH SURROGATE (leading code unit) of a pair of surrogates,
102 * both on entry and upon return.
103 * @param limit the limit index of text to be matched. Greater
104 * than offset for a forward direction match, less than offset for
105 * a backward direction match. The last character to be
106 * considered for matching will be text.charAt(limit-1) in the
107 * forward direction or text.charAt(limit+1) in the backward
108 * direction.
109 * @param incremental if TRUE, then assume further characters may
110 * be inserted at limit and check for partial matching. Otherwise
111 * assume the text as given is complete.
112 * @return a match degree value indicating a full match, a partial
113 * match, or a mismatch. If incremental is FALSE then
114 * U_PARTIAL_MATCH should never be returned.
115 */
116 virtual UMatchDegree matches(const Replaceable& text,
117 int32_t& offset,
118 int32_t limit,
119 UBool incremental);
120
121 /**
122 * Implement UnicodeMatcher
123 * @param result Output param to receive the pattern.
124 * @param escapeUnprintable if TRUE then escape the unprintable characters.
125 * @return A reference to 'result'.
126 */
127 virtual UnicodeString& toPattern(UnicodeString& result,
128 UBool escapeUnprintable = FALSE) const;
129
130 /**
131 * Implement UnicodeMatcher
132 * Returns TRUE if this matcher will match a character c, where c
133 * & 0xFF == v, at offset, in the forward direction (with limit >
134 * offset). This is used by <tt>RuleBasedTransliterator</tt> for
135 * indexing.
136 * @param v the given value
137 * @return TRUE if this matcher will match a character c,
138 * where c & 0xFF == v
139 */
140 virtual UBool matchesIndexValue(uint8_t v) const;
141
142 /**
143 * Implement UnicodeMatcher. Presumably unions the characters this object can match into toUnionTo (cf. addReplacementSetTo) -- confirm.
144 */
145 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
146
147 /**
148 * Implement UnicodeFunctor. Presumably supplies the context object kept in the 'data' member -- TODO confirm.
149 */
150 virtual void setData(const TransliterationRuleData*);
151
152 /**
153 * Replace characters in 'text' from 'start' to 'limit' with the
154 * output text of this object. Update the 'cursor' parameter to
155 * give the cursor position and return the length of the
156 * replacement text.
157 *
158 * @param text the text in which characters are replaced
159 * @param start inclusive start index of text to be replaced
160 * @param limit exclusive end index of text to be replaced;
161 * must be greater than or equal to start
162 * @param cursor output parameter for the cursor position.
163 * Not all replacer objects will update this, but in a complete
164 * tree of replacer objects, representing the entire output side
165 * of a transliteration rule, at least one must update it.
166 * @return the number of 16-bit code units in the text replacing
167 * the characters at offsets start..(limit-1) in text
168 */
169 virtual int32_t replace(Replaceable& text,
170 int32_t start,
171 int32_t limit,
172 int32_t& cursor);
173
174 /**
175 * Returns a string representation of this replacer. If the
176 * result of calling this function is passed to the appropriate
177 * parser, typically TransliteratorParser, it will produce another
178 * replacer that is equal to this one.
179 * @param result the string to receive the pattern. Previous
180 * contents will be deleted.
181 * @param escapeUnprintable if TRUE then convert unprintable
182 * characters to their hex escape representations, \\uxxxx or
183 * \\Uxxxxxxxx. Unprintable characters are defined by
184 * Utility.isUnprintable().
185 * @return a reference to 'result'.
186 */
187 virtual UnicodeString& toReplacerPattern(UnicodeString& result,
188 UBool escapeUnprintable) const;
189
190 /**
191 * Remove any match data. This must be called before performing a
192 * set of matches with this segment.
193 */
194 void resetMatch();
195
196 /**
197 * ICU "poor man's RTTI", returns a UClassID for the actual class.
198 * @return the value of getStaticClassID().
199 * @draft ICU 2.2
200 */
201 virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
202
203 /**
204 * ICU "poor man's RTTI", returns a UClassID for this class.
205 * @return the address of fgClassID, cast to UClassID.
206 * @draft ICU 2.2
207 */
208 static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
209
210 /**
211 * Union the set of all characters that may be output by this object
212 * into the given set.
213 * @param toUnionTo the set into which to union the output characters
214 */
215 virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
216
217 private:
218
219 /**
220 * The text to be matched.
221 */
222 UnicodeString pattern;
223
224 /**
225 * Context object that maps stand-ins to matcher and replacer
226 * objects.
227 */
228 const TransliterationRuleData* data;
229
230 /**
231 * The segment number, 1-based, or 0 if not a segment.
232 */
233 int32_t segmentNumber;
234
235 /**
236 * Start offset, in the match text, of the <em>rightmost</em>
237 * match.
238 */
239 int32_t matchStart;
240
241 /**
242 * Limit offset, in the match text, of the <em>rightmost</em>
243 * match.
244 */
245 int32_t matchLimit;
246
247 /**
248 * The address of this static class variable serves as this class's ID
249 * for ICU "poor man's RTTI".
250 */
251 static const char fgClassID;
252};
253
254U_NAMESPACE_END
255
256#endif /* #if !UCONFIG_NO_TRANSLITERATION */
257
258#endif