]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f A |
3 | /* |
4 | ********************************************************************** | |
2ca993e8 | 5 | * Copyright (C) 1999-2015, International Business Machines |
b75a7d8f A |
6 | * Corporation and others. All Rights Reserved. |
7 | ********************************************************************** | |
8 | * Date Name Description | |
9 | * 11/17/99 aliu Creation. | |
10 | ********************************************************************** | |
11 | */ | |
12 | ||
13 | #include "unicode/utypes.h" | |
14 | ||
15 | #if !UCONFIG_NO_TRANSLITERATION | |
16 | ||
17 | #include "unicode/rep.h" | |
18 | #include "unicode/uniset.h" | |
19 | #include "rbt_pars.h" | |
20 | #include "rbt_data.h" | |
21 | #include "rbt_rule.h" | |
22 | #include "rbt.h" | |
2ca993e8 | 23 | #include "mutex.h" |
374ca955 | 24 | #include "umutex.h" |
b75a7d8f A |
25 | |
26 | U_NAMESPACE_BEGIN | |
27 | ||
374ca955 A |
28 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator) |
29 | ||
51004dcb | 30 | static UMutex transliteratorDataMutex = U_MUTEX_INITIALIZER; |
374ca955 | 31 | static Replaceable *gLockedText = NULL; |
b75a7d8f A |
32 | |
33 | void RuleBasedTransliterator::_construct(const UnicodeString& rules, | |
34 | UTransDirection direction, | |
35 | UParseError& parseError, | |
36 | UErrorCode& status) { | |
374ca955 | 37 | fData = 0; |
b75a7d8f A |
38 | isDataOwned = TRUE; |
39 | if (U_FAILURE(status)) { | |
40 | return; | |
41 | } | |
42 | ||
73c04bcf | 43 | TransliteratorParser parser(status); |
b75a7d8f A |
44 | parser.parse(rules, direction, parseError, status); |
45 | if (U_FAILURE(status)) { | |
46 | return; | |
47 | } | |
48 | ||
73c04bcf A |
49 | if (parser.idBlockVector.size() != 0 || |
50 | parser.compoundFilter != NULL || | |
51 | parser.dataVector.size() == 0) { | |
b75a7d8f A |
52 | status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT |
53 | return; | |
54 | } | |
55 | ||
73c04bcf | 56 | fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0); |
374ca955 A |
57 | setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); |
58 | } | |
59 | ||
60 | /** | |
61 | * Constructs a new transliterator from the given rules. | |
62 | * @param id the id for the transliterator. | |
63 | * @param rules rules, separated by ';' | |
64 | * @param direction either FORWARD or REVERSE. | |
65 | * @param adoptedFilter the filter for this transliterator. | |
66 | * @param parseError Struct to receive information on position | |
67 | * of error if an error is encountered | |
68 | * @param status Output param set to success/failure code. | |
69 | * @exception IllegalArgumentException if rules are malformed | |
70 | * or direction is invalid. | |
71 | */ | |
72 | RuleBasedTransliterator::RuleBasedTransliterator( | |
73 | const UnicodeString& id, | |
74 | const UnicodeString& rules, | |
75 | UTransDirection direction, | |
76 | UnicodeFilter* adoptedFilter, | |
77 | UParseError& parseError, | |
78 | UErrorCode& status) : | |
79 | Transliterator(id, adoptedFilter) { | |
80 | _construct(rules, direction,parseError,status); | |
81 | } | |
82 | ||
83 | /** | |
84 | * Constructs a new transliterator from the given rules. | |
85 | * @param id the id for the transliterator. | |
86 | * @param rules rules, separated by ';' | |
87 | * @param direction either FORWARD or REVERSE. | |
88 | * @param adoptedFilter the filter for this transliterator. | |
89 | * @param status Output param set to success/failure code. | |
90 | * @exception IllegalArgumentException if rules are malformed | |
91 | * or direction is invalid. | |
92 | */ | |
46f4442e | 93 | /*RuleBasedTransliterator::RuleBasedTransliterator( |
374ca955 A |
94 | const UnicodeString& id, |
95 | const UnicodeString& rules, | |
96 | UTransDirection direction, | |
97 | UnicodeFilter* adoptedFilter, | |
98 | UErrorCode& status) : | |
99 | Transliterator(id, adoptedFilter) { | |
100 | UParseError parseError; | |
101 | _construct(rules, direction,parseError, status); | |
46f4442e | 102 | }*/ |
374ca955 A |
103 | |
104 | /** | |
105 | * Convenience constructor with no filter. | |
106 | */ | |
46f4442e | 107 | /*RuleBasedTransliterator::RuleBasedTransliterator( |
374ca955 A |
108 | const UnicodeString& id, |
109 | const UnicodeString& rules, | |
110 | UTransDirection direction, | |
111 | UErrorCode& status) : | |
112 | Transliterator(id, 0) { | |
113 | UParseError parseError; | |
114 | _construct(rules, direction,parseError, status); | |
46f4442e | 115 | }*/ |
374ca955 A |
116 | |
117 | /** | |
118 | * Convenience constructor with no filter and FORWARD direction. | |
119 | */ | |
46f4442e | 120 | /*RuleBasedTransliterator::RuleBasedTransliterator( |
374ca955 A |
121 | const UnicodeString& id, |
122 | const UnicodeString& rules, | |
123 | UErrorCode& status) : | |
124 | Transliterator(id, 0) { | |
125 | UParseError parseError; | |
126 | _construct(rules, UTRANS_FORWARD, parseError, status); | |
46f4442e | 127 | }*/ |
374ca955 A |
128 | |
129 | /** | |
130 | * Convenience constructor with FORWARD direction. | |
131 | */ | |
46f4442e | 132 | /*RuleBasedTransliterator::RuleBasedTransliterator( |
374ca955 A |
133 | const UnicodeString& id, |
134 | const UnicodeString& rules, | |
135 | UnicodeFilter* adoptedFilter, | |
136 | UErrorCode& status) : | |
137 | Transliterator(id, adoptedFilter) { | |
138 | UParseError parseError; | |
139 | _construct(rules, UTRANS_FORWARD,parseError, status); | |
46f4442e | 140 | }*/ |
b75a7d8f A |
141 | |
142 | RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, | |
143 | const TransliterationRuleData* theData, | |
144 | UnicodeFilter* adoptedFilter) : | |
145 | Transliterator(id, adoptedFilter), | |
374ca955 | 146 | fData((TransliterationRuleData*)theData), // cast away const |
b75a7d8f | 147 | isDataOwned(FALSE) { |
374ca955 | 148 | setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); |
b75a7d8f A |
149 | } |
150 | ||
151 | /** | |
152 | * Internal constructor. | |
153 | */ | |
154 | RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, | |
155 | TransliterationRuleData* theData, | |
156 | UBool isDataAdopted) : | |
157 | Transliterator(id, 0), | |
374ca955 | 158 | fData(theData), |
b75a7d8f | 159 | isDataOwned(isDataAdopted) { |
374ca955 | 160 | setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); |
b75a7d8f A |
161 | } |
162 | ||
163 | /** | |
164 | * Copy constructor. | |
165 | */ | |
166 | RuleBasedTransliterator::RuleBasedTransliterator( | |
167 | const RuleBasedTransliterator& other) : | |
374ca955 | 168 | Transliterator(other), fData(other.fData), |
b75a7d8f A |
169 | isDataOwned(other.isDataOwned) { |
170 | ||
171 | // The data object may or may not be owned. If it is not owned we | |
172 | // share it; it is invariant. If it is owned, it's still | |
173 | // invariant, but we need to copy it to prevent double-deletion. | |
174 | // If this becomes a performance issue (if people do a lot of RBT | |
175 | // copying -- unlikely) we can reference count the data object. | |
176 | ||
177 | // Only do a deep copy if this is owned data, that is, data that | |
178 | // will be later deleted. System transliterators contain | |
179 | // non-owned data. | |
180 | if (isDataOwned) { | |
374ca955 | 181 | fData = new TransliterationRuleData(*other.fData); |
b75a7d8f A |
182 | } |
183 | } | |
184 | ||
185 | /** | |
186 | * Destructor. | |
187 | */ | |
188 | RuleBasedTransliterator::~RuleBasedTransliterator() { | |
189 | // Delete the data object only if we own it. | |
190 | if (isDataOwned) { | |
374ca955 | 191 | delete fData; |
b75a7d8f A |
192 | } |
193 | } | |
194 | ||
195 | Transliterator* // Covariant return NOT ALLOWED (for portability) | |
196 | RuleBasedTransliterator::clone(void) const { | |
197 | return new RuleBasedTransliterator(*this); | |
198 | } | |
199 | ||
200 | /** | |
201 | * Implements {@link Transliterator#handleTransliterate}. | |
202 | */ | |
203 | void | |
204 | RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index, | |
205 | UBool isIncremental) const { | |
206 | /* We keep contextStart and contextLimit fixed the entire time, | |
207 | * relative to the text -- contextLimit may move numerically if | |
208 | * text is inserted or removed. The start offset moves toward | |
209 | * limit, with replacements happening under it. | |
210 | * | |
211 | * Example: rules 1. ab>x|y | |
212 | * 2. yc>z | |
213 | * | |
214 | * |eabcd begin - no match, advance start | |
215 | * e|abcd match rule 1 - change text & adjust start | |
216 | * ex|ycd match rule 2 - change text & adjust start | |
217 | * exz|d no match, advance start | |
218 | * exzd| done | |
219 | */ | |
220 | ||
221 | /* A rule like | |
222 | * a>b|a | |
223 | * creates an infinite loop. To prevent that, we put an arbitrary | |
224 | * limit on the number of iterations that we take, one that is | |
225 | * high enough that any reasonable rules are ok, but low enough to | |
226 | * prevent a server from hanging. The limit is 16 times the | |
227 | * number of characters n, unless n is so large that 16n exceeds a | |
228 | * uint32_t. | |
229 | */ | |
230 | uint32_t loopCount = 0; | |
231 | uint32_t loopLimit = index.limit - index.start; | |
232 | if (loopLimit >= 0x10000000) { | |
233 | loopLimit = 0xFFFFFFFF; | |
234 | } else { | |
235 | loopLimit <<= 4; | |
236 | } | |
237 | ||
374ca955 A |
238 | // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent |
239 | // operations must be prevented. | |
240 | // A Complication: compound transliterators can result in recursive entries to this | |
241 | // function, sometimes with different "This" objects, always with the same text. | |
242 | // Double-locking must be prevented in these cases. | |
243 | // | |
244 | ||
374ca955 | 245 | UBool lockedMutexAtThisLevel = FALSE; |
2ca993e8 A |
246 | |
247 | // Test whether this request is operating on the same text string as | |
248 | // some other transliteration that is still in progress and holding the | |
249 | // transliteration mutex. If so, do not lock the transliteration | |
250 | // mutex again. | |
251 | // | |
252 | // gLockedText variable is protected by the global ICU mutex. | |
253 | // Shared RBT data protected by transliteratorDataMutex. | |
254 | // | |
255 | // TODO(andy): Need a better scheme for handling this. | |
256 | UBool needToLock; | |
257 | { | |
258 | Mutex m; | |
57a6839d | 259 | needToLock = (&text != gLockedText); |
2ca993e8 A |
260 | } |
261 | if (needToLock) { | |
262 | umtx_lock(&transliteratorDataMutex); // Contention, longish waits possible here. | |
263 | Mutex m; | |
264 | gLockedText = &text; | |
265 | lockedMutexAtThisLevel = TRUE; | |
374ca955 A |
266 | } |
267 | ||
46f4442e A |
268 | // Check to make sure we don't dereference a null pointer. |
269 | if (fData != NULL) { | |
270 | while (index.start < index.limit && | |
271 | loopCount <= loopLimit && | |
272 | fData->ruleSet.transliterate(text, index, isIncremental)) { | |
273 | ++loopCount; | |
274 | } | |
b75a7d8f | 275 | } |
374ca955 | 276 | if (lockedMutexAtThisLevel) { |
2ca993e8 A |
277 | { |
278 | Mutex m; | |
279 | gLockedText = NULL; | |
280 | } | |
374ca955 A |
281 | umtx_unlock(&transliteratorDataMutex); |
282 | } | |
b75a7d8f A |
283 | } |
284 | ||
285 | UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource, | |
286 | UBool escapeUnprintable) const { | |
374ca955 | 287 | return fData->ruleSet.toRules(rulesSource, escapeUnprintable); |
b75a7d8f A |
288 | } |
289 | ||
290 | /** | |
291 | * Implement Transliterator framework | |
292 | */ | |
293 | void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const { | |
374ca955 | 294 | fData->ruleSet.getSourceTargetSet(result, FALSE); |
b75a7d8f A |
295 | } |
296 | ||
297 | /** | |
298 | * Override Transliterator framework | |
299 | */ | |
300 | UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const { | |
374ca955 | 301 | return fData->ruleSet.getSourceTargetSet(result, TRUE); |
b75a7d8f A |
302 | } |
303 | ||
304 | U_NAMESPACE_END | |
305 | ||
306 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |