2 **********************************************************************
3 * Copyright (C) 1999-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 11/17/99 aliu Creation.
8 **********************************************************************
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_TRANSLITERATION
15 #include "unicode/rep.h"
16 #include "unicode/uniset.h"
25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator
)
27 static UMTX transliteratorDataMutex
= NULL
;
28 static Replaceable
*gLockedText
= NULL
;
30 void RuleBasedTransliterator::_construct(const UnicodeString
& rules
,
31 UTransDirection direction
,
32 UParseError
& parseError
,
36 if (U_FAILURE(status
)) {
40 TransliteratorParser
parser(status
);
41 parser
.parse(rules
, direction
, parseError
, status
);
42 if (U_FAILURE(status
)) {
46 if (parser
.idBlockVector
.size() != 0 ||
47 parser
.compoundFilter
!= NULL
||
48 parser
.dataVector
.size() == 0) {
49 status
= U_INVALID_RBT_SYNTAX
; // ::ID blocks disallowed in RBT
53 fData
= (TransliterationRuleData
*)parser
.dataVector
.orphanElementAt(0);
54 setMaximumContextLength(fData
->ruleSet
.getMaximumContextLength());
58 * Constructs a new transliterator from the given rules.
59 * @param id the id for the transliterator.
60 * @param rules rules, separated by ';'
61 * @param direction either FORWARD or REVERSE.
62 * @param adoptedFilter the filter for this transliterator.
63 * @param parseError Struct to recieve information on position
64 * of error if an error is encountered
65 * @param status Output param set to success/failure code.
66 * @exception IllegalArgumentException if rules are malformed
67 * or direction is invalid.
69 RuleBasedTransliterator::RuleBasedTransliterator(
70 const UnicodeString
& id
,
71 const UnicodeString
& rules
,
72 UTransDirection direction
,
73 UnicodeFilter
* adoptedFilter
,
74 UParseError
& parseError
,
76 Transliterator(id
, adoptedFilter
) {
77 _construct(rules
, direction
,parseError
,status
);
81 * Constructs a new transliterator from the given rules.
82 * @param id the id for the transliterator.
83 * @param rules rules, separated by ';'
84 * @param direction either FORWARD or REVERSE.
85 * @param adoptedFilter the filter for this transliterator.
86 * @param status Output param set to success/failure code.
87 * @exception IllegalArgumentException if rules are malformed
88 * or direction is invalid.
90 /*RuleBasedTransliterator::RuleBasedTransliterator(
91 const UnicodeString& id,
92 const UnicodeString& rules,
93 UTransDirection direction,
94 UnicodeFilter* adoptedFilter,
96 Transliterator(id, adoptedFilter) {
97 UParseError parseError;
98 _construct(rules, direction,parseError, status);
102 * Convenience constructor with no filter.
104 /*RuleBasedTransliterator::RuleBasedTransliterator(
105 const UnicodeString& id,
106 const UnicodeString& rules,
107 UTransDirection direction,
108 UErrorCode& status) :
109 Transliterator(id, 0) {
110 UParseError parseError;
111 _construct(rules, direction,parseError, status);
115 * Convenience constructor with no filter and FORWARD direction.
117 /*RuleBasedTransliterator::RuleBasedTransliterator(
118 const UnicodeString& id,
119 const UnicodeString& rules,
120 UErrorCode& status) :
121 Transliterator(id, 0) {
122 UParseError parseError;
123 _construct(rules, UTRANS_FORWARD, parseError, status);
127 * Convenience constructor with FORWARD direction.
129 /*RuleBasedTransliterator::RuleBasedTransliterator(
130 const UnicodeString& id,
131 const UnicodeString& rules,
132 UnicodeFilter* adoptedFilter,
133 UErrorCode& status) :
134 Transliterator(id, adoptedFilter) {
135 UParseError parseError;
136 _construct(rules, UTRANS_FORWARD,parseError, status);
139 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString
& id
,
140 const TransliterationRuleData
* theData
,
141 UnicodeFilter
* adoptedFilter
) :
142 Transliterator(id
, adoptedFilter
),
143 fData((TransliterationRuleData
*)theData
), // cast away const
145 setMaximumContextLength(fData
->ruleSet
.getMaximumContextLength());
149 * Internal constructor.
151 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString
& id
,
152 TransliterationRuleData
* theData
,
153 UBool isDataAdopted
) :
154 Transliterator(id
, 0),
156 isDataOwned(isDataAdopted
) {
157 setMaximumContextLength(fData
->ruleSet
.getMaximumContextLength());
163 RuleBasedTransliterator::RuleBasedTransliterator(
164 const RuleBasedTransliterator
& other
) :
165 Transliterator(other
), fData(other
.fData
),
166 isDataOwned(other
.isDataOwned
) {
168 // The data object may or may not be owned. If it is not owned we
169 // share it; it is invariant. If it is owned, it's still
170 // invariant, but we need to copy it to prevent double-deletion.
171 // If this becomes a performance issue (if people do a lot of RBT
172 // copying -- unlikely) we can reference count the data object.
174 // Only do a deep copy if this is owned data, that is, data that
175 // will be later deleted. System transliterators contain
178 fData
= new TransliterationRuleData(*other
.fData
);
185 RuleBasedTransliterator::~RuleBasedTransliterator() {
186 // Delete the data object only if we own it.
192 Transliterator
* // Covariant return NOT ALLOWED (for portability)
193 RuleBasedTransliterator::clone(void) const {
194 return new RuleBasedTransliterator(*this);
198 * Implements {@link Transliterator#handleTransliterate}.
201 RuleBasedTransliterator::handleTransliterate(Replaceable
& text
, UTransPosition
& index
,
202 UBool isIncremental
) const {
203 /* We keep contextStart and contextLimit fixed the entire time,
204 * relative to the text -- contextLimit may move numerically if
205 * text is inserted or removed. The start offset moves toward
206 * limit, with replacements happening under it.
208 * Example: rules 1. ab>x|y
211 * |eabcd begin - no match, advance start
212 * e|abcd match rule 1 - change text & adjust start
213 * ex|ycd match rule 2 - change text & adjust start
214 * exz|d no match, advance start
220 * creates an infinite loop. To prevent that, we put an arbitrary
221 * limit on the number of iterations that we take, one that is
222 * high enough that any reasonable rules are ok, but low enough to
223 * prevent a server from hanging. The limit is 16 times the
224 * number of characters n, unless n is so large that 16n exceeds a
227 uint32_t loopCount
= 0;
228 uint32_t loopLimit
= index
.limit
- index
.start
;
229 if (loopLimit
>= 0x10000000) {
230 loopLimit
= 0xFFFFFFFF;
235 // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent
236 // operations must be prevented.
237 // A Complication: compound transliterators can result in recursive entries to this
238 // function, sometimes with different "This" objects, always with the same text.
239 // Double-locking must be prevented in these cases.
242 // If the transliteration data is exclusively owned by this transliterator object,
243 // we don't need to do any locking. No sharing between transliterators is possible,
244 // so no concurrent access from multiple threads is possible.
245 UBool lockedMutexAtThisLevel
= FALSE
;
246 if (isDataOwned
== FALSE
) {
247 // Test whether this request is operating on the same text string as some
248 // some other transliteration that is still in progress and holding the
249 // transliteration mutex. If so, do not lock the transliteration
252 UMTX_CHECK(NULL
, (&text
!= gLockedText
), needToLock
);
254 umtx_lock(&transliteratorDataMutex
);
256 lockedMutexAtThisLevel
= TRUE
;
260 // Check to make sure we don't dereference a null pointer.
262 while (index
.start
< index
.limit
&&
263 loopCount
<= loopLimit
&&
264 fData
->ruleSet
.transliterate(text
, index
, isIncremental
)) {
268 if (lockedMutexAtThisLevel
) {
270 umtx_unlock(&transliteratorDataMutex
);
274 UnicodeString
& RuleBasedTransliterator::toRules(UnicodeString
& rulesSource
,
275 UBool escapeUnprintable
) const {
276 return fData
->ruleSet
.toRules(rulesSource
, escapeUnprintable
);
280 * Implement Transliterator framework
282 void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet
& result
) const {
283 fData
->ruleSet
.getSourceTargetSet(result
, FALSE
);
287 * Override Transliterator framework
289 UnicodeSet
& RuleBasedTransliterator::getTargetSet(UnicodeSet
& result
) const {
290 return fData
->ruleSet
.getSourceTargetSet(result
, TRUE
);
295 #endif /* #if !UCONFIG_NO_TRANSLITERATION */