2 **********************************************************************
3 * Copyright (C) 1999-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 11/17/99 aliu Creation.
8 **********************************************************************
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_TRANSLITERATION
15 #include "unicode/rep.h"
16 #include "unicode/uniset.h"
// Expands the RTTI implementation boilerplate for RuleBasedTransliterator
// (the macro itself is defined outside this listing).
25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator
)
// File-scope mutex. handleTransliterate() locks it around rule application
// when the rule data is not exclusively owned by the transliterator.
27 static UMTX transliteratorDataMutex
= NULL
;
// File-scope pointer that handleTransliterate() compares against the address
// of the text it was given, to decide whether the mutex must be taken.
// NOTE(review): the statements that assign this pointer are not visible in
// this listing.
28 static Replaceable
*gLockedText
= NULL
;
// Shared construction helper invoked by the rule-string constructors:
// parses `rules` for the given `direction` and stores the parser's
// resulting rule data in fData.
// NOTE(review): the embedded numbering skips 32->36, 36->40, 42->46 and
// 48->52, so the declaration of `status` and the bodies of the failure
// branches are not visible in this listing.
30 void RuleBasedTransliterator::_construct(const UnicodeString
& rules
,
31 UTransDirection direction
,
32 UParseError
& parseError
,
// Check the incoming status before doing any parsing.
36 if (U_FAILURE(status
)) {
40 TransliteratorParser parser
;
41 parser
.parse(rules
, direction
, parseError
, status
);
// Check whether parsing itself failed.
42 if (U_FAILURE(status
)) {
// A non-empty ID block or a compound filter in the parse result is
// reported as U_INVALID_RBT_SYNTAX.
46 if (parser
.idBlock
.length() != 0 ||
47 parser
.compoundFilter
!= NULL
) {
48 status
= U_INVALID_RBT_SYNTAX
; // ::ID blocks disallowed in RBT
// Take the parsed rule data from the parser (presumably transferring
// ownership to this object -- confirm against TransliteratorParser).
52 fData
= parser
.orphanData();
// Record the rule set's maximum context length on this transliterator.
53 setMaximumContextLength(fData
->ruleSet
.getMaximumContextLength());
57 * Constructs a new transliterator from the given rules.
58 * @param id the id for the transliterator.
59 * @param rules rules, separated by ';'
60 * @param direction either FORWARD or REVERSE.
61 * @param adoptedFilter the filter for this transliterator.
62 * @param parseError Struct to receive information on position
63 * of error if an error is encountered
64 * @param status Output param set to success/failure code.
65 * @exception IllegalArgumentException if rules are malformed
66 * or direction is invalid.
// Rule-string constructor that reports parse-error details through the
// caller's `parseError`. Forwards `id` and `adoptedFilter` to the
// Transliterator base and delegates rule parsing to _construct().
// NOTE(review): the embedded numbering skips 73->75, so the declaration of
// the `status` parameter used below is not visible in this listing.
68 RuleBasedTransliterator::RuleBasedTransliterator(
69 const UnicodeString
& id
,
70 const UnicodeString
& rules
,
71 UTransDirection direction
,
72 UnicodeFilter
* adoptedFilter
,
73 UParseError
& parseError
,
75 Transliterator(id
, adoptedFilter
) {
76 _construct(rules
, direction
,parseError
,status
);
80 * Constructs a new transliterator from the given rules.
81 * @param id the id for the transliterator.
82 * @param rules rules, separated by ';'
83 * @param direction either FORWARD or REVERSE.
84 * @param adoptedFilter the filter for this transliterator.
85 * @param status Output param set to success/failure code.
86 * @exception IllegalArgumentException if rules are malformed
87 * or direction is invalid.
// Rule-string constructor without a parse-error out-parameter. Forwards `id`
// and `adoptedFilter` to the Transliterator base and delegates rule parsing
// to _construct().
// NOTE(review): the embedded numbering skips 93->95, so the declaration of
// the `status` parameter used below is not visible in this listing.
89 RuleBasedTransliterator::RuleBasedTransliterator(
90 const UnicodeString
& id
,
91 const UnicodeString
& rules
,
92 UTransDirection direction
,
93 UnicodeFilter
* adoptedFilter
,
95 Transliterator(id
, adoptedFilter
) {
// Parse-error details are collected in a local and not exposed to the caller.
96 UParseError parseError
;
97 _construct(rules
, direction
,parseError
, status
);
101 * Covenience constructor with no filter.
103 RuleBasedTransliterator::RuleBasedTransliterator(
104 const UnicodeString
& id
,
105 const UnicodeString
& rules
,
106 UTransDirection direction
,
107 UErrorCode
& status
) :
108 Transliterator(id
, 0) {
109 UParseError parseError
;
110 _construct(rules
, direction
,parseError
, status
);
114 * Covenience constructor with no filter and FORWARD direction.
116 RuleBasedTransliterator::RuleBasedTransliterator(
117 const UnicodeString
& id
,
118 const UnicodeString
& rules
,
119 UErrorCode
& status
) :
120 Transliterator(id
, 0) {
121 UParseError parseError
;
122 _construct(rules
, UTRANS_FORWARD
, parseError
, status
);
126 * Covenience constructor with FORWARD direction.
128 RuleBasedTransliterator::RuleBasedTransliterator(
129 const UnicodeString
& id
,
130 const UnicodeString
& rules
,
131 UnicodeFilter
* adoptedFilter
,
132 UErrorCode
& status
) :
133 Transliterator(id
, adoptedFilter
) {
134 UParseError parseError
;
135 _construct(rules
, UTRANS_FORWARD
,parseError
, status
);
// Constructs a transliterator around rule data supplied by the caller rather
// than parsed from a rule string. `id` and `adoptedFilter` go to the
// Transliterator base; fData stores `theData` with its const qualifier
// cast away.
// NOTE(review): the embedded numbering skips 142->144, so the rest of the
// initializer list and the opening brace are not visible in this listing.
138 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString
& id
,
139 const TransliterationRuleData
* theData
,
140 UnicodeFilter
* adoptedFilter
) :
141 Transliterator(id
, adoptedFilter
),
142 fData((TransliterationRuleData
*)theData
), // cast away const
// Record the rule set's maximum context length on this transliterator.
144 setMaximumContextLength(fData
->ruleSet
.getMaximumContextLength());
148 * Internal constructor.
// Constructs a transliterator with no filter (0 is passed to the base) and
// sets isDataOwned from the caller's `isDataAdopted` flag.
// NOTE(review): the embedded numbering skips 153->155, so the initializer
// that stores `theData` is not visible in this listing.
150 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString
& id
,
151 TransliterationRuleData
* theData
,
152 UBool isDataAdopted
) :
153 Transliterator(id
, 0),
155 isDataOwned(isDataAdopted
) {
// Record the rule set's maximum context length on this transliterator.
156 setMaximumContextLength(fData
->ruleSet
.getMaximumContextLength());
// Copy constructor. The initializer list copies the base object and takes
// over other's fData pointer and isDataOwned flag as-is; the body then
// replaces fData with a newly allocated copy of other's data.
// NOTE(review): the embedded numbering skips 174->177, so the condition
// guarding the deep copy (described by the comments below) and the end of
// the last comment sentence are not visible in this listing.
162 RuleBasedTransliterator::RuleBasedTransliterator(
163 const RuleBasedTransliterator
& other
) :
164 Transliterator(other
), fData(other
.fData
),
165 isDataOwned(other
.isDataOwned
) {
167 // The data object may or may not be owned. If it is not owned we
168 // share it; it is invariant. If it is owned, it's still
169 // invariant, but we need to copy it to prevent double-deletion.
170 // If this becomes a performance issue (if people do a lot of RBT
171 // copying -- unlikely) we can reference count the data object.
173 // Only do a deep copy if this is owned data, that is, data that
174 // will be later deleted. System transliterators contain
177 fData
= new TransliterationRuleData(*other
.fData
);
// Destructor.
// NOTE(review): the embedded numbering skips 185->191, so the statements
// that implement the comment below are not visible in this listing.
184 RuleBasedTransliterator::~RuleBasedTransliterator() {
185 // Delete the data object only if we own it.
191 Transliterator
* // Covariant return NOT ALLOWED (for portability)
192 RuleBasedTransliterator::clone(void) const {
193 return new RuleBasedTransliterator(*this);
197 * Implements {@link Transliterator#handleTransliterate}.
// Applies the rule set to `text`: keeps calling
// fData->ruleSet.transliterate(text, index, isIncremental) while
// index.start < index.limit, loopCount <= loopLimit, and the call returns
// true. When the rule data is not exclusively owned, the work is done under
// transliteratorDataMutex.
// NOTE(review): this listing is incomplete. The return-type line, the loop
// body (including whatever advances loopCount), and several closing braces
// are not visible (the embedded numbering has gaps).
200 RuleBasedTransliterator::handleTransliterate(Replaceable
& text
, UTransPosition
& index
,
201 UBool isIncremental
) const {
// NOTE(review): the block comment opened on the next line is never closed
// in this listing (its terminating line is missing), and parts of its text
// are missing as well.
202 /* We keep contextStart and contextLimit fixed the entire time,
203 * relative to the text -- contextLimit may move numerically if
204 * text is inserted or removed. The start offset moves toward
205 * limit, with replacements happening under it.
207 * Example: rules 1. ab>x|y
210 * |eabcd begin - no match, advance start
211 * e|abcd match rule 1 - change text & adjust start
212 * ex|ycd match rule 2 - change text & adjust start
213 * exz|d no match, advance start
219 * creates an infinite loop. To prevent that, we put an arbitrary
220 * limit on the number of iterations that we take, one that is
221 * high enough that any reasonable rules are ok, but low enough to
222 * prevent a server from hanging. The limit is 16 times the
223 * number of characters n, unless n is so large that 16n exceeds a
226 uint32_t loopCount
= 0;
// loopLimit starts as the length of the range still to be processed.
227 uint32_t loopLimit
= index
.limit
- index
.start
;
// Very large ranges get the maximum uint32_t value as their limit.
// NOTE(review): the comment above describes a 16x scaling for smaller
// ranges; that step is not visible here (numbering skips 229->234).
228 if (loopLimit
>= 0x10000000) {
229 loopLimit
= 0xFFFFFFFF;
234 // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent
235 // operations must be prevented.
236 // A Complication: compound transliterators can result in recursive entries to this
237 // function, sometimes with different "This" objects, always with the same text.
238 // Double-locking must be prevented in these cases.
241 // If the transliteration data is exclusively owned by this transliterator object,
242 // we don't need to do any locking. No sharing between transliterators is possible,
243 // so no concurrent access from multiple threads is possible.
// Remembers whether this call took the mutex, so that only the call that
// acquired it releases it at the end.
244 UBool lockedMutexAtThisLevel
= FALSE
;
245 if (isDataOwned
== FALSE
) {
247 // Test whether this request is operating on the same text string as some
248 // other transliteration that is still in progress and holding the
249 // transliteration mutex. If so, do not lock the transliteration
// Locking is needed only when `text` is a different object from the one
// recorded in gLockedText.
251 UBool needToLock
= (&text
!= gLockedText
);
254 umtx_lock(&transliteratorDataMutex
);
256 lockedMutexAtThisLevel
= TRUE
;
// Main loop: stop when the range is exhausted, the iteration cap is
// exceeded, or the rule set reports that it cannot continue.
261 while (index
.start
< index
.limit
&&
262 loopCount
<= loopLimit
&&
263 fData
->ruleSet
.transliterate(text
, index
, isIncremental
)) {
// Release the mutex only if this invocation acquired it.
266 if (lockedMutexAtThisLevel
) {
268 umtx_unlock(&transliteratorDataMutex
);
272 UnicodeString
& RuleBasedTransliterator::toRules(UnicodeString
& rulesSource
,
273 UBool escapeUnprintable
) const {
274 return fData
->ruleSet
.toRules(rulesSource
, escapeUnprintable
);
278 * Implement Transliterator framework
280 void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet
& result
) const {
281 fData
->ruleSet
.getSourceTargetSet(result
, FALSE
);
285 * Override Transliterator framework
287 UnicodeSet
& RuleBasedTransliterator::getTargetSet(UnicodeSet
& result
) const {
288 return fData
->ruleSet
.getSourceTargetSet(result
, TRUE
);
293 #endif /* #if !UCONFIG_NO_TRANSLITERATION */