1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 1999-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * Date Name Description
9 * 11/17/99 aliu Creation.
10 **********************************************************************
13 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_TRANSLITERATION
17 #include "unicode/rep.h"
18 #include "unicode/uniset.h"
// RTTI boilerplate for RuleBasedTransliterator, followed by the two file-scope
// statics that handleTransliterate() uses for its locking scheme.
28 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator
)
// Mutex that handleTransliterate() locks around rule application and unlocks
// afterwards; the comments there describe it as protecting shared RBT data.
30 static UMutex transliteratorDataMutex
= U_MUTEX_INITIALIZER
;
// Compared against the address of the incoming text in handleTransliterate()
// to decide whether transliteratorDataMutex must be taken at that call level.
// The comments there state that this variable is protected by the global ICU
// mutex. Starts out NULL.
31 static Replaceable
*gLockedText
= NULL
;
// Shared construction helper: parses `rules` for the given `direction` with a
// TransliteratorParser and installs the resulting rule data in fData.
//
// rules      - rule source text handed to the parser.
// direction  - direction handed to the parser.
// parseError - passed through to parser.parse().
// status     - in/out error code; checked before and after parsing.
//
// NOTE(review): the line declaring the `status` parameter, the bodies of the
// U_FAILURE() checks and the closing braces are not visible in this excerpt;
// confirm them against the complete file before relying on this description.
33 void RuleBasedTransliterator::_construct(const UnicodeString
& rules
,
34 UTransDirection direction
,
35 UParseError
& parseError
,
// Test the incoming status before doing any parsing work.
39 if (U_FAILURE(status
)) {
// The parser is constructed with status, then run over the rules.
43 TransliteratorParser
parser(status
);
44 parser
.parse(rules
, direction
, parseError
, status
);
45 if (U_FAILURE(status
)) {
// Reject the parse result if it contains any ID blocks, has a compound filter,
// or produced no rule data at all.
49 if (parser
.idBlockVector
.size() != 0 ||
50 parser
.compoundFilter
!= NULL
||
51 parser
.dataVector
.size() == 0) {
52 status
= U_INVALID_RBT_SYNTAX
; // ::ID blocks disallowed in RBT
// Take element 0 out of the parser's data vector via orphanElementAt() and keep
// it as this object's rule data.
56 fData
= (TransliterationRuleData
*)parser
.dataVector
.orphanElementAt(0);
// Propagate the rule set's maximum context length to this transliterator.
57 setMaximumContextLength(fData
->ruleSet
.getMaximumContextLength());
61 * Constructs a new transliterator from the given rules.
62 * @param id the id for the transliterator.
63 * @param rules rules, separated by ';'
64 * @param direction either FORWARD or REVERSE.
65 * @param adoptedFilter the filter for this transliterator.
66 * @param parseError Struct to receive information on position
67 * of error if an error is encountered
68 * @param status Output param set to success/failure code.
69 * @exception IllegalArgumentException if rules are malformed
70 * or direction is invalid.
// Public constructor: forwards id and adoptedFilter to the Transliterator base
// class, then hands rules, direction, parseError and status to _construct().
// NOTE(review): the line declaring the `status` parameter is not visible in
// this excerpt, although status is passed to _construct() below.
// NOTE(review): the preceding doc comment mentions IllegalArgumentException;
// the visible code only reports problems through status -- confirm and reword.
72 RuleBasedTransliterator::RuleBasedTransliterator(
73 const UnicodeString
& id
,
74 const UnicodeString
& rules
,
75 UTransDirection direction
,
76 UnicodeFilter
* adoptedFilter
,
77 UParseError
& parseError
,
79 Transliterator(id
, adoptedFilter
) {
80 _construct(rules
, direction
,parseError
,status
);
84 * Constructs a new transliterator from the given rules.
85 * @param id the id for the transliterator.
86 * @param rules rules, separated by ';'
87 * @param direction either FORWARD or REVERSE.
88 * @param adoptedFilter the filter for this transliterator.
89 * @param status Output param set to success/failure code.
90 * @exception IllegalArgumentException if rules are malformed
91 * or direction is invalid.
93 /*RuleBasedTransliterator::RuleBasedTransliterator(
94 const UnicodeString& id,
95 const UnicodeString& rules,
96 UTransDirection direction,
97 UnicodeFilter* adoptedFilter,
99 Transliterator(id, adoptedFilter) {
100 UParseError parseError;
101 _construct(rules, direction,parseError, status);
105 * Convenience constructor with no filter.
107 /*RuleBasedTransliterator::RuleBasedTransliterator(
108 const UnicodeString& id,
109 const UnicodeString& rules,
110 UTransDirection direction,
111 UErrorCode& status) :
112 Transliterator(id, 0) {
113 UParseError parseError;
114 _construct(rules, direction,parseError, status);
118 * Convenience constructor with no filter and FORWARD direction.
120 /*RuleBasedTransliterator::RuleBasedTransliterator(
121 const UnicodeString& id,
122 const UnicodeString& rules,
123 UErrorCode& status) :
124 Transliterator(id, 0) {
125 UParseError parseError;
126 _construct(rules, UTRANS_FORWARD, parseError, status);
130 * Convenience constructor with FORWARD direction.
132 /*RuleBasedTransliterator::RuleBasedTransliterator(
133 const UnicodeString& id,
134 const UnicodeString& rules,
135 UnicodeFilter* adoptedFilter,
136 UErrorCode& status) :
137 Transliterator(id, adoptedFilter) {
138 UParseError parseError;
139 _construct(rules, UTRANS_FORWARD,parseError, status);
// Constructs a transliterator around existing rule data.
// id and adoptedFilter go to the Transliterator base class; theData is stored
// in fData after its const qualifier is cast away; the maximum context length
// is then taken from the data's rule set.
// NOTE(review): one initializer line between fData(...) and the body is not
// visible in this excerpt -- presumably it sets isDataOwned; confirm.
142 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString
& id
,
143 const TransliterationRuleData
* theData
,
144 UnicodeFilter
* adoptedFilter
) :
145 Transliterator(id
, adoptedFilter
),
146 fData((TransliterationRuleData
*)theData
), // cast away const
148 setMaximumContextLength(fData
->ruleSet
.getMaximumContextLength());
152 * Internal constructor.
// Internal constructor: passes id and a null (0) filter to the Transliterator
// base class, records isDataAdopted in isDataOwned, and takes the maximum
// context length from fData's rule set.
// NOTE(review): the initializer that stores theData in fData is not visible in
// this excerpt, yet fData is dereferenced in the body -- confirm it is present.
154 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString
& id
,
155 TransliterationRuleData
* theData
,
156 UBool isDataAdopted
) :
157 Transliterator(id
, 0),
159 isDataOwned(isDataAdopted
) {
160 setMaximumContextLength(fData
->ruleSet
.getMaximumContextLength());
// Copy constructor: copies the Transliterator base, then initially copies
// other's fData pointer and isDataOwned flag; the body may replace fData with
// a newly allocated copy of other's TransliterationRuleData.
// NOTE(review): the condition guarding the `new` below is not visible in this
// excerpt; per the comments it should apply only when the data is owned.
166 RuleBasedTransliterator::RuleBasedTransliterator(
167 const RuleBasedTransliterator
& other
) :
168 Transliterator(other
), fData(other
.fData
),
169 isDataOwned(other
.isDataOwned
) {
171 // The data object may or may not be owned. If it is not owned we
172 // share it; it is invariant. If it is owned, it's still
173 // invariant, but we need to copy it to prevent double-deletion.
174 // If this becomes a performance issue (if people do a lot of RBT
175 // copying -- unlikely) we can reference count the data object.
177 // Only do a deep copy if this is owned data, that is, data that
178 // will be later deleted. System transliterators contain
181 fData
= new TransliterationRuleData(*other
.fData
);
// Destructor.
// NOTE(review): the statements that carry out the conditional delete described
// by the comment below are not visible in this excerpt.
188 RuleBasedTransliterator::~RuleBasedTransliterator() {
189 // Delete the data object only if we own it.
// Returns a newly allocated RuleBasedTransliterator produced by the copy
// constructor from *this. The declared return type is the base-class pointer
// Transliterator* rather than RuleBasedTransliterator*.
195 Transliterator
* // Covariant return NOT ALLOWED (for portability)
196 RuleBasedTransliterator::clone(void) const {
197 return new RuleBasedTransliterator(*this);
201 * Implements {@link Transliterator#handleTransliterate}.
// Applies the rule set to `text` repeatedly while index.start < index.limit,
// the iteration count stays within loopLimit, and
// fData->ruleSet.transliterate() keeps returning true.
//
// text          - the text being transliterated; its address is also compared
//                 with gLockedText to detect a re-entrant call on the same text.
// index         - position struct; start and limit bound the loop.
// isIncremental - passed through unchanged to ruleSet.transliterate().
//
// Locking: transliteratorDataMutex is locked when the text differs from
// gLockedText, and unlocked at the end only if this call level locked it
// (tracked by lockedMutexAtThisLevel).
//
// NOTE(review): this excerpt omits the return type, the declaration of
// needToLock, the scopes that read and write gLockedText, the scaling of
// loopLimit by 16 described in the comment below, the null check announced
// before the loop, and the loop body. Confirm each against the complete file.
204 RuleBasedTransliterator::handleTransliterate(Replaceable
& text
, UTransPosition
& index
,
205 UBool isIncremental
) const {
206 /* We keep contextStart and contextLimit fixed the entire time,
207 * relative to the text -- contextLimit may move numerically if
208 * text is inserted or removed. The start offset moves toward
209 * limit, with replacements happening under it.
211 * Example: rules 1. ab>x|y
214 * |eabcd begin - no match, advance start
215 * e|abcd match rule 1 - change text & adjust start
216 * ex|ycd match rule 2 - change text & adjust start
217 * exz|d no match, advance start
223 * creates an infinite loop. To prevent that, we put an arbitrary
224 * limit on the number of iterations that we take, one that is
225 * high enough that any reasonable rules are ok, but low enough to
226 * prevent a server from hanging. The limit is 16 times the
227 * number of characters n, unless n is so large that 16n exceeds a
230 uint32_t loopCount
= 0;
231 uint32_t loopLimit
= index
.limit
- index
.start
;
232 if (loopLimit
>= 0x10000000) {
233 loopLimit
= 0xFFFFFFFF;
238 // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent
239 // operations must be prevented.
240 // A Complication: compound transliterators can result in recursive entries to this
241 // function, sometimes with different "This" objects, always with the same text.
242 // Double-locking must be prevented in these cases.
245 UBool lockedMutexAtThisLevel
= FALSE
;
247 // Test whether this request is operating on the same text string as
248 // some other transliteration that is still in progress and holding the
249 // transliteration mutex. If so, do not lock the transliteration
252 // gLockedText variable is protected by the global ICU mutex.
253 // Shared RBT data protected by transliteratorDataMutex.
255 // TODO(andy): Need a better scheme for handling this.
259 needToLock
= (&text
!= gLockedText
);
262 umtx_lock(&transliteratorDataMutex
); // Contention, longish waits possible here.
265 lockedMutexAtThisLevel
= TRUE
;
268 // Check to make sure we don't dereference a null pointer.
270 while (index
.start
< index
.limit
&&
271 loopCount
<= loopLimit
&&
272 fData
->ruleSet
.transliterate(text
, index
, isIncremental
)) {
276 if (lockedMutexAtThisLevel
) {
281 umtx_unlock(&transliteratorDataMutex
);
// Delegates to fData->ruleSet.toRules(), passing rulesSource and
// escapeUnprintable straight through, and returns that call's result.
// fData is dereferenced here without a null check.
285 UnicodeString
& RuleBasedTransliterator::toRules(UnicodeString
& rulesSource
,
286 UBool escapeUnprintable
) const {
287 return fData
->ruleSet
.toRules(rulesSource
, escapeUnprintable
);
291 * Implement Transliterator framework
// Calls fData->ruleSet.getSourceTargetSet() with `result` and FALSE as the
// second argument; getTargetSet() makes the same call with TRUE.
// fData is dereferenced here without a null check.
293 void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet
& result
) const {
294 fData
->ruleSet
.getSourceTargetSet(result
, FALSE
);
298 * Override Transliterator framework
// Calls fData->ruleSet.getSourceTargetSet() with `result` and TRUE as the
// second argument, and returns that call's result; handleGetSourceSet() makes
// the same call with FALSE. fData is dereferenced here without a null check.
300 UnicodeSet
& RuleBasedTransliterator::getTargetSet(UnicodeSet
& result
) const {
301 return fData
->ruleSet
.getSourceTargetSet(result
, TRUE
);
306 #endif /* #if !UCONFIG_NO_TRANSLITERATION */