]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/rbt_rule.cpp
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / i18n / rbt_rule.cpp
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
3* Copyright (C) 1999-2001, International Business Machines
4* Corporation and others. All Rights Reserved.
5**********************************************************************
6* Date Name Description
7* 11/17/99 aliu Creation.
8**********************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "unicode/rep.h"
16#include "unicode/unifilt.h"
17#include "unicode/uniset.h"
18#include "rbt_rule.h"
19#include "rbt_data.h"
20#include "cmemory.h"
21#include "strmatch.h"
22#include "strrepl.h"
23#include "util.h"
24
25static const UChar FORWARD_OP[] = {32,62,32,0}; // " > "
26
27U_NAMESPACE_BEGIN
28
29/**
30 * Construct a new rule with the given input, output text, and other
31 * attributes. A cursor position may be specified for the output text.
32 * @param input input string, including key and optional ante and
33 * post context
34 * @param anteContextPos offset into input to end of ante context, or -1 if
35 * none. Must be <= input.length() if not -1.
36 * @param postContextPos offset into input to start of post context, or -1
37 * if none. Must be <= input.length() if not -1, and must be >=
38 * anteContextPos.
39 * @param output output string
40 * @param cursorPosition offset into output at which cursor is located, or -1 if
41 * none. If less than zero, then the cursor is placed after the
42 * <code>output</code>; that is, -1 is equivalent to
43 * <code>output.length()</code>. If greater than
44 * <code>output.length()</code> then an exception is thrown.
45 * @param segs array of UnicodeFunctors corresponding to input pattern
46 * segments, or null if there are none. The array itself is adopted,
47 * but the pointers within it are not.
48 * @param segsCount number of elements in segs[]
49 * @param anchorStart TRUE if the the rule is anchored on the left to
50 * the context start
51 * @param anchorEnd TRUE if the rule is anchored on the right to the
52 * context limit
53 */
54TransliterationRule::TransliterationRule(const UnicodeString& input,
55 int32_t anteContextPos, int32_t postContextPos,
56 const UnicodeString& outputStr,
57 int32_t cursorPosition, int32_t cursorOffset,
58 UnicodeFunctor** segs,
59 int32_t segsCount,
60 UBool anchorStart, UBool anchorEnd,
61 const TransliterationRuleData* theData,
62 UErrorCode& status) :
63 UMemory(),
64 segments(0),
65 data(theData) {
66
67 if (U_FAILURE(status)) {
68 return;
69 }
70 // Do range checks only when warranted to save time
71 if (anteContextPos < 0) {
72 anteContextLength = 0;
73 } else {
74 if (anteContextPos > input.length()) {
75 // throw new IllegalArgumentException("Invalid ante context");
76 status = U_ILLEGAL_ARGUMENT_ERROR;
77 return;
78 }
79 anteContextLength = anteContextPos;
80 }
81 if (postContextPos < 0) {
82 keyLength = input.length() - anteContextLength;
83 } else {
84 if (postContextPos < anteContextLength ||
85 postContextPos > input.length()) {
86 // throw new IllegalArgumentException("Invalid post context");
87 status = U_ILLEGAL_ARGUMENT_ERROR;
88 return;
89 }
90 keyLength = postContextPos - anteContextLength;
91 }
92 if (cursorPosition < 0) {
93 cursorPosition = outputStr.length();
94 } else if (cursorPosition > outputStr.length()) {
95 // throw new IllegalArgumentException("Invalid cursor position");
96 status = U_ILLEGAL_ARGUMENT_ERROR;
97 return;
98 }
99 // We don't validate the segments array. The caller must
100 // guarantee that the segments are well-formed (that is, that
101 // all $n references in the output refer to indices of this
102 // array, and that no array elements are null).
103 this->segments = segs;
104 this->segmentsCount = segsCount;
105
106 pattern = input;
107 flags = 0;
108 if (anchorStart) {
109 flags |= ANCHOR_START;
110 }
111 if (anchorEnd) {
112 flags |= ANCHOR_END;
113 }
114
115 anteContext = NULL;
116 if (anteContextLength > 0) {
117 anteContext = new StringMatcher(pattern, 0, anteContextLength,
118 FALSE, *data);
119 /* test for NULL */
120 if (anteContext == 0) {
121 status = U_MEMORY_ALLOCATION_ERROR;
122 return;
123 }
124 }
125
126 key = NULL;
127 if (keyLength > 0) {
128 key = new StringMatcher(pattern, anteContextLength, anteContextLength + keyLength,
129 FALSE, *data);
130 /* test for NULL */
131 if (key == 0) {
132 status = U_MEMORY_ALLOCATION_ERROR;
133 return;
134 }
135 }
136
137 int32_t postContextLength = pattern.length() - keyLength - anteContextLength;
138 postContext = NULL;
139 if (postContextLength > 0) {
140 postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(),
141 FALSE, *data);
142 /* test for NULL */
143 if (postContext == 0) {
144 status = U_MEMORY_ALLOCATION_ERROR;
145 return;
146 }
147 }
148
149 this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data);
150 /* test for NULL */
151 if (this->output == 0) {
152 status = U_MEMORY_ALLOCATION_ERROR;
153 return;
154 }
155}
156
157/**
158 * Copy constructor.
159 */
160TransliterationRule::TransliterationRule(TransliterationRule& other) :
161 UMemory(other),
162 anteContext(NULL),
163 key(NULL),
164 postContext(NULL),
165 pattern(other.pattern),
166 anteContextLength(other.anteContextLength),
167 keyLength(other.keyLength),
168 flags(other.flags),
169 data(other.data) {
170
171 segments = NULL;
172 segmentsCount = 0;
173 if (other.segmentsCount > 0) {
174 segments = (UnicodeFunctor **)uprv_malloc(other.segmentsCount * sizeof(UnicodeFunctor *));
175 uprv_memcpy(segments, other.segments, other.segmentsCount*sizeof(segments[0]));
176 }
177
178 if (other.anteContext != NULL) {
179 anteContext = (StringMatcher*) other.anteContext->clone();
180 }
181 if (other.key != NULL) {
182 key = (StringMatcher*) other.key->clone();
183 }
184 if (other.postContext != NULL) {
185 postContext = (StringMatcher*) other.postContext->clone();
186 }
187 output = other.output->clone();
188}
189
190TransliterationRule::~TransliterationRule() {
191 uprv_free(segments);
192 delete anteContext;
193 delete key;
194 delete postContext;
195 delete output;
196}
197
198/**
199 * Return the preceding context length. This method is needed to
200 * support the <code>Transliterator</code> method
201 * <code>getMaximumContextLength()</code>. Internally, this is
202 * implemented as the anteContextLength, optionally plus one if
203 * there is a start anchor. The one character anchor gap is
204 * needed to make repeated incremental transliteration with
205 * anchors work.
206 */
207int32_t TransliterationRule::getContextLength(void) const {
208 return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0);
209}
210
211/**
212 * Internal method. Returns 8-bit index value for this rule.
213 * This is the low byte of the first character of the key,
214 * unless the first character of the key is a set. If it's a
215 * set, or otherwise can match multiple keys, the index value is -1.
216 */
217int16_t TransliterationRule::getIndexValue() const {
218 if (anteContextLength == pattern.length()) {
219 // A pattern with just ante context {such as foo)>bar} can
220 // match any key.
221 return -1;
222 }
223 UChar32 c = pattern.char32At(anteContextLength);
224 return (int16_t)(data->lookupMatcher(c) == NULL ? (c & 0xFF) : -1);
225}
226
227/**
228 * Internal method. Returns true if this rule matches the given
229 * index value. The index value is an 8-bit integer, 0..255,
230 * representing the low byte of the first character of the key.
231 * It matches this rule if it matches the first character of the
232 * key, or if the first character of the key is a set, and the set
233 * contains any character with a low byte equal to the index
234 * value. If the rule contains only ante context, as in foo)>bar,
235 * then it will match any key.
236 */
237UBool TransliterationRule::matchesIndexValue(uint8_t v) const {
238 // Delegate to the key, or if there is none, to the postContext.
239 // If there is neither then we match any key; return true.
240 UnicodeMatcher *m = (key != NULL) ? key : postContext;
241 return (m != NULL) ? m->matchesIndexValue(v) : TRUE;
242}
243
244/**
245 * Return true if this rule masks another rule. If r1 masks r2 then
246 * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
247 * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".
248 * "[c]a>x" masks "[dc]a>y".
249 */
250UBool TransliterationRule::masks(const TransliterationRule& r2) const {
251 /* Rule r1 masks rule r2 if the string formed of the
252 * antecontext, key, and postcontext overlaps in the following
253 * way:
254 *
255 * r1: aakkkpppp
256 * r2: aaakkkkkpppp
257 * ^
258 *
259 * The strings must be aligned at the first character of the
260 * key. The length of r1 to the left of the alignment point
261 * must be <= the length of r2 to the left; ditto for the
262 * right. The characters of r1 must equal (or be a superset
263 * of) the corresponding characters of r2. The superset
264 * operation should be performed to check for UnicodeSet
265 * masking.
266 *
267 * Anchors: Two patterns that differ only in anchors only
268 * mask one another if they are exactly equal, and r2 has
269 * all the anchors r1 has (optionally, plus some). Here Y
270 * means the row masks the column, N means it doesn't.
271 *
272 * ab ^ab ab$ ^ab$
273 * ab Y Y Y Y
274 * ^ab N Y N Y
275 * ab$ N N Y Y
276 * ^ab$ N N N Y
277 *
278 * Post context: {a}b masks ab, but not vice versa, since {a}b
279 * matches everything ab matches, and {a}b matches {|a|}b but ab
280 * does not. Pre context is different (a{b} does not align with
281 * ab).
282 */
283
284 /* LIMITATION of the current mask algorithm: Some rule
285 * maskings are currently not detected. For example,
286 * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO
287 */
288
289 int32_t len = pattern.length();
290 int32_t left = anteContextLength;
291 int32_t left2 = r2.anteContextLength;
292 int32_t right = len - left;
293 int32_t right2 = r2.pattern.length() - left2;
294
295 // TODO Clean this up -- some logic might be combinable with the
296 // next statement.
297
298 // Test for anchor masking
299 if (left == left2 && right == right2 &&
300 keyLength <= r2.keyLength &&
301 0 == r2.pattern.compare(0, len, pattern)) {
302 // The following boolean logic implements the table above
303 return (flags == r2.flags) ||
304 (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) ||
305 ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END));
306 }
307
308 return left <= left2 &&
309 (right < right2 ||
310 (right == right2 && keyLength <= r2.keyLength)) &&
311 0 == r2.pattern.compare(left2 - left, len, pattern);
312}
313
314static inline int32_t posBefore(const Replaceable& str, int32_t pos) {
315 return (pos > 0) ?
316 pos - UTF_CHAR_LENGTH(str.char32At(pos-1)) :
317 pos - 1;
318}
319
320static inline int32_t posAfter(const Replaceable& str, int32_t pos) {
321 return (pos >= 0 && pos < str.length()) ?
322 pos + UTF_CHAR_LENGTH(str.char32At(pos)) :
323 pos + 1;
324}
325
326/**
327 * Attempt a match and replacement at the given position. Return
328 * the degree of match between this rule and the given text. The
329 * degree of match may be mismatch, a partial match, or a full
330 * match. A mismatch means at least one character of the text
331 * does not match the context or key. A partial match means some
332 * context and key characters match, but the text is not long
333 * enough to match all of them. A full match means all context
334 * and key characters match.
335 *
336 * If a full match is obtained, perform a replacement, update pos,
337 * and return U_MATCH. Otherwise both text and pos are unchanged.
338 *
339 * @param text the text
340 * @param pos the position indices
341 * @param incremental if TRUE, test for partial matches that may
342 * be completed by additional text inserted at pos.limit.
343 * @return one of <code>U_MISMATCH</code>,
344 * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If
345 * incremental is FALSE then U_PARTIAL_MATCH will not be returned.
346 */
347UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
348 UTransPosition& pos,
349 UBool incremental) const {
350 // Matching and replacing are done in one method because the
351 // replacement operation needs information obtained during the
352 // match. Another way to do this is to have the match method
353 // create a match result struct with relevant offsets, and to pass
354 // this into the replace method.
355
356 // ============================ MATCH ===========================
357
358 // Reset segment match data
359 if (segments != NULL) {
360 for (int32_t i=0; i<segmentsCount; ++i) {
361 ((StringMatcher*) segments[i])->resetMatch();
362 }
363 }
364
365// int32_t lenDelta, keyLimit;
366 int32_t keyLimit;
367
368 // ------------------------ Ante Context ------------------------
369
370 // A mismatch in the ante context, or with the start anchor,
371 // is an outright U_MISMATCH regardless of whether we are
372 // incremental or not.
373 int32_t oText; // offset into 'text'
374// int32_t newStart = 0;
375 int32_t minOText;
376
377 // Note (1): We process text in 16-bit code units, rather than
378 // 32-bit code points. This works because stand-ins are
379 // always in the BMP and because we are doing a literal match
380 // operation, which can be done 16-bits at a time.
381
382 int32_t anteLimit = posBefore(text, pos.contextStart);
383
384 UMatchDegree match;
385
386 // Start reverse match at char before pos.start
387 oText = posBefore(text, pos.start);
388
389 if (anteContext != NULL) {
390 match = anteContext->matches(text, oText, anteLimit, FALSE);
391 if (match != U_MATCH) {
392 return U_MISMATCH;
393 }
394 }
395
396 minOText = posAfter(text, oText);
397
398 // ------------------------ Start Anchor ------------------------
399
400 if (((flags & ANCHOR_START) != 0) && oText != anteLimit) {
401 return U_MISMATCH;
402 }
403
404 // -------------------- Key and Post Context --------------------
405
406 oText = pos.start;
407
408 if (key != NULL) {
409 match = key->matches(text, oText, pos.limit, incremental);
410 if (match != U_MATCH) {
411 return match;
412 }
413 }
414
415 keyLimit = oText;
416
417 if (postContext != NULL) {
418 if (incremental && keyLimit == pos.limit) {
419 // The key matches just before pos.limit, and there is
420 // a postContext. Since we are in incremental mode,
421 // we must assume more characters may be inserted at
422 // pos.limit -- this is a partial match.
423 return U_PARTIAL_MATCH;
424 }
425
426 match = postContext->matches(text, oText, pos.contextLimit, incremental);
427 if (match != U_MATCH) {
428 return match;
429 }
430 }
431
432 // ------------------------- Stop Anchor ------------------------
433
434 if (((flags & ANCHOR_END)) != 0) {
435 if (oText != pos.contextLimit) {
436 return U_MISMATCH;
437 }
438 if (incremental) {
439 return U_PARTIAL_MATCH;
440 }
441 }
442
443 // =========================== REPLACE ==========================
444
445 // We have a full match. The key is between pos.start and
446 // keyLimit.
447
448 int32_t newStart;
449 int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart);
450 int32_t lenDelta = newLength - (keyLimit - pos.start);
451
452 oText += lenDelta;
453 pos.limit += lenDelta;
454 pos.contextLimit += lenDelta;
455 // Restrict new value of start to [minOText, min(oText, pos.limit)].
456 pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart));
457 return U_MATCH;
458}
459
460/**
461 * Create a source string that represents this rule. Append it to the
462 * given string.
463 */
464UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
465 UBool escapeUnprintable) const {
466
467 // Accumulate special characters (and non-specials following them)
468 // into quoteBuf. Append quoteBuf, within single quotes, when
469 // a non-quoted element must be inserted.
470 UnicodeString str, quoteBuf;
471
472 // Do not emit the braces '{' '}' around the pattern if there
473 // is neither anteContext nor postContext.
474 UBool emitBraces =
475 (anteContext != NULL) || (postContext != NULL);
476
477 // Emit start anchor
478 if ((flags & ANCHOR_START) != 0) {
479 rule.append((UChar)94/*^*/);
480 }
481
482 // Emit the input pattern
483 ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf);
484
485 if (emitBraces) {
486 ICU_Utility::appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf);
487 }
488
489 ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf);
490
491 if (emitBraces) {
492 ICU_Utility::appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
493 }
494
495 ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf);
496
497 // Emit end anchor
498 if ((flags & ANCHOR_END) != 0) {
499 rule.append((UChar)36/*$*/);
500 }
501
502 ICU_Utility::appendToRule(rule, FORWARD_OP, TRUE, escapeUnprintable, quoteBuf);
503
504 // Emit the output pattern
505
506 ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable),
507 TRUE, escapeUnprintable, quoteBuf);
508
509 ICU_Utility::appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf);
510
511 return rule;
512}
513
514void TransliterationRule::setData(const TransliterationRuleData* d) {
515 data = d;
516 if (anteContext != NULL) anteContext->setData(d);
517 if (postContext != NULL) postContext->setData(d);
518 if (key != NULL) key->setData(d);
519 // assert(output != NULL);
520 output->setData(d);
521 // Don't have to do segments since they are in the context or key
522}
523
524/**
525 * Union the set of all characters that may be modified by this rule
526 * into the given set.
527 */
528void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const {
529 int32_t limit = anteContextLength + keyLength;
530 for (int32_t i=anteContextLength; i<limit; ) {
531 UChar32 ch = pattern.char32At(i);
532 i += UTF_CHAR_LENGTH(ch);
533 const UnicodeMatcher* matcher = data->lookupMatcher(ch);
534 if (matcher == NULL) {
535 toUnionTo.add(ch);
536 } else {
537 matcher->addMatchSetTo(toUnionTo);
538 }
539 }
540}
541
542/**
543 * Union the set of all characters that may be emitted by this rule
544 * into the given set.
545 */
546void TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const {
547 output->toReplacer()->addReplacementSetTo(toUnionTo);
548}
549
550U_NAMESPACE_END
551
552#endif /* #if !UCONFIG_NO_TRANSLITERATION */
553
554//eof