]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | ********************************************************************** | |
3 | * Copyright (C) 1999-2011, International Business Machines | |
4 | * Corporation and others. All Rights Reserved. | |
5 | ********************************************************************** | |
6 | * Date Name Description | |
7 | * 11/17/99 aliu Creation. | |
8 | ********************************************************************** | |
9 | */ | |
10 | ||
11 | #include "unicode/utypes.h" | |
12 | ||
13 | #if !UCONFIG_NO_TRANSLITERATION | |
14 | ||
15 | #include "unicode/rep.h" | |
16 | #include "unicode/unifilt.h" | |
17 | #include "unicode/uniset.h" | |
18 | #include "unicode/utf16.h" | |
19 | #include "rbt_rule.h" | |
20 | #include "rbt_data.h" | |
21 | #include "cmemory.h" | |
22 | #include "strmatch.h" | |
23 | #include "strrepl.h" | |
24 | #include "util.h" | |
25 | #include "putilimp.h" | |
26 | ||
27 | static const UChar FORWARD_OP[] = {32,62,32,0}; // " > " | |
28 | ||
29 | U_NAMESPACE_BEGIN | |
30 | ||
31 | /** | |
32 | * Construct a new rule with the given input, output text, and other | |
33 | * attributes. A cursor position may be specified for the output text. | |
34 | * @param input input string, including key and optional ante and | |
35 | * post context | |
36 | * @param anteContextPos offset into input to end of ante context, or -1 if | |
37 | * none. Must be <= input.length() if not -1. | |
38 | * @param postContextPos offset into input to start of post context, or -1 | |
39 | * if none. Must be <= input.length() if not -1, and must be >= | |
40 | * anteContextPos. | |
41 | * @param output output string | |
42 | * @param cursorPosition offset into output at which cursor is located, or -1 if | |
43 | * none. If less than zero, then the cursor is placed after the | |
44 | * <code>output</code>; that is, -1 is equivalent to | |
45 | * <code>output.length()</code>. If greater than | |
46 | * <code>output.length()</code> then an exception is thrown. | |
47 | * @param segs array of UnicodeFunctors corresponding to input pattern | |
48 | * segments, or null if there are none. The array itself is adopted, | |
49 | * but the pointers within it are not. | |
50 | * @param segsCount number of elements in segs[] | |
51 | * @param anchorStart TRUE if the the rule is anchored on the left to | |
52 | * the context start | |
53 | * @param anchorEnd TRUE if the rule is anchored on the right to the | |
54 | * context limit | |
55 | */ | |
56 | TransliterationRule::TransliterationRule(const UnicodeString& input, | |
57 | int32_t anteContextPos, int32_t postContextPos, | |
58 | const UnicodeString& outputStr, | |
59 | int32_t cursorPosition, int32_t cursorOffset, | |
60 | UnicodeFunctor** segs, | |
61 | int32_t segsCount, | |
62 | UBool anchorStart, UBool anchorEnd, | |
63 | const TransliterationRuleData* theData, | |
64 | UErrorCode& status) : | |
65 | UMemory(), | |
66 | segments(0), | |
67 | data(theData) { | |
68 | ||
69 | if (U_FAILURE(status)) { | |
70 | return; | |
71 | } | |
72 | // Do range checks only when warranted to save time | |
73 | if (anteContextPos < 0) { | |
74 | anteContextLength = 0; | |
75 | } else { | |
76 | if (anteContextPos > input.length()) { | |
77 | // throw new IllegalArgumentException("Invalid ante context"); | |
78 | status = U_ILLEGAL_ARGUMENT_ERROR; | |
79 | return; | |
80 | } | |
81 | anteContextLength = anteContextPos; | |
82 | } | |
83 | if (postContextPos < 0) { | |
84 | keyLength = input.length() - anteContextLength; | |
85 | } else { | |
86 | if (postContextPos < anteContextLength || | |
87 | postContextPos > input.length()) { | |
88 | // throw new IllegalArgumentException("Invalid post context"); | |
89 | status = U_ILLEGAL_ARGUMENT_ERROR; | |
90 | return; | |
91 | } | |
92 | keyLength = postContextPos - anteContextLength; | |
93 | } | |
94 | if (cursorPosition < 0) { | |
95 | cursorPosition = outputStr.length(); | |
96 | } else if (cursorPosition > outputStr.length()) { | |
97 | // throw new IllegalArgumentException("Invalid cursor position"); | |
98 | status = U_ILLEGAL_ARGUMENT_ERROR; | |
99 | return; | |
100 | } | |
101 | // We don't validate the segments array. The caller must | |
102 | // guarantee that the segments are well-formed (that is, that | |
103 | // all $n references in the output refer to indices of this | |
104 | // array, and that no array elements are null). | |
105 | this->segments = segs; | |
106 | this->segmentsCount = segsCount; | |
107 | ||
108 | pattern = input; | |
109 | flags = 0; | |
110 | if (anchorStart) { | |
111 | flags |= ANCHOR_START; | |
112 | } | |
113 | if (anchorEnd) { | |
114 | flags |= ANCHOR_END; | |
115 | } | |
116 | ||
117 | anteContext = NULL; | |
118 | if (anteContextLength > 0) { | |
119 | anteContext = new StringMatcher(pattern, 0, anteContextLength, | |
120 | FALSE, *data); | |
121 | /* test for NULL */ | |
122 | if (anteContext == 0) { | |
123 | status = U_MEMORY_ALLOCATION_ERROR; | |
124 | return; | |
125 | } | |
126 | } | |
127 | ||
128 | key = NULL; | |
129 | if (keyLength > 0) { | |
130 | key = new StringMatcher(pattern, anteContextLength, anteContextLength + keyLength, | |
131 | FALSE, *data); | |
132 | /* test for NULL */ | |
133 | if (key == 0) { | |
134 | status = U_MEMORY_ALLOCATION_ERROR; | |
135 | return; | |
136 | } | |
137 | } | |
138 | ||
139 | int32_t postContextLength = pattern.length() - keyLength - anteContextLength; | |
140 | postContext = NULL; | |
141 | if (postContextLength > 0) { | |
142 | postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(), | |
143 | FALSE, *data); | |
144 | /* test for NULL */ | |
145 | if (postContext == 0) { | |
146 | status = U_MEMORY_ALLOCATION_ERROR; | |
147 | return; | |
148 | } | |
149 | } | |
150 | ||
151 | this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data); | |
152 | /* test for NULL */ | |
153 | if (this->output == 0) { | |
154 | status = U_MEMORY_ALLOCATION_ERROR; | |
155 | return; | |
156 | } | |
157 | } | |
158 | ||
159 | /** | |
160 | * Copy constructor. | |
161 | */ | |
162 | TransliterationRule::TransliterationRule(TransliterationRule& other) : | |
163 | UMemory(other), | |
164 | anteContext(NULL), | |
165 | key(NULL), | |
166 | postContext(NULL), | |
167 | pattern(other.pattern), | |
168 | anteContextLength(other.anteContextLength), | |
169 | keyLength(other.keyLength), | |
170 | flags(other.flags), | |
171 | data(other.data) { | |
172 | ||
173 | segments = NULL; | |
174 | segmentsCount = 0; | |
175 | if (other.segmentsCount > 0) { | |
176 | segments = (UnicodeFunctor **)uprv_malloc(other.segmentsCount * sizeof(UnicodeFunctor *)); | |
177 | uprv_memcpy(segments, other.segments, other.segmentsCount*sizeof(segments[0])); | |
178 | } | |
179 | ||
180 | if (other.anteContext != NULL) { | |
181 | anteContext = (StringMatcher*) other.anteContext->clone(); | |
182 | } | |
183 | if (other.key != NULL) { | |
184 | key = (StringMatcher*) other.key->clone(); | |
185 | } | |
186 | if (other.postContext != NULL) { | |
187 | postContext = (StringMatcher*) other.postContext->clone(); | |
188 | } | |
189 | output = other.output->clone(); | |
190 | } | |
191 | ||
192 | TransliterationRule::~TransliterationRule() { | |
193 | uprv_free(segments); | |
194 | delete anteContext; | |
195 | delete key; | |
196 | delete postContext; | |
197 | delete output; | |
198 | } | |
199 | ||
200 | /** | |
201 | * Return the preceding context length. This method is needed to | |
202 | * support the <code>Transliterator</code> method | |
203 | * <code>getMaximumContextLength()</code>. Internally, this is | |
204 | * implemented as the anteContextLength, optionally plus one if | |
205 | * there is a start anchor. The one character anchor gap is | |
206 | * needed to make repeated incremental transliteration with | |
207 | * anchors work. | |
208 | */ | |
209 | int32_t TransliterationRule::getContextLength(void) const { | |
210 | return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0); | |
211 | } | |
212 | ||
213 | /** | |
214 | * Internal method. Returns 8-bit index value for this rule. | |
215 | * This is the low byte of the first character of the key, | |
216 | * unless the first character of the key is a set. If it's a | |
217 | * set, or otherwise can match multiple keys, the index value is -1. | |
218 | */ | |
219 | int16_t TransliterationRule::getIndexValue() const { | |
220 | if (anteContextLength == pattern.length()) { | |
221 | // A pattern with just ante context {such as foo)>bar} can | |
222 | // match any key. | |
223 | return -1; | |
224 | } | |
225 | UChar32 c = pattern.char32At(anteContextLength); | |
226 | return (int16_t)(data->lookupMatcher(c) == NULL ? (c & 0xFF) : -1); | |
227 | } | |
228 | ||
229 | /** | |
230 | * Internal method. Returns true if this rule matches the given | |
231 | * index value. The index value is an 8-bit integer, 0..255, | |
232 | * representing the low byte of the first character of the key. | |
233 | * It matches this rule if it matches the first character of the | |
234 | * key, or if the first character of the key is a set, and the set | |
235 | * contains any character with a low byte equal to the index | |
236 | * value. If the rule contains only ante context, as in foo)>bar, | |
237 | * then it will match any key. | |
238 | */ | |
239 | UBool TransliterationRule::matchesIndexValue(uint8_t v) const { | |
240 | // Delegate to the key, or if there is none, to the postContext. | |
241 | // If there is neither then we match any key; return true. | |
242 | UnicodeMatcher *m = (key != NULL) ? key : postContext; | |
243 | return (m != NULL) ? m->matchesIndexValue(v) : TRUE; | |
244 | } | |
245 | ||
246 | /** | |
247 | * Return true if this rule masks another rule. If r1 masks r2 then | |
248 | * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks | |
249 | * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". | |
250 | * "[c]a>x" masks "[dc]a>y". | |
251 | */ | |
252 | UBool TransliterationRule::masks(const TransliterationRule& r2) const { | |
253 | /* Rule r1 masks rule r2 if the string formed of the | |
254 | * antecontext, key, and postcontext overlaps in the following | |
255 | * way: | |
256 | * | |
257 | * r1: aakkkpppp | |
258 | * r2: aaakkkkkpppp | |
259 | * ^ | |
260 | * | |
261 | * The strings must be aligned at the first character of the | |
262 | * key. The length of r1 to the left of the alignment point | |
263 | * must be <= the length of r2 to the left; ditto for the | |
264 | * right. The characters of r1 must equal (or be a superset | |
265 | * of) the corresponding characters of r2. The superset | |
266 | * operation should be performed to check for UnicodeSet | |
267 | * masking. | |
268 | * | |
269 | * Anchors: Two patterns that differ only in anchors only | |
270 | * mask one another if they are exactly equal, and r2 has | |
271 | * all the anchors r1 has (optionally, plus some). Here Y | |
272 | * means the row masks the column, N means it doesn't. | |
273 | * | |
274 | * ab ^ab ab$ ^ab$ | |
275 | * ab Y Y Y Y | |
276 | * ^ab N Y N Y | |
277 | * ab$ N N Y Y | |
278 | * ^ab$ N N N Y | |
279 | * | |
280 | * Post context: {a}b masks ab, but not vice versa, since {a}b | |
281 | * matches everything ab matches, and {a}b matches {|a|}b but ab | |
282 | * does not. Pre context is different (a{b} does not align with | |
283 | * ab). | |
284 | */ | |
285 | ||
286 | /* LIMITATION of the current mask algorithm: Some rule | |
287 | * maskings are currently not detected. For example, | |
288 | * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO | |
289 | */ | |
290 | ||
291 | int32_t len = pattern.length(); | |
292 | int32_t left = anteContextLength; | |
293 | int32_t left2 = r2.anteContextLength; | |
294 | int32_t right = len - left; | |
295 | int32_t right2 = r2.pattern.length() - left2; | |
296 | int32_t cachedCompare = r2.pattern.compare(left2 - left, len, pattern); | |
297 | ||
298 | // TODO Clean this up -- some logic might be combinable with the | |
299 | // next statement. | |
300 | ||
301 | // Test for anchor masking | |
302 | if (left == left2 && right == right2 && | |
303 | keyLength <= r2.keyLength && | |
304 | 0 == cachedCompare) { | |
305 | // The following boolean logic implements the table above | |
306 | return (flags == r2.flags) || | |
307 | (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) || | |
308 | ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END)); | |
309 | } | |
310 | ||
311 | return left <= left2 && | |
312 | (right < right2 || | |
313 | (right == right2 && keyLength <= r2.keyLength)) && | |
314 | (0 == cachedCompare); | |
315 | } | |
316 | ||
317 | static inline int32_t posBefore(const Replaceable& str, int32_t pos) { | |
318 | return (pos > 0) ? | |
319 | pos - U16_LENGTH(str.char32At(pos-1)) : | |
320 | pos - 1; | |
321 | } | |
322 | ||
323 | static inline int32_t posAfter(const Replaceable& str, int32_t pos) { | |
324 | return (pos >= 0 && pos < str.length()) ? | |
325 | pos + U16_LENGTH(str.char32At(pos)) : | |
326 | pos + 1; | |
327 | } | |
328 | ||
329 | /** | |
330 | * Attempt a match and replacement at the given position. Return | |
331 | * the degree of match between this rule and the given text. The | |
332 | * degree of match may be mismatch, a partial match, or a full | |
333 | * match. A mismatch means at least one character of the text | |
334 | * does not match the context or key. A partial match means some | |
335 | * context and key characters match, but the text is not long | |
336 | * enough to match all of them. A full match means all context | |
337 | * and key characters match. | |
338 | * | |
339 | * If a full match is obtained, perform a replacement, update pos, | |
340 | * and return U_MATCH. Otherwise both text and pos are unchanged. | |
341 | * | |
342 | * @param text the text | |
343 | * @param pos the position indices | |
344 | * @param incremental if TRUE, test for partial matches that may | |
345 | * be completed by additional text inserted at pos.limit. | |
346 | * @return one of <code>U_MISMATCH</code>, | |
347 | * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If | |
348 | * incremental is FALSE then U_PARTIAL_MATCH will not be returned. | |
349 | */ | |
350 | UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, | |
351 | UTransPosition& pos, | |
352 | UBool incremental) const { | |
353 | // Matching and replacing are done in one method because the | |
354 | // replacement operation needs information obtained during the | |
355 | // match. Another way to do this is to have the match method | |
356 | // create a match result struct with relevant offsets, and to pass | |
357 | // this into the replace method. | |
358 | ||
359 | // ============================ MATCH =========================== | |
360 | ||
361 | // Reset segment match data | |
362 | if (segments != NULL) { | |
363 | for (int32_t i=0; i<segmentsCount; ++i) { | |
364 | ((StringMatcher*) segments[i])->resetMatch(); | |
365 | } | |
366 | } | |
367 | ||
368 | // int32_t lenDelta, keyLimit; | |
369 | int32_t keyLimit; | |
370 | ||
371 | // ------------------------ Ante Context ------------------------ | |
372 | ||
373 | // A mismatch in the ante context, or with the start anchor, | |
374 | // is an outright U_MISMATCH regardless of whether we are | |
375 | // incremental or not. | |
376 | int32_t oText; // offset into 'text' | |
377 | // int32_t newStart = 0; | |
378 | int32_t minOText; | |
379 | ||
380 | // Note (1): We process text in 16-bit code units, rather than | |
381 | // 32-bit code points. This works because stand-ins are | |
382 | // always in the BMP and because we are doing a literal match | |
383 | // operation, which can be done 16-bits at a time. | |
384 | ||
385 | int32_t anteLimit = posBefore(text, pos.contextStart); | |
386 | ||
387 | UMatchDegree match; | |
388 | ||
389 | // Start reverse match at char before pos.start | |
390 | oText = posBefore(text, pos.start); | |
391 | ||
392 | if (anteContext != NULL) { | |
393 | match = anteContext->matches(text, oText, anteLimit, FALSE); | |
394 | if (match != U_MATCH) { | |
395 | return U_MISMATCH; | |
396 | } | |
397 | } | |
398 | ||
399 | minOText = posAfter(text, oText); | |
400 | ||
401 | // ------------------------ Start Anchor ------------------------ | |
402 | ||
403 | if (((flags & ANCHOR_START) != 0) && oText != anteLimit) { | |
404 | return U_MISMATCH; | |
405 | } | |
406 | ||
407 | // -------------------- Key and Post Context -------------------- | |
408 | ||
409 | oText = pos.start; | |
410 | ||
411 | if (key != NULL) { | |
412 | match = key->matches(text, oText, pos.limit, incremental); | |
413 | if (match != U_MATCH) { | |
414 | return match; | |
415 | } | |
416 | } | |
417 | ||
418 | keyLimit = oText; | |
419 | ||
420 | if (postContext != NULL) { | |
421 | if (incremental && keyLimit == pos.limit) { | |
422 | // The key matches just before pos.limit, and there is | |
423 | // a postContext. Since we are in incremental mode, | |
424 | // we must assume more characters may be inserted at | |
425 | // pos.limit -- this is a partial match. | |
426 | return U_PARTIAL_MATCH; | |
427 | } | |
428 | ||
429 | match = postContext->matches(text, oText, pos.contextLimit, incremental); | |
430 | if (match != U_MATCH) { | |
431 | return match; | |
432 | } | |
433 | } | |
434 | ||
435 | // ------------------------- Stop Anchor ------------------------ | |
436 | ||
437 | if (((flags & ANCHOR_END)) != 0) { | |
438 | if (oText != pos.contextLimit) { | |
439 | return U_MISMATCH; | |
440 | } | |
441 | if (incremental) { | |
442 | return U_PARTIAL_MATCH; | |
443 | } | |
444 | } | |
445 | ||
446 | // =========================== REPLACE ========================== | |
447 | ||
448 | // We have a full match. The key is between pos.start and | |
449 | // keyLimit. | |
450 | ||
451 | int32_t newStart; | |
452 | int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart); | |
453 | int32_t lenDelta = newLength - (keyLimit - pos.start); | |
454 | ||
455 | oText += lenDelta; | |
456 | pos.limit += lenDelta; | |
457 | pos.contextLimit += lenDelta; | |
458 | // Restrict new value of start to [minOText, min(oText, pos.limit)]. | |
459 | pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart)); | |
460 | return U_MATCH; | |
461 | } | |
462 | ||
463 | /** | |
464 | * Create a source string that represents this rule. Append it to the | |
465 | * given string. | |
466 | */ | |
467 | UnicodeString& TransliterationRule::toRule(UnicodeString& rule, | |
468 | UBool escapeUnprintable) const { | |
469 | ||
470 | // Accumulate special characters (and non-specials following them) | |
471 | // into quoteBuf. Append quoteBuf, within single quotes, when | |
472 | // a non-quoted element must be inserted. | |
473 | UnicodeString str, quoteBuf; | |
474 | ||
475 | // Do not emit the braces '{' '}' around the pattern if there | |
476 | // is neither anteContext nor postContext. | |
477 | UBool emitBraces = | |
478 | (anteContext != NULL) || (postContext != NULL); | |
479 | ||
480 | // Emit start anchor | |
481 | if ((flags & ANCHOR_START) != 0) { | |
482 | rule.append((UChar)94/*^*/); | |
483 | } | |
484 | ||
485 | // Emit the input pattern | |
486 | ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf); | |
487 | ||
488 | if (emitBraces) { | |
489 | ICU_Utility::appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf); | |
490 | } | |
491 | ||
492 | ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf); | |
493 | ||
494 | if (emitBraces) { | |
495 | ICU_Utility::appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf); | |
496 | } | |
497 | ||
498 | ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf); | |
499 | ||
500 | // Emit end anchor | |
501 | if ((flags & ANCHOR_END) != 0) { | |
502 | rule.append((UChar)36/*$*/); | |
503 | } | |
504 | ||
505 | ICU_Utility::appendToRule(rule, UnicodeString(TRUE, FORWARD_OP, 3), TRUE, escapeUnprintable, quoteBuf); | |
506 | ||
507 | // Emit the output pattern | |
508 | ||
509 | ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable), | |
510 | TRUE, escapeUnprintable, quoteBuf); | |
511 | ||
512 | ICU_Utility::appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf); | |
513 | ||
514 | return rule; | |
515 | } | |
516 | ||
517 | void TransliterationRule::setData(const TransliterationRuleData* d) { | |
518 | data = d; | |
519 | if (anteContext != NULL) anteContext->setData(d); | |
520 | if (postContext != NULL) postContext->setData(d); | |
521 | if (key != NULL) key->setData(d); | |
522 | // assert(output != NULL); | |
523 | output->setData(d); | |
524 | // Don't have to do segments since they are in the context or key | |
525 | } | |
526 | ||
527 | /** | |
528 | * Union the set of all characters that may be modified by this rule | |
529 | * into the given set. | |
530 | */ | |
531 | void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const { | |
532 | int32_t limit = anteContextLength + keyLength; | |
533 | for (int32_t i=anteContextLength; i<limit; ) { | |
534 | UChar32 ch = pattern.char32At(i); | |
535 | i += U16_LENGTH(ch); | |
536 | const UnicodeMatcher* matcher = data->lookupMatcher(ch); | |
537 | if (matcher == NULL) { | |
538 | toUnionTo.add(ch); | |
539 | } else { | |
540 | matcher->addMatchSetTo(toUnionTo); | |
541 | } | |
542 | } | |
543 | } | |
544 | ||
545 | /** | |
546 | * Union the set of all characters that may be emitted by this rule | |
547 | * into the given set. | |
548 | */ | |
549 | void TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const { | |
550 | output->toReplacer()->addReplacementSetTo(toUnionTo); | |
551 | } | |
552 | ||
553 | U_NAMESPACE_END | |
554 | ||
555 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ | |
556 | ||
557 | //eof |