]>
Commit | Line | Data |
---|---|---|
b75a7d8f | 1 | /* |
374ca955 | 2 | ********************************************************************** |
46f4442e | 3 | * Copyright (C) 1999-2008, International Business Machines |
374ca955 A |
4 | * Corporation and others. All Rights Reserved. |
5 | ********************************************************************** | |
6 | * Date Name Description | |
7 | * 11/17/99 aliu Creation. | |
8 | ********************************************************************** | |
9 | */ | |
b75a7d8f A |
10 | |
11 | #include "unicode/utypes.h" | |
12 | ||
13 | #if !UCONFIG_NO_TRANSLITERATION | |
14 | ||
15 | #include "unicode/rep.h" | |
16 | #include "unicode/unifilt.h" | |
17 | #include "unicode/uniset.h" | |
18 | #include "rbt_rule.h" | |
19 | #include "rbt_data.h" | |
20 | #include "cmemory.h" | |
21 | #include "strmatch.h" | |
22 | #include "strrepl.h" | |
23 | #include "util.h" | |
374ca955 | 24 | #include "putilimp.h" |
b75a7d8f A |
25 | |
26 | static const UChar FORWARD_OP[] = {32,62,32,0}; // " > " | |
27 | ||
28 | U_NAMESPACE_BEGIN | |
29 | ||
30 | /** | |
31 | * Construct a new rule with the given input, output text, and other | |
32 | * attributes. A cursor position may be specified for the output text. | |
33 | * @param input input string, including key and optional ante and | |
34 | * post context | |
35 | * @param anteContextPos offset into input to end of ante context, or -1 if | |
36 | * none. Must be <= input.length() if not -1. | |
37 | * @param postContextPos offset into input to start of post context, or -1 | |
38 | * if none. Must be <= input.length() if not -1, and must be >= | |
39 | * anteContextPos. | |
40 | * @param output output string | |
41 | * @param cursorPosition offset into output at which cursor is located, or -1 if | |
42 | * none. If less than zero, then the cursor is placed after the | |
43 | * <code>output</code>; that is, -1 is equivalent to | |
44 | * <code>output.length()</code>. If greater than | |
45 | * <code>output.length()</code> then an exception is thrown. | |
46 | * @param segs array of UnicodeFunctors corresponding to input pattern | |
47 | * segments, or null if there are none. The array itself is adopted, | |
48 | * but the pointers within it are not. | |
49 | * @param segsCount number of elements in segs[] | |
50 | * @param anchorStart TRUE if the the rule is anchored on the left to | |
51 | * the context start | |
52 | * @param anchorEnd TRUE if the rule is anchored on the right to the | |
53 | * context limit | |
54 | */ | |
55 | TransliterationRule::TransliterationRule(const UnicodeString& input, | |
56 | int32_t anteContextPos, int32_t postContextPos, | |
57 | const UnicodeString& outputStr, | |
58 | int32_t cursorPosition, int32_t cursorOffset, | |
59 | UnicodeFunctor** segs, | |
60 | int32_t segsCount, | |
61 | UBool anchorStart, UBool anchorEnd, | |
62 | const TransliterationRuleData* theData, | |
63 | UErrorCode& status) : | |
64 | UMemory(), | |
65 | segments(0), | |
66 | data(theData) { | |
67 | ||
68 | if (U_FAILURE(status)) { | |
69 | return; | |
70 | } | |
71 | // Do range checks only when warranted to save time | |
72 | if (anteContextPos < 0) { | |
73 | anteContextLength = 0; | |
74 | } else { | |
75 | if (anteContextPos > input.length()) { | |
76 | // throw new IllegalArgumentException("Invalid ante context"); | |
77 | status = U_ILLEGAL_ARGUMENT_ERROR; | |
78 | return; | |
79 | } | |
80 | anteContextLength = anteContextPos; | |
81 | } | |
82 | if (postContextPos < 0) { | |
83 | keyLength = input.length() - anteContextLength; | |
84 | } else { | |
85 | if (postContextPos < anteContextLength || | |
86 | postContextPos > input.length()) { | |
87 | // throw new IllegalArgumentException("Invalid post context"); | |
88 | status = U_ILLEGAL_ARGUMENT_ERROR; | |
89 | return; | |
90 | } | |
91 | keyLength = postContextPos - anteContextLength; | |
92 | } | |
93 | if (cursorPosition < 0) { | |
94 | cursorPosition = outputStr.length(); | |
95 | } else if (cursorPosition > outputStr.length()) { | |
96 | // throw new IllegalArgumentException("Invalid cursor position"); | |
97 | status = U_ILLEGAL_ARGUMENT_ERROR; | |
98 | return; | |
99 | } | |
100 | // We don't validate the segments array. The caller must | |
101 | // guarantee that the segments are well-formed (that is, that | |
102 | // all $n references in the output refer to indices of this | |
103 | // array, and that no array elements are null). | |
104 | this->segments = segs; | |
105 | this->segmentsCount = segsCount; | |
106 | ||
107 | pattern = input; | |
108 | flags = 0; | |
109 | if (anchorStart) { | |
110 | flags |= ANCHOR_START; | |
111 | } | |
112 | if (anchorEnd) { | |
113 | flags |= ANCHOR_END; | |
114 | } | |
115 | ||
116 | anteContext = NULL; | |
117 | if (anteContextLength > 0) { | |
118 | anteContext = new StringMatcher(pattern, 0, anteContextLength, | |
119 | FALSE, *data); | |
120 | /* test for NULL */ | |
121 | if (anteContext == 0) { | |
122 | status = U_MEMORY_ALLOCATION_ERROR; | |
123 | return; | |
124 | } | |
125 | } | |
126 | ||
127 | key = NULL; | |
128 | if (keyLength > 0) { | |
129 | key = new StringMatcher(pattern, anteContextLength, anteContextLength + keyLength, | |
130 | FALSE, *data); | |
131 | /* test for NULL */ | |
132 | if (key == 0) { | |
133 | status = U_MEMORY_ALLOCATION_ERROR; | |
134 | return; | |
135 | } | |
136 | } | |
137 | ||
138 | int32_t postContextLength = pattern.length() - keyLength - anteContextLength; | |
139 | postContext = NULL; | |
140 | if (postContextLength > 0) { | |
141 | postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(), | |
142 | FALSE, *data); | |
143 | /* test for NULL */ | |
144 | if (postContext == 0) { | |
145 | status = U_MEMORY_ALLOCATION_ERROR; | |
146 | return; | |
147 | } | |
148 | } | |
149 | ||
150 | this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data); | |
151 | /* test for NULL */ | |
152 | if (this->output == 0) { | |
153 | status = U_MEMORY_ALLOCATION_ERROR; | |
154 | return; | |
155 | } | |
156 | } | |
157 | ||
158 | /** | |
159 | * Copy constructor. | |
160 | */ | |
161 | TransliterationRule::TransliterationRule(TransliterationRule& other) : | |
162 | UMemory(other), | |
163 | anteContext(NULL), | |
164 | key(NULL), | |
165 | postContext(NULL), | |
166 | pattern(other.pattern), | |
167 | anteContextLength(other.anteContextLength), | |
168 | keyLength(other.keyLength), | |
169 | flags(other.flags), | |
170 | data(other.data) { | |
171 | ||
172 | segments = NULL; | |
173 | segmentsCount = 0; | |
174 | if (other.segmentsCount > 0) { | |
175 | segments = (UnicodeFunctor **)uprv_malloc(other.segmentsCount * sizeof(UnicodeFunctor *)); | |
176 | uprv_memcpy(segments, other.segments, other.segmentsCount*sizeof(segments[0])); | |
177 | } | |
178 | ||
179 | if (other.anteContext != NULL) { | |
180 | anteContext = (StringMatcher*) other.anteContext->clone(); | |
181 | } | |
182 | if (other.key != NULL) { | |
183 | key = (StringMatcher*) other.key->clone(); | |
184 | } | |
185 | if (other.postContext != NULL) { | |
186 | postContext = (StringMatcher*) other.postContext->clone(); | |
187 | } | |
188 | output = other.output->clone(); | |
189 | } | |
190 | ||
191 | TransliterationRule::~TransliterationRule() { | |
192 | uprv_free(segments); | |
193 | delete anteContext; | |
194 | delete key; | |
195 | delete postContext; | |
196 | delete output; | |
197 | } | |
198 | ||
199 | /** | |
200 | * Return the preceding context length. This method is needed to | |
201 | * support the <code>Transliterator</code> method | |
202 | * <code>getMaximumContextLength()</code>. Internally, this is | |
203 | * implemented as the anteContextLength, optionally plus one if | |
204 | * there is a start anchor. The one character anchor gap is | |
205 | * needed to make repeated incremental transliteration with | |
206 | * anchors work. | |
207 | */ | |
208 | int32_t TransliterationRule::getContextLength(void) const { | |
209 | return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0); | |
210 | } | |
211 | ||
212 | /** | |
213 | * Internal method. Returns 8-bit index value for this rule. | |
214 | * This is the low byte of the first character of the key, | |
215 | * unless the first character of the key is a set. If it's a | |
216 | * set, or otherwise can match multiple keys, the index value is -1. | |
217 | */ | |
218 | int16_t TransliterationRule::getIndexValue() const { | |
219 | if (anteContextLength == pattern.length()) { | |
220 | // A pattern with just ante context {such as foo)>bar} can | |
221 | // match any key. | |
222 | return -1; | |
223 | } | |
224 | UChar32 c = pattern.char32At(anteContextLength); | |
225 | return (int16_t)(data->lookupMatcher(c) == NULL ? (c & 0xFF) : -1); | |
226 | } | |
227 | ||
228 | /** | |
229 | * Internal method. Returns true if this rule matches the given | |
230 | * index value. The index value is an 8-bit integer, 0..255, | |
231 | * representing the low byte of the first character of the key. | |
232 | * It matches this rule if it matches the first character of the | |
233 | * key, or if the first character of the key is a set, and the set | |
234 | * contains any character with a low byte equal to the index | |
235 | * value. If the rule contains only ante context, as in foo)>bar, | |
236 | * then it will match any key. | |
237 | */ | |
238 | UBool TransliterationRule::matchesIndexValue(uint8_t v) const { | |
239 | // Delegate to the key, or if there is none, to the postContext. | |
240 | // If there is neither then we match any key; return true. | |
241 | UnicodeMatcher *m = (key != NULL) ? key : postContext; | |
242 | return (m != NULL) ? m->matchesIndexValue(v) : TRUE; | |
243 | } | |
244 | ||
245 | /** | |
246 | * Return true if this rule masks another rule. If r1 masks r2 then | |
247 | * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks | |
248 | * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". | |
249 | * "[c]a>x" masks "[dc]a>y". | |
250 | */ | |
251 | UBool TransliterationRule::masks(const TransliterationRule& r2) const { | |
252 | /* Rule r1 masks rule r2 if the string formed of the | |
253 | * antecontext, key, and postcontext overlaps in the following | |
254 | * way: | |
255 | * | |
256 | * r1: aakkkpppp | |
257 | * r2: aaakkkkkpppp | |
258 | * ^ | |
259 | * | |
260 | * The strings must be aligned at the first character of the | |
261 | * key. The length of r1 to the left of the alignment point | |
262 | * must be <= the length of r2 to the left; ditto for the | |
263 | * right. The characters of r1 must equal (or be a superset | |
264 | * of) the corresponding characters of r2. The superset | |
265 | * operation should be performed to check for UnicodeSet | |
266 | * masking. | |
267 | * | |
268 | * Anchors: Two patterns that differ only in anchors only | |
269 | * mask one another if they are exactly equal, and r2 has | |
270 | * all the anchors r1 has (optionally, plus some). Here Y | |
271 | * means the row masks the column, N means it doesn't. | |
272 | * | |
273 | * ab ^ab ab$ ^ab$ | |
274 | * ab Y Y Y Y | |
275 | * ^ab N Y N Y | |
276 | * ab$ N N Y Y | |
277 | * ^ab$ N N N Y | |
278 | * | |
279 | * Post context: {a}b masks ab, but not vice versa, since {a}b | |
280 | * matches everything ab matches, and {a}b matches {|a|}b but ab | |
281 | * does not. Pre context is different (a{b} does not align with | |
282 | * ab). | |
283 | */ | |
284 | ||
285 | /* LIMITATION of the current mask algorithm: Some rule | |
286 | * maskings are currently not detected. For example, | |
287 | * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO | |
288 | */ | |
289 | ||
290 | int32_t len = pattern.length(); | |
291 | int32_t left = anteContextLength; | |
292 | int32_t left2 = r2.anteContextLength; | |
293 | int32_t right = len - left; | |
294 | int32_t right2 = r2.pattern.length() - left2; | |
46f4442e | 295 | int32_t cachedCompare = r2.pattern.compare(left2 - left, len, pattern); |
b75a7d8f A |
296 | |
297 | // TODO Clean this up -- some logic might be combinable with the | |
298 | // next statement. | |
299 | ||
300 | // Test for anchor masking | |
301 | if (left == left2 && right == right2 && | |
302 | keyLength <= r2.keyLength && | |
46f4442e | 303 | 0 == cachedCompare) { |
b75a7d8f A |
304 | // The following boolean logic implements the table above |
305 | return (flags == r2.flags) || | |
306 | (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) || | |
307 | ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END)); | |
308 | } | |
309 | ||
310 | return left <= left2 && | |
311 | (right < right2 || | |
312 | (right == right2 && keyLength <= r2.keyLength)) && | |
46f4442e | 313 | (0 == cachedCompare); |
b75a7d8f A |
314 | } |
315 | ||
316 | static inline int32_t posBefore(const Replaceable& str, int32_t pos) { | |
317 | return (pos > 0) ? | |
318 | pos - UTF_CHAR_LENGTH(str.char32At(pos-1)) : | |
319 | pos - 1; | |
320 | } | |
321 | ||
322 | static inline int32_t posAfter(const Replaceable& str, int32_t pos) { | |
323 | return (pos >= 0 && pos < str.length()) ? | |
324 | pos + UTF_CHAR_LENGTH(str.char32At(pos)) : | |
325 | pos + 1; | |
326 | } | |
327 | ||
328 | /** | |
329 | * Attempt a match and replacement at the given position. Return | |
330 | * the degree of match between this rule and the given text. The | |
331 | * degree of match may be mismatch, a partial match, or a full | |
332 | * match. A mismatch means at least one character of the text | |
333 | * does not match the context or key. A partial match means some | |
334 | * context and key characters match, but the text is not long | |
335 | * enough to match all of them. A full match means all context | |
336 | * and key characters match. | |
337 | * | |
338 | * If a full match is obtained, perform a replacement, update pos, | |
339 | * and return U_MATCH. Otherwise both text and pos are unchanged. | |
340 | * | |
341 | * @param text the text | |
342 | * @param pos the position indices | |
343 | * @param incremental if TRUE, test for partial matches that may | |
344 | * be completed by additional text inserted at pos.limit. | |
345 | * @return one of <code>U_MISMATCH</code>, | |
346 | * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If | |
347 | * incremental is FALSE then U_PARTIAL_MATCH will not be returned. | |
348 | */ | |
349 | UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, | |
350 | UTransPosition& pos, | |
351 | UBool incremental) const { | |
352 | // Matching and replacing are done in one method because the | |
353 | // replacement operation needs information obtained during the | |
354 | // match. Another way to do this is to have the match method | |
355 | // create a match result struct with relevant offsets, and to pass | |
356 | // this into the replace method. | |
357 | ||
358 | // ============================ MATCH =========================== | |
359 | ||
360 | // Reset segment match data | |
361 | if (segments != NULL) { | |
362 | for (int32_t i=0; i<segmentsCount; ++i) { | |
363 | ((StringMatcher*) segments[i])->resetMatch(); | |
364 | } | |
365 | } | |
366 | ||
367 | // int32_t lenDelta, keyLimit; | |
368 | int32_t keyLimit; | |
369 | ||
370 | // ------------------------ Ante Context ------------------------ | |
371 | ||
372 | // A mismatch in the ante context, or with the start anchor, | |
373 | // is an outright U_MISMATCH regardless of whether we are | |
374 | // incremental or not. | |
375 | int32_t oText; // offset into 'text' | |
376 | // int32_t newStart = 0; | |
377 | int32_t minOText; | |
378 | ||
379 | // Note (1): We process text in 16-bit code units, rather than | |
380 | // 32-bit code points. This works because stand-ins are | |
381 | // always in the BMP and because we are doing a literal match | |
382 | // operation, which can be done 16-bits at a time. | |
383 | ||
384 | int32_t anteLimit = posBefore(text, pos.contextStart); | |
385 | ||
386 | UMatchDegree match; | |
387 | ||
388 | // Start reverse match at char before pos.start | |
389 | oText = posBefore(text, pos.start); | |
390 | ||
391 | if (anteContext != NULL) { | |
392 | match = anteContext->matches(text, oText, anteLimit, FALSE); | |
393 | if (match != U_MATCH) { | |
394 | return U_MISMATCH; | |
395 | } | |
396 | } | |
397 | ||
398 | minOText = posAfter(text, oText); | |
399 | ||
400 | // ------------------------ Start Anchor ------------------------ | |
401 | ||
402 | if (((flags & ANCHOR_START) != 0) && oText != anteLimit) { | |
403 | return U_MISMATCH; | |
404 | } | |
405 | ||
406 | // -------------------- Key and Post Context -------------------- | |
407 | ||
408 | oText = pos.start; | |
409 | ||
410 | if (key != NULL) { | |
411 | match = key->matches(text, oText, pos.limit, incremental); | |
412 | if (match != U_MATCH) { | |
413 | return match; | |
414 | } | |
415 | } | |
416 | ||
417 | keyLimit = oText; | |
418 | ||
419 | if (postContext != NULL) { | |
420 | if (incremental && keyLimit == pos.limit) { | |
421 | // The key matches just before pos.limit, and there is | |
422 | // a postContext. Since we are in incremental mode, | |
423 | // we must assume more characters may be inserted at | |
424 | // pos.limit -- this is a partial match. | |
425 | return U_PARTIAL_MATCH; | |
426 | } | |
427 | ||
428 | match = postContext->matches(text, oText, pos.contextLimit, incremental); | |
429 | if (match != U_MATCH) { | |
430 | return match; | |
431 | } | |
432 | } | |
433 | ||
434 | // ------------------------- Stop Anchor ------------------------ | |
435 | ||
436 | if (((flags & ANCHOR_END)) != 0) { | |
437 | if (oText != pos.contextLimit) { | |
438 | return U_MISMATCH; | |
439 | } | |
440 | if (incremental) { | |
441 | return U_PARTIAL_MATCH; | |
442 | } | |
443 | } | |
444 | ||
445 | // =========================== REPLACE ========================== | |
446 | ||
447 | // We have a full match. The key is between pos.start and | |
448 | // keyLimit. | |
449 | ||
450 | int32_t newStart; | |
451 | int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart); | |
452 | int32_t lenDelta = newLength - (keyLimit - pos.start); | |
453 | ||
454 | oText += lenDelta; | |
455 | pos.limit += lenDelta; | |
456 | pos.contextLimit += lenDelta; | |
457 | // Restrict new value of start to [minOText, min(oText, pos.limit)]. | |
458 | pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart)); | |
459 | return U_MATCH; | |
460 | } | |
461 | ||
462 | /** | |
463 | * Create a source string that represents this rule. Append it to the | |
464 | * given string. | |
465 | */ | |
466 | UnicodeString& TransliterationRule::toRule(UnicodeString& rule, | |
467 | UBool escapeUnprintable) const { | |
468 | ||
469 | // Accumulate special characters (and non-specials following them) | |
470 | // into quoteBuf. Append quoteBuf, within single quotes, when | |
471 | // a non-quoted element must be inserted. | |
472 | UnicodeString str, quoteBuf; | |
473 | ||
474 | // Do not emit the braces '{' '}' around the pattern if there | |
475 | // is neither anteContext nor postContext. | |
476 | UBool emitBraces = | |
477 | (anteContext != NULL) || (postContext != NULL); | |
478 | ||
479 | // Emit start anchor | |
480 | if ((flags & ANCHOR_START) != 0) { | |
481 | rule.append((UChar)94/*^*/); | |
482 | } | |
483 | ||
484 | // Emit the input pattern | |
485 | ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf); | |
486 | ||
487 | if (emitBraces) { | |
488 | ICU_Utility::appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf); | |
489 | } | |
490 | ||
491 | ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf); | |
492 | ||
493 | if (emitBraces) { | |
494 | ICU_Utility::appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf); | |
495 | } | |
496 | ||
497 | ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf); | |
498 | ||
499 | // Emit end anchor | |
500 | if ((flags & ANCHOR_END) != 0) { | |
501 | rule.append((UChar)36/*$*/); | |
502 | } | |
503 | ||
504 | ICU_Utility::appendToRule(rule, FORWARD_OP, TRUE, escapeUnprintable, quoteBuf); | |
505 | ||
506 | // Emit the output pattern | |
507 | ||
508 | ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable), | |
509 | TRUE, escapeUnprintable, quoteBuf); | |
510 | ||
511 | ICU_Utility::appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf); | |
512 | ||
513 | return rule; | |
514 | } | |
515 | ||
516 | void TransliterationRule::setData(const TransliterationRuleData* d) { | |
517 | data = d; | |
518 | if (anteContext != NULL) anteContext->setData(d); | |
519 | if (postContext != NULL) postContext->setData(d); | |
520 | if (key != NULL) key->setData(d); | |
521 | // assert(output != NULL); | |
522 | output->setData(d); | |
523 | // Don't have to do segments since they are in the context or key | |
524 | } | |
525 | ||
526 | /** | |
527 | * Union the set of all characters that may be modified by this rule | |
528 | * into the given set. | |
529 | */ | |
530 | void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const { | |
531 | int32_t limit = anteContextLength + keyLength; | |
532 | for (int32_t i=anteContextLength; i<limit; ) { | |
46f4442e A |
533 | UChar32 ch = pattern.char32At(i); |
534 | i += UTF_CHAR_LENGTH(ch); | |
535 | const UnicodeMatcher* matcher = data->lookupMatcher(ch); | |
536 | if (matcher == NULL) { | |
537 | toUnionTo.add(ch); | |
538 | } else { | |
539 | matcher->addMatchSetTo(toUnionTo); | |
540 | } | |
b75a7d8f A |
541 | } |
542 | } | |
543 | ||
544 | /** | |
545 | * Union the set of all characters that may be emitted by this rule | |
546 | * into the given set. | |
547 | */ | |
548 | void TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const { | |
549 | output->toReplacer()->addReplacementSetTo(toUnionTo); | |
550 | } | |
551 | ||
552 | U_NAMESPACE_END | |
553 | ||
554 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ | |
555 | ||
556 | //eof |