]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f | 3 | /* |
374ca955 | 4 | ********************************************************************** |
4388f060 | 5 | * Copyright (C) 1999-2011, International Business Machines |
374ca955 A |
6 | * Corporation and others. All Rights Reserved. |
7 | ********************************************************************** | |
8 | * Date Name Description | |
9 | * 11/17/99 aliu Creation. | |
10 | ********************************************************************** | |
11 | */ | |
b75a7d8f A |
12 | |
13 | #include "unicode/utypes.h" | |
14 | ||
15 | #if !UCONFIG_NO_TRANSLITERATION | |
16 | ||
17 | #include "unicode/rep.h" | |
18 | #include "unicode/unifilt.h" | |
19 | #include "unicode/uniset.h" | |
4388f060 | 20 | #include "unicode/utf16.h" |
b75a7d8f A |
21 | #include "rbt_rule.h" |
22 | #include "rbt_data.h" | |
23 | #include "cmemory.h" | |
24 | #include "strmatch.h" | |
25 | #include "strrepl.h" | |
26 | #include "util.h" | |
374ca955 | 27 | #include "putilimp.h" |
b75a7d8f A |
28 | |
29 | static const UChar FORWARD_OP[] = {32,62,32,0}; // " > " | |
30 | ||
31 | U_NAMESPACE_BEGIN | |
32 | ||
33 | /** | |
34 | * Construct a new rule with the given input, output text, and other | |
35 | * attributes. A cursor position may be specified for the output text. | |
36 | * @param input input string, including key and optional ante and | |
37 | * post context | |
38 | * @param anteContextPos offset into input to end of ante context, or -1 if | |
39 | * none. Must be <= input.length() if not -1. | |
40 | * @param postContextPos offset into input to start of post context, or -1 | |
41 | * if none. Must be <= input.length() if not -1, and must be >= | |
42 | * anteContextPos. | |
43 | * @param output output string | |
44 | * @param cursorPosition offset into output at which cursor is located, or -1 if | |
45 | * none. If less than zero, then the cursor is placed after the | |
46 | * <code>output</code>; that is, -1 is equivalent to | |
47 | * <code>output.length()</code>. If greater than | |
48 | * <code>output.length()</code> then an exception is thrown. | |
49 | * @param segs array of UnicodeFunctors corresponding to input pattern | |
50 | * segments, or null if there are none. The array itself is adopted, | |
51 | * but the pointers within it are not. | |
52 | * @param segsCount number of elements in segs[] | |
53 | * @param anchorStart TRUE if the the rule is anchored on the left to | |
54 | * the context start | |
55 | * @param anchorEnd TRUE if the rule is anchored on the right to the | |
56 | * context limit | |
57 | */ | |
58 | TransliterationRule::TransliterationRule(const UnicodeString& input, | |
59 | int32_t anteContextPos, int32_t postContextPos, | |
60 | const UnicodeString& outputStr, | |
61 | int32_t cursorPosition, int32_t cursorOffset, | |
62 | UnicodeFunctor** segs, | |
63 | int32_t segsCount, | |
64 | UBool anchorStart, UBool anchorEnd, | |
65 | const TransliterationRuleData* theData, | |
66 | UErrorCode& status) : | |
67 | UMemory(), | |
68 | segments(0), | |
69 | data(theData) { | |
70 | ||
71 | if (U_FAILURE(status)) { | |
72 | return; | |
73 | } | |
74 | // Do range checks only when warranted to save time | |
75 | if (anteContextPos < 0) { | |
76 | anteContextLength = 0; | |
77 | } else { | |
78 | if (anteContextPos > input.length()) { | |
79 | // throw new IllegalArgumentException("Invalid ante context"); | |
80 | status = U_ILLEGAL_ARGUMENT_ERROR; | |
81 | return; | |
82 | } | |
83 | anteContextLength = anteContextPos; | |
84 | } | |
85 | if (postContextPos < 0) { | |
86 | keyLength = input.length() - anteContextLength; | |
87 | } else { | |
88 | if (postContextPos < anteContextLength || | |
89 | postContextPos > input.length()) { | |
90 | // throw new IllegalArgumentException("Invalid post context"); | |
91 | status = U_ILLEGAL_ARGUMENT_ERROR; | |
92 | return; | |
93 | } | |
94 | keyLength = postContextPos - anteContextLength; | |
95 | } | |
96 | if (cursorPosition < 0) { | |
97 | cursorPosition = outputStr.length(); | |
98 | } else if (cursorPosition > outputStr.length()) { | |
99 | // throw new IllegalArgumentException("Invalid cursor position"); | |
100 | status = U_ILLEGAL_ARGUMENT_ERROR; | |
101 | return; | |
102 | } | |
103 | // We don't validate the segments array. The caller must | |
104 | // guarantee that the segments are well-formed (that is, that | |
105 | // all $n references in the output refer to indices of this | |
106 | // array, and that no array elements are null). | |
107 | this->segments = segs; | |
108 | this->segmentsCount = segsCount; | |
109 | ||
110 | pattern = input; | |
111 | flags = 0; | |
112 | if (anchorStart) { | |
113 | flags |= ANCHOR_START; | |
114 | } | |
115 | if (anchorEnd) { | |
116 | flags |= ANCHOR_END; | |
117 | } | |
118 | ||
119 | anteContext = NULL; | |
120 | if (anteContextLength > 0) { | |
121 | anteContext = new StringMatcher(pattern, 0, anteContextLength, | |
122 | FALSE, *data); | |
123 | /* test for NULL */ | |
124 | if (anteContext == 0) { | |
125 | status = U_MEMORY_ALLOCATION_ERROR; | |
126 | return; | |
127 | } | |
128 | } | |
129 | ||
130 | key = NULL; | |
131 | if (keyLength > 0) { | |
132 | key = new StringMatcher(pattern, anteContextLength, anteContextLength + keyLength, | |
133 | FALSE, *data); | |
134 | /* test for NULL */ | |
135 | if (key == 0) { | |
136 | status = U_MEMORY_ALLOCATION_ERROR; | |
137 | return; | |
138 | } | |
139 | } | |
140 | ||
141 | int32_t postContextLength = pattern.length() - keyLength - anteContextLength; | |
142 | postContext = NULL; | |
143 | if (postContextLength > 0) { | |
144 | postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(), | |
145 | FALSE, *data); | |
146 | /* test for NULL */ | |
147 | if (postContext == 0) { | |
148 | status = U_MEMORY_ALLOCATION_ERROR; | |
149 | return; | |
150 | } | |
151 | } | |
152 | ||
153 | this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data); | |
154 | /* test for NULL */ | |
155 | if (this->output == 0) { | |
156 | status = U_MEMORY_ALLOCATION_ERROR; | |
157 | return; | |
158 | } | |
159 | } | |
160 | ||
161 | /** | |
162 | * Copy constructor. | |
163 | */ | |
164 | TransliterationRule::TransliterationRule(TransliterationRule& other) : | |
165 | UMemory(other), | |
166 | anteContext(NULL), | |
167 | key(NULL), | |
168 | postContext(NULL), | |
169 | pattern(other.pattern), | |
170 | anteContextLength(other.anteContextLength), | |
171 | keyLength(other.keyLength), | |
172 | flags(other.flags), | |
173 | data(other.data) { | |
174 | ||
175 | segments = NULL; | |
176 | segmentsCount = 0; | |
177 | if (other.segmentsCount > 0) { | |
178 | segments = (UnicodeFunctor **)uprv_malloc(other.segmentsCount * sizeof(UnicodeFunctor *)); | |
a62d09fc | 179 | uprv_memcpy(segments, other.segments, (size_t)other.segmentsCount*sizeof(segments[0])); |
b75a7d8f A |
180 | } |
181 | ||
182 | if (other.anteContext != NULL) { | |
183 | anteContext = (StringMatcher*) other.anteContext->clone(); | |
184 | } | |
185 | if (other.key != NULL) { | |
186 | key = (StringMatcher*) other.key->clone(); | |
187 | } | |
188 | if (other.postContext != NULL) { | |
189 | postContext = (StringMatcher*) other.postContext->clone(); | |
190 | } | |
191 | output = other.output->clone(); | |
192 | } | |
193 | ||
194 | TransliterationRule::~TransliterationRule() { | |
195 | uprv_free(segments); | |
196 | delete anteContext; | |
197 | delete key; | |
198 | delete postContext; | |
199 | delete output; | |
200 | } | |
201 | ||
202 | /** | |
203 | * Return the preceding context length. This method is needed to | |
204 | * support the <code>Transliterator</code> method | |
205 | * <code>getMaximumContextLength()</code>. Internally, this is | |
206 | * implemented as the anteContextLength, optionally plus one if | |
207 | * there is a start anchor. The one character anchor gap is | |
208 | * needed to make repeated incremental transliteration with | |
209 | * anchors work. | |
210 | */ | |
211 | int32_t TransliterationRule::getContextLength(void) const { | |
212 | return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0); | |
213 | } | |
214 | ||
215 | /** | |
216 | * Internal method. Returns 8-bit index value for this rule. | |
217 | * This is the low byte of the first character of the key, | |
218 | * unless the first character of the key is a set. If it's a | |
219 | * set, or otherwise can match multiple keys, the index value is -1. | |
220 | */ | |
221 | int16_t TransliterationRule::getIndexValue() const { | |
222 | if (anteContextLength == pattern.length()) { | |
223 | // A pattern with just ante context {such as foo)>bar} can | |
224 | // match any key. | |
225 | return -1; | |
226 | } | |
227 | UChar32 c = pattern.char32At(anteContextLength); | |
228 | return (int16_t)(data->lookupMatcher(c) == NULL ? (c & 0xFF) : -1); | |
229 | } | |
230 | ||
231 | /** | |
232 | * Internal method. Returns true if this rule matches the given | |
233 | * index value. The index value is an 8-bit integer, 0..255, | |
234 | * representing the low byte of the first character of the key. | |
235 | * It matches this rule if it matches the first character of the | |
236 | * key, or if the first character of the key is a set, and the set | |
237 | * contains any character with a low byte equal to the index | |
238 | * value. If the rule contains only ante context, as in foo)>bar, | |
239 | * then it will match any key. | |
240 | */ | |
241 | UBool TransliterationRule::matchesIndexValue(uint8_t v) const { | |
242 | // Delegate to the key, or if there is none, to the postContext. | |
243 | // If there is neither then we match any key; return true. | |
244 | UnicodeMatcher *m = (key != NULL) ? key : postContext; | |
245 | return (m != NULL) ? m->matchesIndexValue(v) : TRUE; | |
246 | } | |
247 | ||
248 | /** | |
249 | * Return true if this rule masks another rule. If r1 masks r2 then | |
250 | * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks | |
251 | * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". | |
252 | * "[c]a>x" masks "[dc]a>y". | |
253 | */ | |
254 | UBool TransliterationRule::masks(const TransliterationRule& r2) const { | |
255 | /* Rule r1 masks rule r2 if the string formed of the | |
256 | * antecontext, key, and postcontext overlaps in the following | |
257 | * way: | |
258 | * | |
259 | * r1: aakkkpppp | |
260 | * r2: aaakkkkkpppp | |
261 | * ^ | |
262 | * | |
263 | * The strings must be aligned at the first character of the | |
264 | * key. The length of r1 to the left of the alignment point | |
265 | * must be <= the length of r2 to the left; ditto for the | |
266 | * right. The characters of r1 must equal (or be a superset | |
267 | * of) the corresponding characters of r2. The superset | |
268 | * operation should be performed to check for UnicodeSet | |
269 | * masking. | |
270 | * | |
271 | * Anchors: Two patterns that differ only in anchors only | |
272 | * mask one another if they are exactly equal, and r2 has | |
273 | * all the anchors r1 has (optionally, plus some). Here Y | |
274 | * means the row masks the column, N means it doesn't. | |
275 | * | |
276 | * ab ^ab ab$ ^ab$ | |
277 | * ab Y Y Y Y | |
278 | * ^ab N Y N Y | |
279 | * ab$ N N Y Y | |
280 | * ^ab$ N N N Y | |
281 | * | |
282 | * Post context: {a}b masks ab, but not vice versa, since {a}b | |
283 | * matches everything ab matches, and {a}b matches {|a|}b but ab | |
284 | * does not. Pre context is different (a{b} does not align with | |
285 | * ab). | |
286 | */ | |
287 | ||
288 | /* LIMITATION of the current mask algorithm: Some rule | |
289 | * maskings are currently not detected. For example, | |
290 | * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO | |
291 | */ | |
292 | ||
293 | int32_t len = pattern.length(); | |
294 | int32_t left = anteContextLength; | |
295 | int32_t left2 = r2.anteContextLength; | |
296 | int32_t right = len - left; | |
297 | int32_t right2 = r2.pattern.length() - left2; | |
46f4442e | 298 | int32_t cachedCompare = r2.pattern.compare(left2 - left, len, pattern); |
b75a7d8f A |
299 | |
300 | // TODO Clean this up -- some logic might be combinable with the | |
301 | // next statement. | |
302 | ||
303 | // Test for anchor masking | |
304 | if (left == left2 && right == right2 && | |
305 | keyLength <= r2.keyLength && | |
46f4442e | 306 | 0 == cachedCompare) { |
b75a7d8f A |
307 | // The following boolean logic implements the table above |
308 | return (flags == r2.flags) || | |
309 | (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) || | |
310 | ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END)); | |
311 | } | |
312 | ||
313 | return left <= left2 && | |
314 | (right < right2 || | |
315 | (right == right2 && keyLength <= r2.keyLength)) && | |
46f4442e | 316 | (0 == cachedCompare); |
b75a7d8f A |
317 | } |
318 | ||
319 | static inline int32_t posBefore(const Replaceable& str, int32_t pos) { | |
320 | return (pos > 0) ? | |
4388f060 | 321 | pos - U16_LENGTH(str.char32At(pos-1)) : |
b75a7d8f A |
322 | pos - 1; |
323 | } | |
324 | ||
325 | static inline int32_t posAfter(const Replaceable& str, int32_t pos) { | |
326 | return (pos >= 0 && pos < str.length()) ? | |
4388f060 | 327 | pos + U16_LENGTH(str.char32At(pos)) : |
b75a7d8f A |
328 | pos + 1; |
329 | } | |
330 | ||
331 | /** | |
332 | * Attempt a match and replacement at the given position. Return | |
333 | * the degree of match between this rule and the given text. The | |
334 | * degree of match may be mismatch, a partial match, or a full | |
335 | * match. A mismatch means at least one character of the text | |
336 | * does not match the context or key. A partial match means some | |
337 | * context and key characters match, but the text is not long | |
338 | * enough to match all of them. A full match means all context | |
339 | * and key characters match. | |
340 | * | |
341 | * If a full match is obtained, perform a replacement, update pos, | |
342 | * and return U_MATCH. Otherwise both text and pos are unchanged. | |
343 | * | |
344 | * @param text the text | |
345 | * @param pos the position indices | |
346 | * @param incremental if TRUE, test for partial matches that may | |
347 | * be completed by additional text inserted at pos.limit. | |
348 | * @return one of <code>U_MISMATCH</code>, | |
349 | * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If | |
350 | * incremental is FALSE then U_PARTIAL_MATCH will not be returned. | |
351 | */ | |
352 | UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, | |
353 | UTransPosition& pos, | |
354 | UBool incremental) const { | |
355 | // Matching and replacing are done in one method because the | |
356 | // replacement operation needs information obtained during the | |
357 | // match. Another way to do this is to have the match method | |
358 | // create a match result struct with relevant offsets, and to pass | |
359 | // this into the replace method. | |
360 | ||
361 | // ============================ MATCH =========================== | |
362 | ||
363 | // Reset segment match data | |
364 | if (segments != NULL) { | |
365 | for (int32_t i=0; i<segmentsCount; ++i) { | |
366 | ((StringMatcher*) segments[i])->resetMatch(); | |
367 | } | |
368 | } | |
369 | ||
370 | // int32_t lenDelta, keyLimit; | |
371 | int32_t keyLimit; | |
372 | ||
373 | // ------------------------ Ante Context ------------------------ | |
374 | ||
375 | // A mismatch in the ante context, or with the start anchor, | |
376 | // is an outright U_MISMATCH regardless of whether we are | |
377 | // incremental or not. | |
378 | int32_t oText; // offset into 'text' | |
379 | // int32_t newStart = 0; | |
380 | int32_t minOText; | |
381 | ||
382 | // Note (1): We process text in 16-bit code units, rather than | |
383 | // 32-bit code points. This works because stand-ins are | |
384 | // always in the BMP and because we are doing a literal match | |
385 | // operation, which can be done 16-bits at a time. | |
386 | ||
387 | int32_t anteLimit = posBefore(text, pos.contextStart); | |
388 | ||
389 | UMatchDegree match; | |
390 | ||
391 | // Start reverse match at char before pos.start | |
392 | oText = posBefore(text, pos.start); | |
393 | ||
394 | if (anteContext != NULL) { | |
395 | match = anteContext->matches(text, oText, anteLimit, FALSE); | |
396 | if (match != U_MATCH) { | |
397 | return U_MISMATCH; | |
398 | } | |
399 | } | |
400 | ||
401 | minOText = posAfter(text, oText); | |
402 | ||
403 | // ------------------------ Start Anchor ------------------------ | |
404 | ||
405 | if (((flags & ANCHOR_START) != 0) && oText != anteLimit) { | |
406 | return U_MISMATCH; | |
407 | } | |
408 | ||
409 | // -------------------- Key and Post Context -------------------- | |
410 | ||
411 | oText = pos.start; | |
412 | ||
413 | if (key != NULL) { | |
414 | match = key->matches(text, oText, pos.limit, incremental); | |
415 | if (match != U_MATCH) { | |
416 | return match; | |
417 | } | |
418 | } | |
419 | ||
420 | keyLimit = oText; | |
421 | ||
422 | if (postContext != NULL) { | |
423 | if (incremental && keyLimit == pos.limit) { | |
424 | // The key matches just before pos.limit, and there is | |
425 | // a postContext. Since we are in incremental mode, | |
426 | // we must assume more characters may be inserted at | |
427 | // pos.limit -- this is a partial match. | |
428 | return U_PARTIAL_MATCH; | |
429 | } | |
430 | ||
431 | match = postContext->matches(text, oText, pos.contextLimit, incremental); | |
432 | if (match != U_MATCH) { | |
433 | return match; | |
434 | } | |
435 | } | |
436 | ||
437 | // ------------------------- Stop Anchor ------------------------ | |
438 | ||
439 | if (((flags & ANCHOR_END)) != 0) { | |
440 | if (oText != pos.contextLimit) { | |
441 | return U_MISMATCH; | |
442 | } | |
443 | if (incremental) { | |
444 | return U_PARTIAL_MATCH; | |
445 | } | |
446 | } | |
447 | ||
448 | // =========================== REPLACE ========================== | |
449 | ||
450 | // We have a full match. The key is between pos.start and | |
451 | // keyLimit. | |
452 | ||
453 | int32_t newStart; | |
454 | int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart); | |
455 | int32_t lenDelta = newLength - (keyLimit - pos.start); | |
456 | ||
457 | oText += lenDelta; | |
458 | pos.limit += lenDelta; | |
459 | pos.contextLimit += lenDelta; | |
460 | // Restrict new value of start to [minOText, min(oText, pos.limit)]. | |
461 | pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart)); | |
462 | return U_MATCH; | |
463 | } | |
464 | ||
465 | /** | |
466 | * Create a source string that represents this rule. Append it to the | |
467 | * given string. | |
468 | */ | |
469 | UnicodeString& TransliterationRule::toRule(UnicodeString& rule, | |
470 | UBool escapeUnprintable) const { | |
471 | ||
472 | // Accumulate special characters (and non-specials following them) | |
473 | // into quoteBuf. Append quoteBuf, within single quotes, when | |
474 | // a non-quoted element must be inserted. | |
475 | UnicodeString str, quoteBuf; | |
476 | ||
477 | // Do not emit the braces '{' '}' around the pattern if there | |
478 | // is neither anteContext nor postContext. | |
479 | UBool emitBraces = | |
480 | (anteContext != NULL) || (postContext != NULL); | |
481 | ||
482 | // Emit start anchor | |
483 | if ((flags & ANCHOR_START) != 0) { | |
484 | rule.append((UChar)94/*^*/); | |
485 | } | |
486 | ||
487 | // Emit the input pattern | |
488 | ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf); | |
489 | ||
490 | if (emitBraces) { | |
491 | ICU_Utility::appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf); | |
492 | } | |
493 | ||
494 | ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf); | |
495 | ||
496 | if (emitBraces) { | |
497 | ICU_Utility::appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf); | |
498 | } | |
499 | ||
500 | ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf); | |
501 | ||
502 | // Emit end anchor | |
503 | if ((flags & ANCHOR_END) != 0) { | |
504 | rule.append((UChar)36/*$*/); | |
505 | } | |
506 | ||
4388f060 | 507 | ICU_Utility::appendToRule(rule, UnicodeString(TRUE, FORWARD_OP, 3), TRUE, escapeUnprintable, quoteBuf); |
b75a7d8f A |
508 | |
509 | // Emit the output pattern | |
510 | ||
511 | ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable), | |
512 | TRUE, escapeUnprintable, quoteBuf); | |
513 | ||
514 | ICU_Utility::appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf); | |
515 | ||
516 | return rule; | |
517 | } | |
518 | ||
519 | void TransliterationRule::setData(const TransliterationRuleData* d) { | |
520 | data = d; | |
521 | if (anteContext != NULL) anteContext->setData(d); | |
522 | if (postContext != NULL) postContext->setData(d); | |
523 | if (key != NULL) key->setData(d); | |
524 | // assert(output != NULL); | |
525 | output->setData(d); | |
526 | // Don't have to do segments since they are in the context or key | |
527 | } | |
528 | ||
529 | /** | |
530 | * Union the set of all characters that may be modified by this rule | |
531 | * into the given set. | |
532 | */ | |
533 | void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const { | |
534 | int32_t limit = anteContextLength + keyLength; | |
535 | for (int32_t i=anteContextLength; i<limit; ) { | |
46f4442e | 536 | UChar32 ch = pattern.char32At(i); |
4388f060 | 537 | i += U16_LENGTH(ch); |
46f4442e A |
538 | const UnicodeMatcher* matcher = data->lookupMatcher(ch); |
539 | if (matcher == NULL) { | |
540 | toUnionTo.add(ch); | |
541 | } else { | |
542 | matcher->addMatchSetTo(toUnionTo); | |
543 | } | |
b75a7d8f A |
544 | } |
545 | } | |
546 | ||
547 | /** | |
548 | * Union the set of all characters that may be emitted by this rule | |
549 | * into the given set. | |
550 | */ | |
551 | void TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const { | |
552 | output->toReplacer()->addReplacementSetTo(toUnionTo); | |
553 | } | |
554 | ||
555 | U_NAMESPACE_END | |
556 | ||
557 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ | |
558 | ||
559 | //eof |