icuSources/i18n/strrepl.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (c) 2002-2012, International Business Machines Corporation
   4 *   and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   01/21/2002  aliu        Creation.
   8 **********************************************************************
   9 */
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_TRANSLITERATION
  14
  15 #include "unicode/uniset.h"
  16 #include "unicode/utf16.h"
  17 #include "strrepl.h"
  18 #include "rbt_data.h"
  19 #include "util.h"
  20
  21 U_NAMESPACE_BEGIN
  22
  23 UnicodeReplacer::~UnicodeReplacer() {}
  24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
  25
  26 /**
  27  * Construct a StringReplacer that sets the emits the given output
  28  * text and sets the cursor to the given position.
  29  * @param theOutput text that will replace input text when the
  30  * replace() method is called.  May contain stand-in characters
  31  * that represent nested replacers.
  32  * @param theCursorPos cursor position that will be returned by
  33  * the replace() method
  34  * @param theData transliterator context object that translates
  35  * stand-in characters to UnicodeReplacer objects
  36  */
  37 StringReplacer::StringReplacer(const UnicodeString& theOutput,
  38                                int32_t theCursorPos,
  39                                const TransliterationRuleData* theData) {
  40     output = theOutput;
  41     cursorPos = theCursorPos;
  42     hasCursor = TRUE;
  43     data = theData;
  44     isComplex = TRUE;
  45 }
  46
  47 /**
  48  * Construct a StringReplacer that sets the emits the given output
  49  * text and does not modify the cursor.
  50  * @param theOutput text that will replace input text when the
  51  * replace() method is called.  May contain stand-in characters
  52  * that represent nested replacers.
  53  * @param theData transliterator context object that translates
  54  * stand-in characters to UnicodeReplacer objects
  55  */
  56 StringReplacer::StringReplacer(const UnicodeString& theOutput,
  57                                const TransliterationRuleData* theData) {
  58     output = theOutput;
  59     cursorPos = 0;
  60     hasCursor = FALSE;
  61     data = theData;
  62     isComplex = TRUE;
  63 }
  64
  65 /**
  66  * Copy constructor.
  67  */
  68 StringReplacer::StringReplacer(const StringReplacer& other) :
  69     UnicodeFunctor(other),
  70     UnicodeReplacer(other)
  71 {
  72     output = other.output;
  73     cursorPos = other.cursorPos;
  74     hasCursor = other.hasCursor;
  75     data = other.data;
  76     isComplex = other.isComplex;
  77 }
  78
  79 /**
  80  * Destructor
  81  */
  82 StringReplacer::~StringReplacer() {
  83 }
  84
  85 /**
  86  * Implement UnicodeFunctor
  87  */
  88 UnicodeFunctor* StringReplacer::clone() const {
  89     return new StringReplacer(*this);
  90 }
  91
  92 /**
  93  * Implement UnicodeFunctor
  94  */
  95 UnicodeReplacer* StringReplacer::toReplacer() const {
  96   return const_cast<StringReplacer *>(this);
  97 }
  98
  99 /**
 100  * UnicodeReplacer API
 101  */
 102 int32_t StringReplacer::replace(Replaceable& text,
 103                                 int32_t start,
 104                                 int32_t limit,
 105                                 int32_t& cursor) {
 106     int32_t outLen;
 107     int32_t newStart = 0;
 108
 109     // NOTE: It should be possible to _always_ run the complex
 110     // processing code; just slower.  If not, then there is a bug
 111     // in the complex processing code.
 112
 113     // Simple (no nested replacers) Processing Code :
 114     if (!isComplex) {
 115         text.handleReplaceBetween(start, limit, output);
 116         outLen = output.length();
 117
 118         // Setup default cursor position (for cursorPos within output)
 119         newStart = cursorPos;
 120     }
 121
 122     // Complex (nested replacers) Processing Code :
 123     else {
 124         /* When there are segments to be copied, use the Replaceable.copy()
 125          * API in order to retain out-of-band data.  Copy everything to the
 126          * end of the string, then copy them back over the key.  This preserves
 127          * the integrity of indices into the key and surrounding context while
 128          * generating the output text.
 129          */
 130         UnicodeString buf;
 131         int32_t oOutput; // offset into 'output'
 132         isComplex = FALSE;
 133
 134         // The temporary buffer starts at tempStart, and extends
 135         // to destLimit.  The start of the buffer has a single
 136         // character from before the key.  This provides style
 137         // data when addition characters are filled into the
 138         // temporary buffer.  If there is nothing to the left, use
 139         // the non-character U+FFFF, which Replaceable subclasses
 140         // should treat specially as a "no-style character."
 141         // destStart points to the point after the style context
 142         // character, so it is tempStart+1 or tempStart+2.
 143         int32_t tempStart = text.length(); // start of temp buffer
 144         int32_t destStart = tempStart; // copy new text to here
 145         if (start > 0) {
 146             int32_t len = U16_LENGTH(text.char32At(start-1));
 147             text.copy(start-len, start, tempStart);
 148             destStart += len;
 149         } else {
 150             UnicodeString str((UChar) 0xFFFF);
 151             text.handleReplaceBetween(tempStart, tempStart, str);
 152             destStart++;
 153         }
 154         int32_t destLimit = destStart;
 155
 156         for (oOutput=0; oOutput<output.length(); ) {
 157             if (oOutput == cursorPos) {
 158                 // Record the position of the cursor
 159                 newStart = destLimit - destStart; // relative to start
 160             }
 161             UChar32 c = output.char32At(oOutput);
 162             UnicodeReplacer* r = data->lookupReplacer(c);
 163             if (r == NULL) {
 164                 // Accumulate straight (non-segment) text.
 165                 buf.append(c);
 166             } else {
 167                 isComplex = TRUE;
 168
 169                 // Insert any accumulated straight text.
 170                 if (buf.length() > 0) {
 171                     text.handleReplaceBetween(destLimit, destLimit, buf);
 172                     destLimit += buf.length();
 173                     buf.truncate(0);
 174                 }
 175
 176                 // Delegate output generation to replacer object
 177                 int32_t len = r->replace(text, destLimit, destLimit, cursor);
 178                 destLimit += len;
 179             }
 180             oOutput += U16_LENGTH(c);
 181         }
 182         // Insert any accumulated straight text.
 183         if (buf.length() > 0) {
 184             text.handleReplaceBetween(destLimit, destLimit, buf);
 185             destLimit += buf.length();
 186         }
 187         if (oOutput == cursorPos) {
 188             // Record the position of the cursor
 189             newStart = destLimit - destStart; // relative to start
 190         }
 191
 192         outLen = destLimit - destStart;
 193
 194         // Copy new text to start, and delete it
 195         text.copy(destStart, destLimit, start);
 196         text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString());
 197
 198         // Delete the old text (the key)
 199         text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString());
 200     }
 201
 202     if (hasCursor) {
 203         // Adjust the cursor for positions outside the key.  These
 204         // refer to code points rather than code units.  If cursorPos
 205         // is within the output string, then use newStart, which has
 206         // already been set above.
 207         if (cursorPos < 0) {
 208             newStart = start;
 209             int32_t n = cursorPos;
 210             // Outside the output string, cursorPos counts code points
 211             while (n < 0 && newStart > 0) {
 212                 newStart -= U16_LENGTH(text.char32At(newStart-1));
 213                 ++n;
 214             }
 215             newStart += n;
 216         } else if (cursorPos > output.length()) {
 217             newStart = start + outLen;
 218             int32_t n = cursorPos - output.length();
 219             // Outside the output string, cursorPos counts code points
 220             while (n > 0 && newStart < text.length()) {
 221                 newStart += U16_LENGTH(text.char32At(newStart));
 222                 --n;
 223             }
 224             newStart += n;
 225         } else {
 226             // Cursor is within output string.  It has been set up above
 227             // to be relative to start.
 228             newStart += start;
 229         }
 230
 231         cursor = newStart;
 232     }
 233
 234     return outLen;
 235 }
 236
 237 /**
 238  * UnicodeReplacer API
 239  */
 240 UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
 241                                                  UBool escapeUnprintable) const {
 242     rule.truncate(0);
 243     UnicodeString quoteBuf;
 244
 245     int32_t cursor = cursorPos;
 246
 247     // Handle a cursor preceding the output
 248     if (hasCursor && cursor < 0) {
 249         while (cursor++ < 0) {
 250             ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
 251         }
 252         // Fall through and append '|' below
 253     }
 254
 255     for (int32_t i=0; i<output.length(); ++i) {
 256         if (hasCursor && i == cursor) {
 257             ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
 258         }
 259         UChar c = output.charAt(i); // Ok to use 16-bits here
 260
 261         UnicodeReplacer* r = data->lookupReplacer(c);
 262         if (r == NULL) {
 263             ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
 264         } else {
 265             UnicodeString buf;
 266             r->toReplacerPattern(buf, escapeUnprintable);
 267             buf.insert(0, (UChar)0x20);
 268             buf.append((UChar)0x20);
 269             ICU_Utility::appendToRule(rule, buf,
 270                                       TRUE, escapeUnprintable, quoteBuf);
 271         }
 272     }
 273
 274     // Handle a cursor after the output.  Use > rather than >= because
 275     // if cursor == output.length() it is at the end of the output,
 276     // which is the default position, so we need not emit it.
 277     if (hasCursor && cursor > output.length()) {
 278         cursor -= output.length();
 279         while (cursor-- > 0) {
 280             ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
 281         }
 282         ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
 283     }
 284     // Flush quoteBuf out to result
 285     ICU_Utility::appendToRule(rule, -1,
 286                               TRUE, escapeUnprintable, quoteBuf);
 287
 288     return rule;
 289 }
 290
 291 /**
 292  * Implement UnicodeReplacer
 293  */
 294 void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
 295     UChar32 ch;
 296     for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) {
 297     ch = output.char32At(i);
 298     UnicodeReplacer* r = data->lookupReplacer(ch);
 299     if (r == NULL) {
 300         toUnionTo.add(ch);
 301     } else {
 302         r->addReplacementSetTo(toUnionTo);
 303     }
 304     }
 305 }
 306
 307 /**
 308  * UnicodeFunctor API
 309  */
 310 void StringReplacer::setData(const TransliterationRuleData* d) {
 311     data = d;
 312     int32_t i = 0;
 313     while (i<output.length()) {
 314         UChar32 c = output.char32At(i);
 315         UnicodeFunctor* f = data->lookup(c);
 316         if (f != NULL) {
 317             f->setData(data);
 318         }
 319         i += U16_LENGTH(c);
 320     }
 321 }
 322
 323 U_NAMESPACE_END
 324
 325 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 326
 327 //eof