icuSources/i18n/strrepl.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (c) 2002-2004, International Business Machines Corporation
   4 *   and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   01/21/2002  aliu        Creation.
   8 **********************************************************************
   9 */
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_TRANSLITERATION
  14
  15 #include "strrepl.h"
  16 #include "rbt_data.h"
  17 #include "util.h"
  18 #include "unicode/uniset.h"
  19
  20 U_NAMESPACE_BEGIN
  21
  22 static const UChar EMPTY[] = { 0 }; // empty string: ""
  23
  24 UnicodeReplacer::~UnicodeReplacer() {}
  25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
  26
  27 /**
  28  * Construct a StringReplacer that sets the emits the given output
  29  * text and sets the cursor to the given position.
  30  * @param theOutput text that will replace input text when the
  31  * replace() method is called.  May contain stand-in characters
  32  * that represent nested replacers.
  33  * @param theCursorPos cursor position that will be returned by
  34  * the replace() method
  35  * @param theData transliterator context object that translates
  36  * stand-in characters to UnicodeReplacer objects
  37  */
  38 StringReplacer::StringReplacer(const UnicodeString& theOutput,
  39                                int32_t theCursorPos,
  40                                const TransliterationRuleData* theData) {
  41     output = theOutput;
  42     cursorPos = theCursorPos;
  43     hasCursor = TRUE;
  44     data = theData;
  45     isComplex = TRUE;
  46 }
  47
  48 /**
  49  * Construct a StringReplacer that sets the emits the given output
  50  * text and does not modify the cursor.
  51  * @param theOutput text that will replace input text when the
  52  * replace() method is called.  May contain stand-in characters
  53  * that represent nested replacers.
  54  * @param theData transliterator context object that translates
  55  * stand-in characters to UnicodeReplacer objects
  56  */
  57 StringReplacer::StringReplacer(const UnicodeString& theOutput,
  58                                const TransliterationRuleData* theData) {
  59     output = theOutput;
  60     cursorPos = 0;
  61     hasCursor = FALSE;
  62     data = theData;
  63     isComplex = TRUE;
  64 }
  65
  66 /**
  67  * Copy constructor.
  68  */
  69 StringReplacer::StringReplacer(const StringReplacer& other) :
  70     UnicodeFunctor(other),
  71     UnicodeReplacer(other)
  72 {
  73     output = other.output;
  74     cursorPos = other.cursorPos;
  75     hasCursor = other.hasCursor;
  76     data = other.data;
  77     isComplex = other.isComplex;
  78 }
  79
  80 /**
  81  * Destructor
  82  */
  83 StringReplacer::~StringReplacer() {
  84 }
  85
  86 /**
  87  * Implement UnicodeFunctor
  88  */
  89 UnicodeFunctor* StringReplacer::clone() const {
  90     return new StringReplacer(*this);
  91 }
  92
  93 /**
  94  * Implement UnicodeFunctor
  95  */
  96 UnicodeReplacer* StringReplacer::toReplacer() const {
  97     return (UnicodeReplacer*) this;
  98 }
  99
 100 /**
 101  * UnicodeReplacer API
 102  */
 103 int32_t StringReplacer::replace(Replaceable& text,
 104                                 int32_t start,
 105                                 int32_t limit,
 106                                 int32_t& cursor) {
 107     int32_t outLen;
 108     int32_t newStart = 0;
 109
 110     // NOTE: It should be possible to _always_ run the complex
 111     // processing code; just slower.  If not, then there is a bug
 112     // in the complex processing code.
 113
 114     // Simple (no nested replacers) Processing Code :
 115     if (!isComplex) {
 116         text.handleReplaceBetween(start, limit, output);
 117         outLen = output.length();
 118
 119         // Setup default cursor position (for cursorPos within output)
 120         newStart = cursorPos;
 121     }
 122
 123     // Complex (nested replacers) Processing Code :
 124     else {
 125         /* When there are segments to be copied, use the Replaceable.copy()
 126          * API in order to retain out-of-band data.  Copy everything to the
 127          * end of the string, then copy them back over the key.  This preserves
 128          * the integrity of indices into the key and surrounding context while
 129          * generating the output text.
 130          */
 131         UnicodeString buf;
 132         int32_t oOutput; // offset into 'output'
 133         isComplex = FALSE;
 134
 135         // The temporary buffer starts at tempStart, and extends
 136         // to destLimit.  The start of the buffer has a single
 137         // character from before the key.  This provides style
 138         // data when addition characters are filled into the
 139         // temporary buffer.  If there is nothing to the left, use
 140         // the non-character U+FFFF, which Replaceable subclasses
 141         // should treat specially as a "no-style character."
 142         // destStart points to the point after the style context
 143         // character, so it is tempStart+1 or tempStart+2.
 144         int32_t tempStart = text.length(); // start of temp buffer
 145         int32_t destStart = tempStart; // copy new text to here
 146         if (start > 0) {
 147             int32_t len = UTF_CHAR_LENGTH(text.char32At(start-1));
 148             text.copy(start-len, start, tempStart);
 149             destStart += len;
 150         } else {
 151             UnicodeString str((UChar) 0xFFFF);
 152             text.handleReplaceBetween(tempStart, tempStart, str);
 153             destStart++;
 154         }
 155         int32_t destLimit = destStart;
 156
 157         for (oOutput=0; oOutput<output.length(); ) {
 158             if (oOutput == cursorPos) {
 159                 // Record the position of the cursor
 160                 newStart = destLimit - destStart; // relative to start
 161             }
 162             UChar32 c = output.char32At(oOutput);
 163             UnicodeReplacer* r = data->lookupReplacer(c);
 164             if (r == NULL) {
 165                 // Accumulate straight (non-segment) text.
 166                 buf.append(c);
 167             } else {
 168                 isComplex = TRUE;
 169
 170                 // Insert any accumulated straight text.
 171                 if (buf.length() > 0) {
 172                     text.handleReplaceBetween(destLimit, destLimit, buf);
 173                     destLimit += buf.length();
 174                     buf.truncate(0);
 175                 }
 176
 177                 // Delegate output generation to replacer object
 178                 int32_t len = r->replace(text, destLimit, destLimit, cursor);
 179                 destLimit += len;
 180             }
 181             oOutput += UTF_CHAR_LENGTH(c);
 182         }
 183         // Insert any accumulated straight text.
 184         if (buf.length() > 0) {
 185             text.handleReplaceBetween(destLimit, destLimit, buf);
 186             destLimit += buf.length();
 187         }
 188         if (oOutput == cursorPos) {
 189             // Record the position of the cursor
 190             newStart = destLimit - destStart; // relative to start
 191         }
 192
 193         outLen = destLimit - destStart;
 194
 195         // Copy new text to start, and delete it
 196         text.copy(destStart, destLimit, start);
 197         text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, EMPTY);
 198
 199         // Delete the old text (the key)
 200         text.handleReplaceBetween(start + outLen, limit + outLen, EMPTY);
 201     }
 202
 203     if (hasCursor) {
 204         // Adjust the cursor for positions outside the key.  These
 205         // refer to code points rather than code units.  If cursorPos
 206         // is within the output string, then use newStart, which has
 207         // already been set above.
 208         if (cursorPos < 0) {
 209             newStart = start;
 210             int32_t n = cursorPos;
 211             // Outside the output string, cursorPos counts code points
 212             while (n < 0 && newStart > 0) {
 213                 newStart -= UTF_CHAR_LENGTH(text.char32At(newStart-1));
 214                 ++n;
 215             }
 216             newStart += n;
 217         } else if (cursorPos > output.length()) {
 218             newStart = start + outLen;
 219             int32_t n = cursorPos - output.length();
 220             // Outside the output string, cursorPos counts code points
 221             while (n > 0 && newStart < text.length()) {
 222                 newStart += UTF_CHAR_LENGTH(text.char32At(newStart));
 223                 --n;
 224             }
 225             newStart += n;
 226         } else {
 227             // Cursor is within output string.  It has been set up above
 228             // to be relative to start.
 229             newStart += start;
 230         }
 231
 232         cursor = newStart;
 233     }
 234
 235     return outLen;
 236 }
 237
 238 /**
 239  * UnicodeReplacer API
 240  */
 241 UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
 242                                                  UBool escapeUnprintable) const {
 243     rule.truncate(0);
 244     UnicodeString quoteBuf;
 245
 246     int32_t cursor = cursorPos;
 247
 248     // Handle a cursor preceding the output
 249     if (hasCursor && cursor < 0) {
 250         while (cursor++ < 0) {
 251             ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
 252         }
 253         // Fall through and append '|' below
 254     }
 255
 256     for (int32_t i=0; i<output.length(); ++i) {
 257         if (hasCursor && i == cursor) {
 258             ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
 259         }
 260         UChar c = output.charAt(i); // Ok to use 16-bits here
 261
 262         UnicodeReplacer* r = data->lookupReplacer(c);
 263         if (r == NULL) {
 264             ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
 265         } else {
 266             UnicodeString buf;
 267             r->toReplacerPattern(buf, escapeUnprintable);
 268             buf.insert(0, (UChar)0x20);
 269             buf.append((UChar)0x20);
 270             ICU_Utility::appendToRule(rule, buf,
 271                                       TRUE, escapeUnprintable, quoteBuf);
 272         }
 273     }
 274
 275     // Handle a cursor after the output.  Use > rather than >= because
 276     // if cursor == output.length() it is at the end of the output,
 277     // which is the default position, so we need not emit it.
 278     if (hasCursor && cursor > output.length()) {
 279         cursor -= output.length();
 280         while (cursor-- > 0) {
 281             ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
 282         }
 283         ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
 284     }
 285     // Flush quoteBuf out to result
 286     ICU_Utility::appendToRule(rule, -1,
 287                               TRUE, escapeUnprintable, quoteBuf);
 288
 289     return rule;
 290 }
 291
 292 /**
 293  * Implement UnicodeReplacer
 294  */
 295 void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
 296     UChar32 ch;
 297     for (int32_t i=0; i<output.length(); i+=UTF_CHAR_LENGTH(ch)) {
 298     ch = output.char32At(i);
 299     UnicodeReplacer* r = data->lookupReplacer(ch);
 300     if (r == NULL) {
 301         toUnionTo.add(ch);
 302     } else {
 303         r->addReplacementSetTo(toUnionTo);
 304     }
 305     }
 306 }
 307
 308 /**
 309  * UnicodeFunctor API
 310  */
 311 void StringReplacer::setData(const TransliterationRuleData* d) {
 312     data = d;
 313     int32_t i = 0;
 314     while (i<output.length()) {
 315         UChar32 c = output.char32At(i);
 316         UnicodeFunctor* f = data->lookup(c);
 317         if (f != NULL) {
 318             f->setData(data);
 319         }
 320         i += UTF_CHAR_LENGTH(c);
 321     }
 322 }
 323
 324 U_NAMESPACE_END
 325
 326 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 327
 328 //eof