[apple/icu.git] / icuSources / i18n / strrepl.cpp

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (c) 2002-2012, International Business Machines Corporation
*   and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   01/21/2002  aliu        Creation.
**********************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/uniset.h"
#include "unicode/utf16.h"
#include "strrepl.h"
#include "rbt_data.h"
#include "util.h"

U_NAMESPACE_BEGIN

// Out-of-line definition of the (empty) UnicodeReplacer destructor.
UnicodeReplacer::~UnicodeReplacer() {}
// NOTE(review): presumably expands to the class-ID / RTTI support
// definitions for StringReplacer -- confirm against the macro's
// definition, which is not visible in this file.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)

/**
 * Construct a StringReplacer that emits the given output text and
 * sets the cursor to the given position.
 * @param theOutput text that will replace input text when the
 * replace() method is called.  May contain stand-in characters
 * that represent nested replacers.
 * @param theCursorPos cursor position that will be returned by
 * the replace() method
 * @param theData transliterator context object that translates
 * stand-in characters to UnicodeReplacer objects
 */
StringReplacer::StringReplacer(const UnicodeString& theOutput,
                               int32_t theCursorPos,
                               const TransliterationRuleData* theData) :
    output(theOutput),
    cursorPos(theCursorPos),
    hasCursor(TRUE),
    // Start out assuming nested replacers are present; replace()
    // clears this flag once it has scanned the output and found none.
    isComplex(TRUE),
    data(theData)
{
}

/**
 * Construct a StringReplacer that emits the given output text and
 * does not modify the cursor.
 * @param theOutput text that will replace input text when the
 * replace() method is called.  May contain stand-in characters
 * that represent nested replacers.
 * @param theData transliterator context object that translates
 * stand-in characters to UnicodeReplacer objects
 */
StringReplacer::StringReplacer(const UnicodeString& theOutput,
                               const TransliterationRuleData* theData) :
    output(theOutput),
    // No cursor: the position is recorded as 0 and hasCursor is off,
    // so replace() leaves the caller's cursor alone.
    cursorPos(0),
    hasCursor(FALSE),
    // Start out assuming nested replacers are present; replace()
    // clears this flag once it has scanned the output and found none.
    isComplex(TRUE),
    data(theData)
{
}

/**
 * Copy constructor.  Copies every field of 'other'; the 'data'
 * pointer itself is copied, so both objects refer to the same
 * TransliterationRuleData.
 */
StringReplacer::StringReplacer(const StringReplacer& other) :
    UnicodeFunctor(other),
    UnicodeReplacer(other),
    output(other.output),
    cursorPos(other.cursorPos),
    hasCursor(other.hasCursor),
    isComplex(other.isComplex),
    data(other.data)
{
}

/**
 * Destructor.  Nothing to release explicitly: the body is empty and
 * the 'data' pointer is not deleted here.
 */
StringReplacer::~StringReplacer() {}

/**
 * Implement UnicodeFunctor.
 * @return a newly allocated copy of this object, created with the
 * copy constructor.
 */
UnicodeFunctor* StringReplacer::clone() const {
    StringReplacer* duplicate = new StringReplacer(*this);
    return duplicate;
}

/**
 * Implement UnicodeFunctor.
 * @return this object viewed as a (non-const) UnicodeReplacer.
 */
UnicodeReplacer* StringReplacer::toReplacer() const {
    // The interface hands out a mutable pointer from a const method,
    // hence the const_cast.
    StringReplacer* self = const_cast<StringReplacer*>(this);
    return self;
}

/**
 * UnicodeReplacer API.
 *
 * Replaces the range [start, limit) of 'text' (the "key") with this
 * replacer's output.  Stand-in characters in the output that map to
 * nested replacers (via data->lookupReplacer()) are expanded by
 * delegating to those replacers.
 *
 * @param text the text being modified in place
 * @param start start index of the range to replace
 * @param limit limit index of the range to replace
 * @param cursor receives the new cursor position, but only when this
 * object has a cursor (hasCursor); it is also handed to any nested
 * replacers
 * @return the length of the text that replaced [start, limit)
 *
 * Note that this method updates the member 'isComplex': after a call
 * that takes the complex path and finds no nested replacer, later
 * calls take the simple path.
 * NOTE(review): that write is unsynchronized -- confirm that callers
 * do not share one StringReplacer across threads.
 */
int32_t StringReplacer::replace(Replaceable& text,
                                int32_t start,
                                int32_t limit,
                                int32_t& cursor) {
    int32_t outLen;
    // Cursor position; relative to 'start' until the hasCursor block
    // at the end turns it into an absolute index.
    int32_t newStart = 0;

    // NOTE: It should be possible to _always_ run the complex
    // processing code; just slower.  If not, then there is a bug
    // in the complex processing code.

    // Simple (no nested replacers) Processing Code :
    if (!isComplex) {
        text.handleReplaceBetween(start, limit, output);
        outLen = output.length();

        // Setup default cursor position (for cursorPos within output)
        newStart = cursorPos;
    }

    // Complex (nested replacers) Processing Code :
    else {
        /* When there are segments to be copied, use the Replaceable.copy()
         * API in order to retain out-of-band data.  Copy everything to the
         * end of the string, then copy them back over the key.  This preserves
         * the integrity of indices into the key and surrounding context while
         * generating the output text.
         */
        UnicodeString buf;
        int32_t oOutput; // offset into 'output'
        // Assume there are no nested replacers; the loop below sets the
        // flag back to TRUE as soon as it finds one.
        isComplex = FALSE;

        // The temporary buffer starts at tempStart, and extends
        // to destLimit.  The start of the buffer has a single
        // character from before the key.  This provides style
        // data when additional characters are filled into the
        // temporary buffer.  If there is nothing to the left, use
        // the non-character U+FFFF, which Replaceable subclasses
        // should treat specially as a "no-style character."
        // destStart points to the point after the style context
        // character, so it is tempStart+1 or tempStart+2.
        int32_t tempStart = text.length(); // start of temp buffer
        int32_t destStart = tempStart; // copy new text to here
        if (start > 0) {
            // Length in code units (1 or 2) of the code point before the key
            int32_t len = U16_LENGTH(text.char32At(start-1));
            text.copy(start-len, start, tempStart);
            destStart += len;
        } else {
            UnicodeString str((UChar) 0xFFFF);
            text.handleReplaceBetween(tempStart, tempStart, str);
            destStart++;
        }
        int32_t destLimit = destStart;

        // Walk 'output' one code point at a time.  Plain characters are
        // batched in 'buf'; a stand-in for a nested replacer first flushes
        // 'buf' into the temporary buffer and then lets that replacer
        // append its own output there.
        for (oOutput=0; oOutput<output.length(); ) {
            if (oOutput == cursorPos) {
                // Record the position of the cursor
                newStart = destLimit - destStart; // relative to start
            }
            UChar32 c = output.char32At(oOutput);
            UnicodeReplacer* r = data->lookupReplacer(c);
            if (r == NULL) {
                // Accumulate straight (non-segment) text.
                buf.append(c);
            } else {
                isComplex = TRUE;

                // Insert any accumulated straight text.
                if (buf.length() > 0) {
                    text.handleReplaceBetween(destLimit, destLimit, buf);
                    destLimit += buf.length();
                    buf.truncate(0);
                }

                // Delegate output generation to replacer object
                int32_t len = r->replace(text, destLimit, destLimit, cursor);
                destLimit += len;
            }
            oOutput += U16_LENGTH(c);
        }
        // Insert any accumulated straight text.
        if (buf.length() > 0) {
            text.handleReplaceBetween(destLimit, destLimit, buf);
            destLimit += buf.length();
        }
        // The loop only checks the cursor before each code point, so a
        // cursor sitting exactly at the end of the output is handled here.
        if (oOutput == cursorPos) {
            // Record the position of the cursor
            newStart = destLimit - destStart; // relative to start
        }

        outLen = destLimit - destStart;

        // Copy new text to start, and delete it
        text.copy(destStart, destLimit, start);
        // The indices below are offset by outLen to account for the
        // outLen code units that were just copied in at 'start'.  This
        // call removes the whole temporary buffer, including its leading
        // style-context character.
        text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString());

        // Delete the old text (the key)
        text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString());
    }        

    if (hasCursor) {
        // Adjust the cursor for positions outside the key.  These
        // refer to code points rather than code units.  If cursorPos
        // is within the output string, then use newStart, which has
        // already been set above.
        if (cursorPos < 0) {
            newStart = start;
            int32_t n = cursorPos;
            // Outside the output string, cursorPos counts code points
            while (n < 0 && newStart > 0) {
                newStart -= U16_LENGTH(text.char32At(newStart-1));
                ++n;
            }
            // If the start of the text was reached first, n is still
            // negative and the remainder is applied as-is, which can
            // leave newStart below zero.
            newStart += n;
        } else if (cursorPos > output.length()) {
            newStart = start + outLen;
            int32_t n = cursorPos - output.length();
            // Outside the output string, cursorPos counts code points
            while (n > 0 && newStart < text.length()) {
                newStart += U16_LENGTH(text.char32At(newStart));
                --n;
            }
            // If the end of the text was reached first, the remaining
            // count is applied as-is, which can leave newStart past
            // text.length().
            newStart += n;
        } else {
            // Cursor is within output string.  It has been set up above
            // to be relative to start.
            newStart += start;
        }

        cursor = newStart;
    }

    return outLen;
}

/**
 * UnicodeReplacer API.
 *
 * Builds the rule-source form of this replacer in 'rule' and returns
 * it.  Any previous contents of 'rule' are discarded.
 *
 * The cursor, if this object has one, is written as follows:
 *   - before the output (cursorPos < 0): '|' followed by one '@' per
 *     position, then the output, e.g. "|@@abc" for cursorPos == -2;
 *   - inside the output: '|' at offset cursorPos;
 *   - at the very end of the output: nothing, since that is the
 *     default position;
 *   - after the output: the output, one '@' per position past the
 *     end, then '|', e.g. "abc@@|".
 *
 * @param rule receives the pattern; also the return value
 * @param escapeUnprintable passed through to ICU_Utility::appendToRule()
 * and to nested replacers' toReplacerPattern()
 * @return 'rule'
 */
UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
                                                 UBool escapeUnprintable) const {
    rule.truncate(0);
    UnicodeString quoteBuf;

    // TRUE while the cursor marker '|' still has to be written.
    UBool cursorPending = hasCursor;

    // Handle a cursor preceding the output.  The '|' must come first,
    // followed by the '@' fillers.  Once it is written here, no further
    // cursor may be emitted below; the previous code left its counter
    // at 1 after this step, which made the loop below write a stray
    // '|' after the first output character.
    if (cursorPending && cursorPos < 0) {
        ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
        for (int32_t n = cursorPos; n < 0; ++n) {
            ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
        }
        cursorPending = FALSE;
    }

    for (int32_t i=0; i<output.length(); ++i) {
        // Cursor inside the output: emit it in front of code unit i.
        if (cursorPending && i == cursorPos) {
            ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
        }
        UChar c = output.charAt(i); // Ok to use 16-bits here

        UnicodeReplacer* r = data->lookupReplacer(c);
        if (r == NULL) {
            // Ordinary output character
            ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
        } else {
            // Stand-in for a nested replacer: splice in that replacer's
            // own pattern, set off by a space on either side.
            UnicodeString buf;
            r->toReplacerPattern(buf, escapeUnprintable);
            buf.insert(0, (UChar)0x20);
            buf.append((UChar)0x20);
            ICU_Utility::appendToRule(rule, buf,
                                      TRUE, escapeUnprintable, quoteBuf);
        }
    }

    // Handle a cursor after the output.  Use > rather than >= because
    // if cursorPos == output.length() it is at the end of the output,
    // which is the default position, so we need not emit it.
    if (cursorPending && cursorPos > output.length()) {
        for (int32_t n = cursorPos - output.length(); n > 0; --n) {
            ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
        }
        ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
    }
    // Flush quoteBuf out to result
    ICU_Utility::appendToRule(rule, -1,
                              TRUE, escapeUnprintable, quoteBuf);

    return rule;
}

/**
 * Implement UnicodeReplacer.
 *
 * Adds to 'toUnionTo' every code point of the output text.  A code
 * point that is a stand-in for a nested replacer is not added itself;
 * the nested replacer is asked to add its own replacement set instead.
 * @param toUnionTo the set to add to
 */
void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
    int32_t pos = 0;
    while (pos < output.length()) {
        const UChar32 cp = output.char32At(pos);
        UnicodeReplacer* nested = data->lookupReplacer(cp);
        if (nested != NULL) {
            nested->addReplacementSetTo(toUnionTo);
        } else {
            toUnionTo.add(cp);
        }
        // Step over one or two code units, depending on the code point.
        pos += U16_LENGTH(cp);
    }
}

/**
 * UnicodeFunctor API
 */
void StringReplacer::setData(const TransliterationRuleData* d) {
    data = d;
    int32_t i = 0;
    while (i<output.length()) {
        UChar32 c = output.char32At(i);
        UnicodeFunctor* f = data->lookup(c);
        if (f != NULL) {
            f->setData(data);
        }
        i += U16_LENGTH(c);
    }
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

//eof
Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f A	3	/*
b75a7d8f A	4	**********************************************************************
51004dcb	5	* Copyright (c) 2002-2012, International Business Machines Corporation
b75a7d8f A	6	* and others. All Rights Reserved.
	7	**********************************************************************
	8	* Date Name Description
	9	* 01/21/2002 aliu Creation.
	10	**********************************************************************
	11	*/
	12
	13	#include "unicode/utypes.h"
	14
	15	#if !UCONFIG_NO_TRANSLITERATION
	16
4388f060 A	17	#include "unicode/uniset.h"
4388f060 A	18	#include "unicode/utf16.h"
b75a7d8f A	19	#include "strrepl.h"
	20	#include "rbt_data.h"
	21	#include "util.h"
b75a7d8f A	22
	23	U_NAMESPACE_BEGIN
	24
374ca955 A	25	UnicodeReplacer::~UnicodeReplacer() {}
374ca955 A	26	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
b75a7d8f A	27
	28	/**
	29	* Construct a StringReplacer that sets the emits the given output
	30	* text and sets the cursor to the given position.
	31	* @param theOutput text that will replace input text when the
	32	* replace() method is called. May contain stand-in characters
	33	* that represent nested replacers.
	34	* @param theCursorPos cursor position that will be returned by
	35	* the replace() method
	36	* @param theData transliterator context object that translates
	37	* stand-in characters to UnicodeReplacer objects
	38	*/
	39	StringReplacer::StringReplacer(const UnicodeString& theOutput,
	40	int32_t theCursorPos,
	41	const TransliterationRuleData* theData) {
	42	output = theOutput;
	43	cursorPos = theCursorPos;
	44	hasCursor = TRUE;
	45	data = theData;
	46	isComplex = TRUE;
	47	}
	48
	49	/**
	50	* Construct a StringReplacer that sets the emits the given output
	51	* text and does not modify the cursor.
	52	* @param theOutput text that will replace input text when the
	53	* replace() method is called. May contain stand-in characters
	54	* that represent nested replacers.
	55	* @param theData transliterator context object that translates
	56	* stand-in characters to UnicodeReplacer objects
	57	*/
	58	StringReplacer::StringReplacer(const UnicodeString& theOutput,
	59	const TransliterationRuleData* theData) {
	60	output = theOutput;
	61	cursorPos = 0;
	62	hasCursor = FALSE;
	63	data = theData;
	64	isComplex = TRUE;
	65	}
	66
	67	/**
	68	* Copy constructor.
	69	*/
374ca955 A	70	StringReplacer::StringReplacer(const StringReplacer& other) :
	71	UnicodeFunctor(other),
	72	UnicodeReplacer(other)
	73	{
b75a7d8f A	74	output = other.output;
	75	cursorPos = other.cursorPos;
	76	hasCursor = other.hasCursor;
	77	data = other.data;
	78	isComplex = other.isComplex;
	79	}
	80
	81	/**
	82	* Destructor
	83	*/
	84	StringReplacer::~StringReplacer() {
	85	}
	86
	87	/**
	88	* Implement UnicodeFunctor
	89	*/
	90	UnicodeFunctor* StringReplacer::clone() const {
	91	return new StringReplacer(*this);
	92	}
	93
	94	/**
	95	* Implement UnicodeFunctor
	96	*/
	97	UnicodeReplacer* StringReplacer::toReplacer() const {
51004dcb	98	return const_cast<StringReplacer *>(this);
b75a7d8f A	99	}
	100
	101	/**
	102	* UnicodeReplacer API
	103	*/
	104	int32_t StringReplacer::replace(Replaceable& text,
	105	int32_t start,
	106	int32_t limit,
	107	int32_t& cursor) {
	108	int32_t outLen;
	109	int32_t newStart = 0;
	110
	111	// NOTE: It should be possible to _always_ run the complex
	112	// processing code; just slower. If not, then there is a bug
	113	// in the complex processing code.
	114
	115	// Simple (no nested replacers) Processing Code :
	116	if (!isComplex) {
	117	text.handleReplaceBetween(start, limit, output);
	118	outLen = output.length();
	119
	120	// Setup default cursor position (for cursorPos within output)
	121	newStart = cursorPos;
	122	}
	123
	124	// Complex (nested replacers) Processing Code :
	125	else {
	126	/* When there are segments to be copied, use the Replaceable.copy()
	127	* API in order to retain out-of-band data. Copy everything to the
	128	* end of the string, then copy them back over the key. This preserves
	129	* the integrity of indices into the key and surrounding context while
	130	* generating the output text.
	131	*/
	132	UnicodeString buf;
	133	int32_t oOutput; // offset into 'output'
	134	isComplex = FALSE;
	135
	136	// The temporary buffer starts at tempStart, and extends
	137	// to destLimit. The start of the buffer has a single
	138	// character from before the key. This provides style
	139	// data when addition characters are filled into the
	140	// temporary buffer. If there is nothing to the left, use
	141	// the non-character U+FFFF, which Replaceable subclasses
	142	// should treat specially as a "no-style character."
	143	// destStart points to the point after the style context
	144	// character, so it is tempStart+1 or tempStart+2.
	145	int32_t tempStart = text.length(); // start of temp buffer
	146	int32_t destStart = tempStart; // copy new text to here
	147	if (start > 0) {
4388f060	148	int32_t len = U16_LENGTH(text.char32At(start-1));
b75a7d8f A	149	text.copy(start-len, start, tempStart);
	150	destStart += len;
	151	} else {
	152	UnicodeString str((UChar) 0xFFFF);
	153	text.handleReplaceBetween(tempStart, tempStart, str);
	154	destStart++;
	155	}
	156	int32_t destLimit = destStart;
	157
	158	for (oOutput=0; oOutput<output.length(); ) {
	159	if (oOutput == cursorPos) {
	160	// Record the position of the cursor
	161	newStart = destLimit - destStart; // relative to start
	162	}
	163	UChar32 c = output.char32At(oOutput);
	164	UnicodeReplacer* r = data->lookupReplacer(c);
	165	if (r == NULL) {
	166	// Accumulate straight (non-segment) text.
	167	buf.append(c);
	168	} else {
	169	isComplex = TRUE;
	170
	171	// Insert any accumulated straight text.
	172	if (buf.length() > 0) {
	173	text.handleReplaceBetween(destLimit, destLimit, buf);
	174	destLimit += buf.length();
	175	buf.truncate(0);
	176	}
	177
	178	// Delegate output generation to replacer object
	179	int32_t len = r->replace(text, destLimit, destLimit, cursor);
	180	destLimit += len;
	181	}
4388f060	182	oOutput += U16_LENGTH(c);
b75a7d8f A	183	}
	184	// Insert any accumulated straight text.
	185	if (buf.length() > 0) {
	186	text.handleReplaceBetween(destLimit, destLimit, buf);
	187	destLimit += buf.length();
	188	}
	189	if (oOutput == cursorPos) {
	190	// Record the position of the cursor
	191	newStart = destLimit - destStart; // relative to start
	192	}
	193
	194	outLen = destLimit - destStart;
	195
	196	// Copy new text to start, and delete it
	197	text.copy(destStart, destLimit, start);
4388f060	198	text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString());
b75a7d8f A	199
b75a7d8f A	200	// Delete the old text (the key)
4388f060	201	text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString());
b75a7d8f A	202	}
	203
	204	if (hasCursor) {
	205	// Adjust the cursor for positions outside the key. These
	206	// refer to code points rather than code units. If cursorPos
	207	// is within the output string, then use newStart, which has
	208	// already been set above.
	209	if (cursorPos < 0) {
	210	newStart = start;
	211	int32_t n = cursorPos;
	212	// Outside the output string, cursorPos counts code points
	213	while (n < 0 && newStart > 0) {
4388f060	214	newStart -= U16_LENGTH(text.char32At(newStart-1));
b75a7d8f A	215	++n;
	216	}
	217	newStart += n;
	218	} else if (cursorPos > output.length()) {
	219	newStart = start + outLen;
	220	int32_t n = cursorPos - output.length();
	221	// Outside the output string, cursorPos counts code points
	222	while (n > 0 && newStart < text.length()) {
4388f060	223	newStart += U16_LENGTH(text.char32At(newStart));
b75a7d8f A	224	--n;
	225	}
	226	newStart += n;
	227	} else {
	228	// Cursor is within output string. It has been set up above
	229	// to be relative to start.
	230	newStart += start;
	231	}
	232
	233	cursor = newStart;
	234	}
	235
	236	return outLen;
	237	}
	238
	239	/**
	240	* UnicodeReplacer API
	241	*/
	242	UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
	243	UBool escapeUnprintable) const {
	244	rule.truncate(0);
	245	UnicodeString quoteBuf;
	246
	247	int32_t cursor = cursorPos;
	248
	249	// Handle a cursor preceding the output
	250	if (hasCursor && cursor < 0) {
	251	while (cursor++ < 0) {
	252	ICU_Utility::appendToRule(rule, (UChar)0x0040 /@/, TRUE, escapeUnprintable, quoteBuf);
	253	}
	254	// Fall through and append '\|' below
	255	}
	256
	257	for (int32_t i=0; i<output.length(); ++i) {
	258	if (hasCursor && i == cursor) {
	259	ICU_Utility::appendToRule(rule, (UChar)0x007C /\|/, TRUE, escapeUnprintable, quoteBuf);
	260	}
	261	UChar c = output.charAt(i); // Ok to use 16-bits here
	262
	263	UnicodeReplacer* r = data->lookupReplacer(c);
	264	if (r == NULL) {
	265	ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
	266	} else {
	267	UnicodeString buf;
	268	r->toReplacerPattern(buf, escapeUnprintable);
	269	buf.insert(0, (UChar)0x20);
	270	buf.append((UChar)0x20);
	271	ICU_Utility::appendToRule(rule, buf,
	272	TRUE, escapeUnprintable, quoteBuf);
	273	}
	274	}
	275
	276	// Handle a cursor after the output. Use > rather than >= because
	277	// if cursor == output.length() it is at the end of the output,
	278	// which is the default position, so we need not emit it.
	279	if (hasCursor && cursor > output.length()) {
	280	cursor -= output.length();
	281	while (cursor-- > 0) {
	282	ICU_Utility::appendToRule(rule, (UChar)0x0040 /@/, TRUE, escapeUnprintable, quoteBuf);
	283	}
	284	ICU_Utility::appendToRule(rule, (UChar)0x007C /\|/, TRUE, escapeUnprintable, quoteBuf);
	285	}
	286	// Flush quoteBuf out to result
	287	ICU_Utility::appendToRule(rule, -1,
288	TRUE, escapeUnprintable, quoteBuf);
289
290	return rule;
291	}
292
293	/**
294	* Implement UnicodeReplacer
295	*/
296	void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
297	UChar32 ch;
4388f060	298	for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) {
374ca955 A	299	ch = output.char32At(i);
	300	UnicodeReplacer* r = data->lookupReplacer(ch);
	301	if (r == NULL) {
	302	toUnionTo.add(ch);
	303	} else {
	304	r->addReplacementSetTo(toUnionTo);
	305	}
b75a7d8f A	306	}
	307	}
	308
	309	/**
	310	* UnicodeFunctor API
	311	*/
	312	void StringReplacer::setData(const TransliterationRuleData* d) {
	313	data = d;
	314	int32_t i = 0;
	315	while (i<output.length()) {
	316	UChar32 c = output.char32At(i);
	317	UnicodeFunctor* f = data->lookup(c);
	318	if (f != NULL) {
	319	f->setData(data);
	320	}
4388f060	321	i += U16_LENGTH(c);
b75a7d8f A	322	}
	323	}
	324
	325	U_NAMESPACE_END
	326
	327	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
	328
	329	//eof