[apple/icu.git] / icuSources / i18n / brktrans.cpp

/*
**********************************************************************
*   Copyright (C) 2008-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   05/11/2008  Andy Heninger  Port from Java
**********************************************************************
*/

#include "unicode/utypes.h"

#if  !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION

#include "unicode/unifilt.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/brkiter.h"
#include "brktrans.h"
#include "unicode/uchar.h"
#include "cmemory.h"
#include "uprops.h"
#include "uinvchar.h"
#include "util.h"
#include "uvectr32.h"

U_NAMESPACE_BEGIN

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)

static const UChar SPACE       = 32;  // ' '


/**
 * Constructs a break transliterator registered under the ID
 * "Any-BreakInternal".  The insertion string defaults to a single
 * space (U+0020).
 * @param adoptedFilter  filter handed to the Transliterator base class
 *                       constructor.
 */
BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
    Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
    fInsertion(SPACE)
{
    // Scratch vector that handleTransliterate() fills with boundary offsets.
    // The status reported by the UVector32 constructor is not inspected here.
    UErrorCode vectorStatus = U_ZERO_ERROR;
    boundaries = new UVector32(vectorStatus);

    // The break iterator is created on demand by getBreakIterator().
    bi = NULL;
}


/**
 * Destructor.  Deletes the break iterator and the boundary vector held by
 * this object and clears both pointers.
 */
BreakTransliterator::~BreakTransliterator() {
    delete boundaries;
    boundaries = NULL;

    delete bi;
    bi = NULL;
}

/**
 * Copy constructor.  Copies the insertion string, clones the other object's
 * break iterator when it has one, and always starts with a fresh, empty
 * boundary vector.
 * @param o  the transliterator to copy.
 */
BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
    Transliterator(o),
    fInsertion(o.fInsertion)
{
    // Only clone an iterator that the source has already created; otherwise
    // leave it NULL so that it is created lazily on first use.
    bi = (o.bi != NULL) ? o.bi->clone() : NULL;

    // The boundary vector is per-object scratch space and is not copied.
    UErrorCode vectorStatus = U_ZERO_ERROR;
    boundaries = new UVector32(vectorStatus);
}


/**
 * Transliterator API.
 * @return a newly allocated copy of this object, produced with the copy
 *         constructor.
 */
Transliterator* BreakTransliterator::clone(void) const {
    BreakTransliterator *duplicate = new BreakTransliterator(*this);
    return duplicate;
}

/**
 * Implements {@link Transliterator#handleTransliterate}.
 *
 * Walks the word-break boundaries reported by the break iterator (after
 * positioning it with preceding(offsets.start)) up to, but not including,
 * offsets.limit.  Every boundary whose neighbouring code points on both
 * sides are letters or marks is recorded, and the insertion string is then
 * inserted into the text at each recorded boundary.
 *
 * If the break iterator or the boundary vector is unavailable (creation
 * failed), the function returns without modifying the text or the offsets.
 *
 * @param text           the text to modify.
 * @param offsets        the transliteration position; limit and contextLimit
 *                       are advanced by the total inserted length, and start
 *                       is updated as described at the end of the function.
 * @param isIncremental  selects how offsets.start is updated on return.
 */
void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                    UBool isIncremental ) const {

    UErrorCode status = U_ZERO_ERROR;

    // This method is const, but the break iterator is created lazily and
    // stored in the object, so constness has to be cast away here.
    BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
    BreakIterator *wordBreaks = nonConstThis->getBreakIterator();

    // Either pointer can be NULL when its allocation/creation failed.
    // There is no error channel in this API, so do nothing instead of
    // dereferencing a NULL pointer.
    if (wordBreaks == NULL || boundaries == NULL) {
        return;
    }

    boundaries->removeAllElements();
    UnicodeString sText = replaceableAsString(text);
    wordBreaks->setText(sText);
    wordBreaks->preceding(offsets.start);

    // To make things much easier, we will stack the boundaries, and then insert at the end.
    // generally, we won't need too many, since we will be filtered.

    int32_t boundary;
    for (boundary = wordBreaks->next();
         boundary != UBRK_DONE && boundary < offsets.limit;
         boundary = wordBreaks->next()) {
        if (boundary == 0) continue;

        // HACK: Check to see that the preceding item was a letter (or mark).
        UChar32 cp = sText.char32At(boundary-1);
        int type = u_charType(cp);
        if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;

        // ... and that the following item is a letter (or mark) as well.
        cp = sText.char32At(boundary);
        type = u_charType(cp);
        if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;

        boundaries->addElement(boundary, status);
    }

    int delta = 0;
    int lastBoundary = 0;

    if (boundaries->size() != 0) { // if we found something, adjust
        delta = boundaries->size() * fInsertion.length();
        lastBoundary = boundaries->lastElementi();

        // we do this from the end backwards, so that we don't have to keep updating.
        while (boundaries->size() > 0) {
            boundary = boundaries->popi();
            text.handleReplaceBetween(boundary, boundary, fInsertion);
        }
    }

    // Now fix up the return values
    offsets.contextLimit += delta;
    offsets.limit += delta;
    // NOTE(review): in incremental mode with no boundary found, lastBoundary
    // and delta are both 0, so start is set to 0 -- confirm this is intended.
    offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;

    // TODO:  do something with U_FAILURE(status);
    //        (need to look at transliterators overall, not just here.)
}

//
//  getInsertion()   Returns a reference to the string that
//                   handleTransliterate() inserts at each selected boundary.
//
const UnicodeString &BreakTransliterator::getInsertion() const {
    return this->fInsertion;
}

//
//  setInsertion()
//
void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
    this->fInsertion = insertion;
}

//
//  getBreakIterator     Lazily create the break iterator if it does
//                       not already exist.  Copied from Java, probably
//                       better to just create it in the constructor.
//
//                       Returns the cached word break iterator, or NULL
//                       if it could not be created.
//
BreakIterator *BreakTransliterator::getBreakIterator() {
    if (bi == NULL) {
        UErrorCode status = U_ZERO_ERROR;
        // Note:  Thai breaking behavior is universal, it is not
        //        tied to the Thai locale.
        bi = BreakIterator::createWordInstance(Locale::getEnglish(), status);
        if (U_FAILURE(status)) {
            // Do not cache anything returned alongside a failure status;
            // leaving bi NULL gives callers a single failure signal and
            // lets a later call retry the creation.
            delete bi;
            bi = NULL;
        }
    }
    return bi;
}

//
//   replaceableAsString   Hack to let break iterators work
//                         on the replaceable text from transliterators.
//                         Returns a UnicodeString copy of the full contents
//                         of 'r'.  When 'r' is itself a UnicodeString it is
//                         copied directly; otherwise the contents are
//                         extracted through the Replaceable API.
//
UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
    const UnicodeString *asString = dynamic_cast<UnicodeString *>(&r);
    if (asString != NULL) {
        // Common case: the Replaceable is already a UnicodeString.
        return *asString;
    }

    // Generic case: pull the whole range [0, length) out of the Replaceable.
    UnicodeString extracted;
    r.extractBetween(0, r.length(), extracted);
    return extracted;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */
Commit	Line	Data
46f4442e A	1	/*
46f4442e A	2	**********************************************************************
729e4ab9	3	* Copyright (C) 2008-2010, International Business Machines
46f4442e A	4	* Corporation and others. All Rights Reserved.
	5	**********************************************************************
	6	* Date Name Description
	7	* 05/11/2008 Andy Heninger Port from Java
	8	**********************************************************************
	9	*/
	10
	11	#include "unicode/utypes.h"
	12
	13	#if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
	14
	15	#include "unicode/unifilt.h"
	16	#include "unicode/uchar.h"
	17	#include "unicode/uniset.h"
	18	#include "unicode/brkiter.h"
	19	#include "brktrans.h"
	20	#include "unicode/uchar.h"
	21	#include "cmemory.h"
	22	#include "uprops.h"
	23	#include "uinvchar.h"
	24	#include "util.h"
	25	#include "uvectr32.h"
	26
	27	U_NAMESPACE_BEGIN
	28
	29	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
	30
	31	static const UChar SPACE = 32; // ' '
	32
	33
	34	/**
	35	* Constructs a transliterator with the default delimiters '{' and
	36	* '}'.
	37	*/
	38	BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
	39	Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
	40	fInsertion(SPACE) {
	41	bi = NULL;
	42	UErrorCode status = U_ZERO_ERROR;
	43	boundaries = new UVector32(status);
	44	}
	45
	46
	47	/**
	48	* Destructor.
	49	*/
	50	BreakTransliterator::~BreakTransliterator() {
	51	delete bi;
	52	bi = NULL;
	53	delete boundaries;
	54	boundaries = NULL;
	55	}
	56
	57	/**
	58	* Copy constructor.
	59	*/
	60	BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
	61	Transliterator(o) {
	62	bi = NULL;
	63	if (o.bi != NULL) {
	64	bi = o.bi->clone();
	65	}
	66	fInsertion = o.fInsertion;
	67	UErrorCode status = U_ZERO_ERROR;
68	boundaries = new UVector32(status);
69	}
70
71
72	/**
73	* Transliterator API.
74	*/
75	Transliterator* BreakTransliterator::clone(void) const {
76	return new BreakTransliterator(*this);
77	}
78
79	/**
80	* Implements {@link Transliterator#handleTransliterate}.
81	*/
82	void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
83	UBool isIncremental ) const {
84
85	UErrorCode status = U_ZERO_ERROR;
86	boundaries->removeAllElements();
	87	BreakTransliterator *nonConstThis = (BreakTransliterator *)this;
88	nonConstThis->getBreakIterator(); // Lazy-create it if necessary
89	UnicodeString sText = replaceableAsString(text);
90	bi->setText(sText);
91	bi->preceding(offsets.start);
92
93	// To make things much easier, we will stack the boundaries, and then insert at the end.
94	// generally, we won't need too many, since we will be filtered.
95
96	int32_t boundary;
97	for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
98	if (boundary == 0) continue;
99	// HACK: Check to see that preceeding item was a letter
100
101	UChar32 cp = sText.char32At(boundary-1);
102	int type = u_charType(cp);
103	//System.out.println(Integer.toString(cp,16) + " (before): " + type);
	104	if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
105
106	cp = sText.char32At(boundary);
107	type = u_charType(cp);
108	//System.out.println(Integer.toString(cp,16) + " (after): " + type);
	109	if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
110
111	boundaries->addElement(boundary, status);
729e4ab9	112	// printf("Boundary at %d\n", boundary);
46f4442e A	113	}
	114
	115	int delta = 0;
	116	int lastBoundary = 0;
	117
	118	if (boundaries->size() != 0) { // if we found something, adjust
	119	delta = boundaries->size() * fInsertion.length();
	120	lastBoundary = boundaries->lastElementi();
	121
	122	// we do this from the end backwards, so that we don't have to keep updating.
	123
	124	while (boundaries->size() > 0) {
	125	boundary = boundaries->popi();
	126	text.handleReplaceBetween(boundary, boundary, fInsertion);
	127	}
	128	}
	129
	130	// Now fix up the return values
	131	offsets.contextLimit += delta;
	132	offsets.limit += delta;
	133	offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
	134
	135	// TODO: do something with U_FAILURE(status);
	136	// (need to look at transliterators overall, not just here.)
	137	}
	138
	139	//
	140	// getInsertion()
	141	//
	142	const UnicodeString &BreakTransliterator::getInsertion() const {
	143	return fInsertion;
	144	}
	145
	146	//
	147	// setInsertion()
	148	//
729e4ab9	149	void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
46f4442e A	150	this->fInsertion = insertion;
	151	}
	152
	153	//
	154	// getBreakIterator Lazily create the break iterator if it does
	155	// not already exist. Copied from Java, probably
	156	// better to just create it in the constructor.
	157	//
	158	BreakIterator *BreakTransliterator::getBreakIterator() {
	159	UErrorCode status = U_ZERO_ERROR;
	160	if (bi == NULL) {
	161	// Note: Thai breaking behavior is universal, it is not
	162	// tied to the Thai locale.
	163	bi = BreakIterator::createWordInstance(Locale::getEnglish(), status);
	164	}
	165	return bi;
	166	}
	167
	168	//
	169	// replaceableAsString Hack to let break iterators work
	170	// on the replaceable text from transliterators.
	171	// In practice, the only real Replaceable type that we
	172	// will be seeing is UnicodeString, so this function
	173	// will normally be efficient.
	174	//
	175	UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
46f4442e	176	UnicodeString s;
729e4ab9 A	177	UnicodeString *rs = dynamic_cast<UnicodeString *>(&r);
	178	if (rs != NULL) {
	179	s = *rs;
	180	} else {
	181	r.extractBetween(0, r.length(), s);
	182	}
46f4442e A	183	return s;
	184	}
	185
	186	U_NAMESPACE_END
	187
	188	#endif /* #if !UCONFIG_NO_TRANSLITERATION */