[apple/icu.git] / icuSources / i18n / brktrans.cpp

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2008-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   05/11/2008  Andy Heninger  Port from Java
**********************************************************************
*/

#include "unicode/utypes.h"

#if  !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION

#include "unicode/brkiter.h"
#include "unicode/localpointer.h"
#include "unicode/uchar.h"
#include "unicode/unifilt.h"
#include "unicode/uniset.h"

#include "brktrans.h"
#include "cmemory.h"
#include "mutex.h"
#include "uprops.h"
#include "uinvchar.h"
#include "util.h"
#include "uvectr32.h"

U_NAMESPACE_BEGIN

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)

static const UChar SPACE = 0x0020;  // U+0020 SPACE


/**
 * Constructs a break transliterator with the ID "Any-BreakInternal".
 * The insertion string starts out as a single space, and both cache
 * slots (break iterator and boundaries vector) start out null.
 *
 * @param adoptedFilter  filter passed straight through to the Transliterator
 *                       base-class constructor (the name suggests ownership
 *                       is transferred -- confirm against Transliterator).
 */
BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter)
    : Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
      cachedBI(NULL),
      cachedBoundaries(NULL),
      fInsertion(SPACE) {
}


/**
 * Destructor. The body is intentionally empty: there is nothing to release
 * by hand here, the members are torn down by their own destructors.
 */
BreakTransliterator::~BreakTransliterator() {}

/**
 * Copy constructor.
 * Copies the Transliterator base state and the insertion string from the
 * source object. The cached break iterator and boundaries vector are NOT
 * copied; the new object starts with both cache slots null.
 *
 * @param o  the object to copy from
 */
BreakTransliterator::BreakTransliterator(const BreakTransliterator& o)
    : Transliterator(o),
      cachedBI(NULL),
      cachedBoundaries(NULL),
      fInsertion(o.fInsertion) {
}


/**
 * Transliterator API.
 * Creates a heap-allocated copy of this object via the copy constructor.
 *
 * @return the newly allocated copy (NULL if operator new returns NULL);
 *         the caller receives the raw pointer.
 */
Transliterator* BreakTransliterator::clone(void) const {
    BreakTransliterator *copy = new BreakTransliterator(*this);
    return copy;
}

/**
 * Implements {@link Transliterator#handleTransliterate}.
 *
 * Walks the word-break boundaries (English word break iterator) that lie
 * before offsets.limit, and inserts the insertion string at every boundary
 * that has a letter or mark (general category L* or M*) on both sides.
 * Afterwards offsets.contextLimit, offsets.limit and offsets.start are
 * adjusted for the inserted text.
 *
 * If the break iterator or the boundaries vector cannot be obtained, or an
 * error status is set while creating them, the function returns without
 * touching text or offsets.
 *
 * @param text           the text to modify in place
 * @param offsets        in/out positions; start and limit bound the scan
 * @param isIncremental  selects how offsets.start is set on return (see below)
 */
void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                              UBool isIncremental) const {
    UErrorCode errorCode = U_ZERO_ERROR;
    LocalPointer<BreakIterator> wordBreaker;
    LocalPointer<UVector32> pending;

    // Take the cached helper objects (if any) out of this object while
    // holding a Mutex guard, so they are exclusively ours for this call.
    {
        Mutex guard;
        BreakTransliterator *self = const_cast<BreakTransliterator *>(this);
        pending.moveFrom(self->cachedBoundaries);
        wordBreaker.moveFrom(self->cachedBI);
    }

    // Nothing cached: create fresh helpers.
    if (wordBreaker.isNull()) {
        wordBreaker.adoptInstead(
            BreakIterator::createWordInstance(Locale::getEnglish(), errorCode));
    }
    if (pending.isNull()) {
        pending.adoptInstead(new UVector32(errorCode));
    }

    if (U_FAILURE(errorCode) || wordBreaker.isNull() || pending.isNull()) {
        return;
    }

    pending->removeAllElements();
    UnicodeString snapshot = replaceableAsString(text);
    wordBreaker->setText(snapshot);
    wordBreaker->preceding(offsets.start);

    // The qualifying boundaries are stacked first and the insertions are done
    // afterwards. Few are expected, since the input has already been filtered.

    // Union of the "letter" and "mark" general-category masks.
    const uint32_t letterOrMark = U_GC_L_MASK | U_GC_M_MASK;

    for (int32_t pos = wordBreaker->next();
         pos != UBRK_DONE && pos < offsets.limit;
         pos = wordBreaker->next()) {
        // HACK: only keep a boundary that is not at index 0 and whose
        // neighbouring code points (before and after) are both letters/marks.
        // The && chain short-circuits, so the "after" code point is only
        // examined when the "before" one qualified.
        if (pos != 0 &&
            (U_MASK(u_charType(snapshot.char32At(pos - 1))) & letterOrMark) != 0 &&
            (U_MASK(u_charType(snapshot.char32At(pos))) & letterOrMark) != 0) {
            pending->addElement(pos, errorCode);
        }
    }

    int32_t growth = 0;     // total number of code units inserted
    int32_t resumeAt = 0;   // last (highest) recorded boundary, pre-insertion index

    if (pending->size() != 0) {   // something was found: apply the insertions
        growth = pending->size() * fInsertion.length();
        resumeAt = pending->lastElementi();

        // Pop from the top of the stack, i.e. work from the end of the text
        // backwards, so earlier indexes stay valid without re-adjustment.
        while (pending->size() > 0) {
            const int32_t at = pending->popi();
            text.handleReplaceBetween(at, at, fInsertion);
        }
    }

    // Fix up the returned positions.
    offsets.contextLimit += growth;
    offsets.limit += growth;
    // NOTE(review): when no boundary was recorded, resumeAt is still 0, so an
    // incremental call sets offsets.start to 0 -- confirm this is intended.
    if (isIncremental) {
        offsets.start = resumeAt + growth;
    } else {
        offsets.start = offsets.limit;
    }

    // Hand the break iterator & boundaries vector back to the cache, but only
    // into slots that are still empty. Anything not handed back is released
    // when the LocalPointers go out of scope at function exit.
    {
        Mutex guard;
        BreakTransliterator *self = const_cast<BreakTransliterator *>(this);
        if (self->cachedBI.isNull()) {
            self->cachedBI.moveFrom(wordBreaker);
        }
        if (self->cachedBoundaries.isNull()) {
            self->cachedBoundaries.moveFrom(pending);
        }
    }

    // TODO:  do something with U_FAILURE(errorCode);
    //        (need to look at transliterators overall, not just here.)
}

//
//  getInsertion()   Returns a reference to the string that
//                   handleTransliterate() inserts at each selected boundary.
//
const UnicodeString &BreakTransliterator::getInsertion() const {
    return this->fInsertion;
}

//
//  setInsertion()
//
void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
    this->fInsertion = insertion;
}

//
//   replaceableAsString   Hack that lets break iterators operate on the
//                         Replaceable text handed to transliterators, by
//                         returning its contents as a UnicodeString.
//                         If r is really a UnicodeString it is copied
//                         directly; otherwise the range [0, r.length()) is
//                         pulled out with extractBetween().
//                         In practice UnicodeString is the only Replaceable
//                         type expected here, so the direct path is the
//                         normal one.
//
UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
    UnicodeString result;
    UnicodeString *asString = dynamic_cast<UnicodeString *>(&r);
    if (asString == NULL) {
        // Some other Replaceable implementation: extract everything.
        r.extractBetween(0, r.length(), result);
    } else {
        result = *asString;
    }
    return result;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */
Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
46f4442e A	3	/*
46f4442e A	4	**********************************************************************
2ca993e8	5	* Copyright (C) 2008-2015, International Business Machines
46f4442e A	6	* Corporation and others. All Rights Reserved.
	7	**********************************************************************
	8	* Date Name Description
	9	* 05/11/2008 Andy Heninger Port from Java
	10	**********************************************************************
	11	*/
	12
	13	#include "unicode/utypes.h"
	14
	15	#if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
	16
2ca993e8 A	17	#include "unicode/brkiter.h"
2ca993e8 A	18	#include "unicode/localpointer.h"
46f4442e	19	#include "unicode/uchar.h"
2ca993e8	20	#include "unicode/unifilt.h"
46f4442e	21	#include "unicode/uniset.h"
2ca993e8	22
46f4442e	23	#include "brktrans.h"
46f4442e	24	#include "cmemory.h"
2ca993e8	25	#include "mutex.h"
46f4442e A	26	#include "uprops.h"
	27	#include "uinvchar.h"
	28	#include "util.h"
	29	#include "uvectr32.h"
	30
	31	U_NAMESPACE_BEGIN
	32
	33	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
	34
	35	static const UChar SPACE = 32; // ' '
	36
	37
	38	/**
	39	* Constructs a transliterator with the default delimiters '{' and
	40	* '}'.
	41	*/
	42	BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
2ca993e8 A	43	Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
2ca993e8 A	44	cachedBI(NULL), cachedBoundaries(NULL), fInsertion(SPACE) {
46f4442e A	45	}
	46
	47
	48	/**
	49	* Destructor.
	50	*/
	51	BreakTransliterator::~BreakTransliterator() {
46f4442e A	52	}
	53
	54	/**
	55	* Copy constructor.
	56	*/
	57	BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
2ca993e8 A	58	Transliterator(o), cachedBI(NULL), cachedBoundaries(NULL), fInsertion(o.fInsertion) {
2ca993e8 A	59	}
46f4442e A	60
	61
	62	/**
	63	* Transliterator API.
	64	*/
	65	Transliterator* BreakTransliterator::clone(void) const {
	66	return new BreakTransliterator(*this);
	67	}
	68
	69	/**
	70	* Implements {@link Transliterator#handleTransliterate}.
	71	*/
	72	void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
	73	UBool isIncremental ) const {
	74
	75	UErrorCode status = U_ZERO_ERROR;
2ca993e8 A	76	LocalPointer<BreakIterator> bi;
	77	LocalPointer<UVector32> boundaries;
	78
	79	{
	80	Mutex m;
	81	BreakTransliterator nonConstThis = const_cast<BreakTransliterator >(this);
	82	boundaries.moveFrom(nonConstThis->cachedBoundaries);
	83	bi.moveFrom(nonConstThis->cachedBI);
	84	}
	85	if (bi.isNull()) {
	86	bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status));
	87	}
	88	if (boundaries.isNull()) {
	89	boundaries.adoptInstead(new UVector32(status));
	90	}
	91
	92	if (bi.isNull() \|\| boundaries.isNull() \|\| U_FAILURE(status)) {
	93	return;
	94	}
	95
46f4442e	96	boundaries->removeAllElements();
46f4442e A	97	UnicodeString sText = replaceableAsString(text);
	98	bi->setText(sText);
	99	bi->preceding(offsets.start);
	100
	101	// To make things much easier, we will stack the boundaries, and then insert at the end.
	102	// generally, we won't need too many, since we will be filtered.
	103
	104	int32_t boundary;
	105	for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
	106	if (boundary == 0) continue;
	107	// HACK: Check to see that preceeding item was a letter
	108
	109	UChar32 cp = sText.char32At(boundary-1);
	110	int type = u_charType(cp);
	111	//System.out.println(Integer.toString(cp,16) + " (before): " + type);
	112	if ((U_MASK(type) & (U_GC_L_MASK \| U_GC_M_MASK)) == 0) continue;
	113
	114	cp = sText.char32At(boundary);
	115	type = u_charType(cp);
	116	//System.out.println(Integer.toString(cp,16) + " (after): " + type);
	117	if ((U_MASK(type) & (U_GC_L_MASK \| U_GC_M_MASK)) == 0) continue;
	118
	119	boundaries->addElement(boundary, status);
729e4ab9	120	// printf("Boundary at %d\n", boundary);
46f4442e A	121	}
	122
	123	int delta = 0;
	124	int lastBoundary = 0;
	125
	126	if (boundaries->size() != 0) { // if we found something, adjust
	127	delta = boundaries->size() * fInsertion.length();
	128	lastBoundary = boundaries->lastElementi();
	129
	130	// we do this from the end backwards, so that we don't have to keep updating.
	131
	132	while (boundaries->size() > 0) {
	133	boundary = boundaries->popi();
	134	text.handleReplaceBetween(boundary, boundary, fInsertion);
	135	}
	136	}
	137
	138	// Now fix up the return values
	139	offsets.contextLimit += delta;
	140	offsets.limit += delta;
	141	offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
	142
2ca993e8 A	143	// Return break iterator & boundaries vector to the cache.
	144	{
	145	Mutex m;
	146	BreakTransliterator nonConstThis = const_cast<BreakTransliterator >(this);
	147	if (nonConstThis->cachedBI.isNull()) {
	148	nonConstThis->cachedBI.moveFrom(bi);
	149	}
	150	if (nonConstThis->cachedBoundaries.isNull()) {
	151	nonConstThis->cachedBoundaries.moveFrom(boundaries);
	152	}
	153	}
	154
46f4442e A	155	// TODO: do something with U_FAILURE(status);
	156	// (need to look at transliterators overall, not just here.)
	157	}
	158
	159	//
	160	// getInsertion()
	161	//
	162	const UnicodeString &BreakTransliterator::getInsertion() const {
	163	return fInsertion;
	164	}
	165
	166	//
	167	// setInsertion()
	168	//
729e4ab9	169	void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
46f4442e A	170	this->fInsertion = insertion;
	171	}
	172
46f4442e A	173	//
	174	// replaceableAsString Hack to let break iterators work
	175	// on the replaceable text from transliterators.
	176	// In practice, the only real Replaceable type that we
	177	// will be seeing is UnicodeString, so this function
	178	// will normally be efficient.
	179	//
	180	UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
46f4442e	181	UnicodeString s;
729e4ab9 A	182	UnicodeString rs = dynamic_cast<UnicodeString >(&r);
	183	if (rs != NULL) {
	184	s = *rs;
	185	} else {
	186	r.extractBetween(0, r.length(), s);
	187	}
46f4442e A	188	return s;
	189	}
	190
	191	U_NAMESPACE_END
	192
	193	#endif /* #if !UCONFIG_NO_TRANSLITERATION */