[apple/icu.git] / icuSources / common / brkeng.cpp

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 ************************************************************************************
 * Copyright (C) 2006-2016, International Business Machines Corporation
 * and others. All Rights Reserved.
 ************************************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "brkeng.h"
#include "cmemory.h"
#include "dictbe.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/chariter.h"
#include "unicode/ures.h"
#include "unicode/udata.h"
#include "unicode/putil.h"
#include "unicode/ustring.h"
#include "unicode/uscript.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "charstr.h"
#include "dictionarydata.h"
#include "mutex.h"
#include "uvector.h"
#include "umutex.h"
#include "uresimp.h"
#include "ubrkimpl.h"

U_NAMESPACE_BEGIN

/*
 ******************************************************************
 */

// Default constructor: performs no work of its own.
LanguageBreakEngine::LanguageBreakEngine() {}

// Destructor: nothing is released here.
LanguageBreakEngine::~LanguageBreakEngine() {}

/*
 ******************************************************************
 */

// Default constructor: performs no work of its own.
LanguageBreakFactory::LanguageBreakFactory() {}

// Destructor: nothing is released here.
LanguageBreakFactory::~LanguageBreakFactory() {}

/*
 ******************************************************************
 */

// Construct an engine with no handled-character set for any break type.
// The status argument is accepted for interface compatibility but is not used.
UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
    const int32_t slotCount = UPRV_LENGTHOF(fHandled);
    for (int32_t slot = 0; slot != slotCount; ++slot) {
        fHandled[slot] = NULL;
    }
}

// Release every per-break-type set that was allocated by handleCharacter().
UnhandledEngine::~UnhandledEngine() {
    const int32_t slotCount = UPRV_LENGTHOF(fHandled);
    for (int32_t slot = 0; slot != slotCount; ++slot) {
        // Slots that were never populated hold a null pointer; deleting
        // a null pointer is a no-op, so no explicit test is needed.
        delete fHandled[slot];
    }
}

// Report whether character c has been registered (via handleCharacter) for
// the given break type. Out-of-range break types and break types with no
// set allocated yet are never handled.
UBool
UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
    if (breakType < 0 || breakType >= UPRV_LENGTHOF(fHandled)) {
        return FALSE;
    }
    if (fHandled[breakType] == NULL) {
        return FALSE;
    }
    return fHandled[breakType]->contains(c);
}

// Skip over a run of characters that this engine has registered for
// breakType, without producing any break positions.
//
//   text       - iterator positioned at the first character to examine; it is
//                advanced (or moved backward when reverse is true) past the run.
//   startPos   - lower bound (native index) for backward scanning.
//   endPos     - upper bound (native index) for forward scanning.
//   reverse    - TRUE to scan backward toward startPos, FALSE to scan forward
//                toward endPos.
//   breakType  - index selecting which handled-character set to consult.
//   (unnamed)  - break stack; never written by this engine.
//
// Always returns 0 (no breaks found). An out-of-range breakType, or a
// breakType for which no character has been registered yet, leaves the
// iterator untouched.
int32_t
UnhandledEngine::findBreaks( UText *text,
                                 int32_t startPos,
                                 int32_t endPos,
                                 UBool reverse,
                                 int32_t breakType,
                                 UStack &/*foundBreaks*/ ) const {
    if (breakType < 0 || breakType >= UPRV_LENGTHOF(fHandled)) {
        return 0;
    }
    // handles() tolerates an unallocated slot; do the same here rather than
    // dereferencing a null set.
    if (fHandled[breakType] == NULL) {
        return 0;
    }

    UChar32 c = utext_current32(text); 
    if (reverse) {
        while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
            c = utext_previous32(text);
        }
    }
    else {
        while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
            utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
            c = utext_current32(text);
        }
    }
    return 0;
}

// Register character c as handled for the given break type. The set for
// that break type is allocated on first use; if c is not already in it, the
// set is updated from the Script property value of c. Out-of-range break
// types and allocation failure are silently ignored.
void
UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
    if (breakType < 0 || breakType >= UPRV_LENGTHOF(fHandled)) {
        return;
    }

    // Lazily create the set for this break type.
    if (fHandled[breakType] == NULL) {
        fHandled[breakType] = new UnicodeSet();
        if (fHandled[breakType] == NULL) {
            return;
        }
    }

    UnicodeSet *handledSet = fHandled[breakType];
    if (handledSet->contains(c)) {
        return;
    }

    // Apply the entire script of the character.
    // NOTE(review): applyIntPropertyValue looks like it replaces the set's
    // contents rather than adding to them - confirm against the UnicodeSet API.
    UErrorCode status = U_ZERO_ERROR;
    const int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
    handledSet->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
}

/*
 ******************************************************************
 */

// Construct a factory with no engine cache; the cache is created on the
// first call to getEngineFor(). The status argument is not used.
ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
    fEngines = NULL;
}

// Destroy the engine cache, if one was ever created.
ICULanguageBreakFactory::~ICULanguageBreakFactory() {
    // Deleting a null pointer is a no-op, so no explicit test is needed.
    delete fEngines;
}

U_NAMESPACE_END
U_CDECL_BEGIN
// Element deleter handed to the engine stack: destroys one cached
// LanguageBreakEngine that was stored as an untyped pointer.
static void U_CALLCONV _deleteEngine(void *obj) {
    const icu::LanguageBreakEngine *engine =
        static_cast<const icu::LanguageBreakEngine *>(obj);
    delete engine;
}
U_CDECL_END
U_NAMESPACE_BEGIN

// Lock taken by ICULanguageBreakFactory::getEngineFor() before it reads or
// modifies fEngines. It is a file-scope object, so the same lock is used by
// every factory instance.
static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER;

// Return a break engine able to handle character c for the given break type,
// or NULL if none can be found or created.
//
// Previously created engines are cached in fEngines and searched from the
// most recently added to the oldest. On a miss, loadEngineFor() is asked to
// build a new engine, which is then pushed onto the cache. The whole
// operation runs with gBreakEngineMutex locked.
const LanguageBreakEngine *
ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
    UErrorCode  status = U_ZERO_ERROR;

    Mutex lock(&gBreakEngineMutex);

    if (fEngines != NULL) {
        // Search the cache, newest entry first.
        for (int32_t idx = fEngines->size() - 1; idx >= 0; --idx) {
            const LanguageBreakEngine *cached =
                static_cast<const LanguageBreakEngine *>(fEngines->elementAt(idx));
            if (cached != NULL && cached->handles(c, breakType)) {
                return cached;
            }
        }
    } else {
        // First use: create the cache itself.
        UStack *newStack = new UStack(_deleteEngine, NULL, status);
        if (newStack == NULL || U_FAILURE(status)) {
            // Note: no way to return error code to caller.
            delete newStack;
            return NULL;
        }
        fEngines = newStack;
    }

    // We didn't find an engine. Create one.
    const LanguageBreakEngine *created = loadEngineFor(c, breakType);
    if (created != NULL) {
        // NOTE(review): the status set by push() is not inspected, so a failed
        // push still returns the engine to the caller - confirm who owns it then.
        fEngines->push((void *)created, status);
    }
    return created;
}

// Build a dictionary-based break engine for the script of character c.
//
// Returns NULL when the script cannot be determined, when no dictionary
// matcher can be loaded for it, when the script has no engine class in the
// switch below, or when engine construction reports a failure.
const LanguageBreakEngine *
ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
    UErrorCode status = U_ZERO_ERROR;
    UScriptCode code = uscript_getScript(c, &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    DictionaryMatcher *matcher = loadDictionaryMatcherFor(code, breakType);
    if (matcher == NULL) {
        return NULL;
    }

    const LanguageBreakEngine *engine = NULL;
    switch(code) {
    case USCRIPT_THAI:
        engine = new ThaiBreakEngine(matcher, status);
        break;
    case USCRIPT_LAO:
        engine = new LaoBreakEngine(matcher, status);
        break;
    case USCRIPT_MYANMAR:
        engine = new BurmeseBreakEngine(matcher, status);
        break;
    case USCRIPT_KHMER:
        engine = new KhmerBreakEngine(matcher, status);
        break;

#if !UCONFIG_NO_NORMALIZATION
        // CJK not available w/o normalization
    case USCRIPT_HANGUL:
        engine = new CjkBreakEngine(matcher, kKorean, status);
        break;

    // use same BreakEngine and dictionary for both Chinese and Japanese
    case USCRIPT_HIRAGANA:
    case USCRIPT_KATAKANA:
    case USCRIPT_HAN:
        engine = new CjkBreakEngine(matcher, kChineseJapanese, status);
        break;
#if 0
    // TODO: Have to get some characters with script=common handled
    // by CjkBreakEngine (e.g. U+309B). Simply subjecting
    // them to CjkBreakEngine does not work. The engine has to
    // special-case them.
    case USCRIPT_COMMON:
    {
        UBlockCode block = ublock_getCode(code);
        if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
           engine = new CjkBreakEngine(dict, kChineseJapanese, status);
        break;
    }
#endif
#endif

    default:
        break;
    }

    if (engine == NULL) {
        // No engine was created (unsupported script or allocation failure),
        // so the matcher is still ours to dispose of.
        delete matcher;
        return NULL;
    }
    if (U_FAILURE(status)) {
        // Only the engine is deleted here; presumably it has taken ownership
        // of the matcher - confirm against the engine classes in dictbe.h.
        delete engine;
        return NULL;
    }
    return engine;
}

// Load the dictionary data for a script and wrap it in a DictionaryMatcher.
//
// The dictionary file name is looked up under "dictionaries" in the root
// bundle of the break-iterator data tree, keyed by the script's short name.
// The name is split at its last '.' into item name and type, the data item
// is opened, and a bytes- or UChars-trie matcher is created according to the
// trie type recorded in the data's index array.
//
// Returns NULL if the resource lookup fails, the data item cannot be opened,
// the trie type is not one of the two recognised kinds, or allocation fails.
// The brkType argument is unused.
DictionaryMatcher *
ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) { 
    UErrorCode status = U_ZERO_ERROR;

    // open root from brkitr tree.
    UResourceBundle *bundle = ures_open(U_ICUDATA_BRKITR, "", &status);
    bundle = ures_getByKeyWithFallback(bundle, "dictionaries", bundle, &status);
    int32_t nameLength = 0;
    const UChar *fileName =
        ures_getStringByKeyWithFallback(bundle, uscript_getShortName(script), &nameLength, &status);
    if (U_FAILURE(status)) {
        ures_close(bundle);
        return NULL;
    }

    // Split "name.ext" at the last dot; without a dot the type stays empty.
    CharString itemName;
    CharString itemType;
    const UChar *lastDot = u_memrchr(fileName, 0x002e, nameLength);
    if (lastDot != NULL) {
        const int32_t baseLength = (int32_t)(lastDot - fileName);
        itemType.appendInvariantChars(
            UnicodeString(FALSE, lastDot + 1, nameLength - baseLength - 1), status);
        nameLength = baseLength;
    }
    itemName.appendInvariantChars(UnicodeString(FALSE, fileName, nameLength), status);
    ures_close(bundle);

    UDataMemory *file = udata_open(U_ICUDATA_BRKITR, itemType.data(), itemName.data(), &status);
    if (U_FAILURE(status)) {
        // we don't have a dictionary matcher.
        // returning NULL here will cause us to fail to find a dictionary break engine, as expected
        return NULL;
    }

    // build trie
    const uint8_t *data = static_cast<const uint8_t *>(udata_getMemory(file));
    const int32_t *indexes = reinterpret_cast<const int32_t *>(data);
    const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
    const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;

    DictionaryMatcher *matcher = NULL;
    if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
        const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
        const char *characters = reinterpret_cast<const char *>(data + offset);
        matcher = new BytesDictionaryMatcher(characters, transform, file);
    }
    else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
        const UChar *characters = reinterpret_cast<const UChar *>(data + offset);
        matcher = new UCharsDictionaryMatcher(characters, file);
    }

    if (matcher == NULL) {
        // no matcher exists to take ownership - either we are an invalid 
        // type or memory allocation failed
        udata_close(file);
    }
    return matcher;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
4388f060	3	/*
46f4442e	4	************************************************************************************
2ca993e8	5	* Copyright (C) 2006-2016, International Business Machines Corporation
4388f060	6	* and others. All Rights Reserved.
46f4442e	7	************************************************************************************
73c04bcf A	8	*/
	9
	10	#include "unicode/utypes.h"
	11
	12	#if !UCONFIG_NO_BREAK_ITERATION
	13
	14	#include "brkeng.h"
2ca993e8	15	#include "cmemory.h"
73c04bcf	16	#include "dictbe.h"
73c04bcf A	17	#include "unicode/uchar.h"
	18	#include "unicode/uniset.h"
	19	#include "unicode/chariter.h"
	20	#include "unicode/ures.h"
	21	#include "unicode/udata.h"
	22	#include "unicode/putil.h"
	23	#include "unicode/ustring.h"
	24	#include "unicode/uscript.h"
51004dcb A	25	#include "unicode/ucharstrie.h"
	26	#include "unicode/bytestrie.h"
	27	#include "charstr.h"
	28	#include "dictionarydata.h"
2ca993e8	29	#include "mutex.h"
73c04bcf	30	#include "uvector.h"
46f4442e	31	#include "umutex.h"
73c04bcf A	32	#include "uresimp.h"
	33	#include "ubrkimpl.h"
	34
	35	U_NAMESPACE_BEGIN
	36
	37	/*
	38	******************************************************************
	39	*/
	40
	41	LanguageBreakEngine::LanguageBreakEngine() {
	42	}
	43
	44	LanguageBreakEngine::~LanguageBreakEngine() {
	45	}
	46
	47	/*
	48	******************************************************************
	49	*/
	50
	51	LanguageBreakFactory::LanguageBreakFactory() {
	52	}
	53
	54	LanguageBreakFactory::~LanguageBreakFactory() {
	55	}
	56
	57	/*
	58	******************************************************************
	59	*/
	60
	61	UnhandledEngine::UnhandledEngine(UErrorCode &/status/) {
2ca993e8	62	for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) {
73c04bcf A	63	fHandled[i] = 0;
	64	}
	65	}
	66
	67	UnhandledEngine::~UnhandledEngine() {
2ca993e8	68	for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) {
73c04bcf A	69	if (fHandled[i] != 0) {
	70	delete fHandled[i];
	71	}
	72	}
	73	}
	74
	75	UBool
	76	UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
2ca993e8	77	return (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)
73c04bcf A	78	&& fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
	79	}
	80
	81	int32_t
	82	UnhandledEngine::findBreaks( UText *text,
	83	int32_t startPos,
	84	int32_t endPos,
	85	UBool reverse,
	86	int32_t breakType,
	87	UStack &/foundBreaks/ ) const {
2ca993e8	88	if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
73c04bcf A	89	UChar32 c = utext_current32(text);
	90	if (reverse) {
	91	while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
	92	c = utext_previous32(text);
	93	}
	94	}
	95	else {
	96	while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
	97	utext_next32(text); // TODO: recast loop to work with post-increment operations.
	98	c = utext_current32(text);
	99	}
	100	}
	101	}
	102	return 0;
	103	}
	104
	105	void
	106	UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
2ca993e8	107	if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
73c04bcf A	108	if (fHandled[breakType] == 0) {
	109	fHandled[breakType] = new UnicodeSet();
	110	if (fHandled[breakType] == 0) {
	111	return;
	112	}
	113	}
	114	if (!fHandled[breakType]->contains(c)) {
	115	UErrorCode status = U_ZERO_ERROR;
	116	// Apply the entire script of the character.
	117	int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
	118	fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
	119	}
	120	}
	121	}
	122
	123	/*
	124	******************************************************************
	125	*/
	126
	127	ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/status/) {
	128	fEngines = 0;
	129	}
	130
	131	ICULanguageBreakFactory::~ICULanguageBreakFactory() {
	132	if (fEngines != 0) {
	133	delete fEngines;
	134	}
	135	}
	136
	137	U_NAMESPACE_END
	138	U_CDECL_BEGIN
	139	static void U_CALLCONV _deleteEngine(void *obj) {
4388f060	140	delete (const icu::LanguageBreakEngine *) obj;
73c04bcf A	141	}
	142	U_CDECL_END
	143	U_NAMESPACE_BEGIN
	144
2ca993e8 A	145	static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER;
2ca993e8 A	146
73c04bcf A	147	const LanguageBreakEngine *
73c04bcf A	148	ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
73c04bcf A	149	const LanguageBreakEngine *lbe = NULL;
	150	UErrorCode status = U_ZERO_ERROR;
	151
2ca993e8 A	152	Mutex m(&gBreakEngineMutex);
	153
	154	if (fEngines == NULL) {
	155	UStack *engines = new UStack(_deleteEngine, NULL, status);
	156	if (U_FAILURE(status) \|\| engines == NULL) {
	157	// Note: no way to return error code to caller.
	158	delete engines;
	159	return NULL;
	160	}
	161	fEngines = engines;
	162	} else {
	163	int32_t i = fEngines->size();
73c04bcf A	164	while (--i >= 0) {
	165	lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
	166	if (lbe != NULL && lbe->handles(c, breakType)) {
2ca993e8	167	return lbe;
73c04bcf	168	}
73c04bcf A	169	}
73c04bcf A	170	}
73c04bcf	171
2ca993e8 A	172	// We didn't find an engine. Create one.
2ca993e8 A	173	lbe = loadEngineFor(c, breakType);
73c04bcf	174	if (lbe != NULL) {
2ca993e8	175	fEngines->push((void *)lbe, status);
73c04bcf	176	}
73c04bcf A	177	return lbe;
	178	}
	179
	180	const LanguageBreakEngine *
	181	ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
	182	UErrorCode status = U_ZERO_ERROR;
	183	UScriptCode code = uscript_getScript(c, &status);
	184	if (U_SUCCESS(status)) {
51004dcb A	185	DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
51004dcb A	186	if (m != NULL) {
73c04bcf A	187	const LanguageBreakEngine *engine = NULL;
	188	switch(code) {
	189	case USCRIPT_THAI:
51004dcb	190	engine = new ThaiBreakEngine(m, status);
73c04bcf	191	break;
57a6839d A	192	case USCRIPT_LAO:
	193	engine = new LaoBreakEngine(m, status);
	194	break;
b331163b A	195	case USCRIPT_MYANMAR:
	196	engine = new BurmeseBreakEngine(m, status);
	197	break;
4388f060	198	case USCRIPT_KHMER:
51004dcb	199	engine = new KhmerBreakEngine(m, status);
4388f060	200	break;
51004dcb A	201
	202	#if !UCONFIG_NO_NORMALIZATION
	203	// CJK not available w/o normalization
	204	case USCRIPT_HANGUL:
	205	engine = new CjkBreakEngine(m, kKorean, status);
	206	break;
	207
	208	// use same BreakEngine and dictionary for both Chinese and Japanese
	209	case USCRIPT_HIRAGANA:
	210	case USCRIPT_KATAKANA:
	211	case USCRIPT_HAN:
	212	engine = new CjkBreakEngine(m, kChineseJapanese, status);
	213	break;
	214	#if 0
	215	// TODO: Have to get some characters with script=common handled
	216	// by CjkBreakEngine (e.g. U+309B). Simply subjecting
	217	// them to CjkBreakEngine does not work. The engine has to
	218	// special-case them.
	219	case USCRIPT_COMMON:
	220	{
	221	UBlockCode block = ublock_getCode(code);
	222	if (block == UBLOCK_HIRAGANA \|\| block == UBLOCK_KATAKANA)
	223	engine = new CjkBreakEngine(dict, kChineseJapanese, status);
	224	break;
	225	}
	226	#endif
	227	#endif
	228
73c04bcf A	229	default:
	230	break;
	231	}
	232	if (engine == NULL) {
51004dcb	233	delete m;
73c04bcf A	234	}
	235	else if (U_FAILURE(status)) {
	236	delete engine;
	237	engine = NULL;
	238	}
	239	return engine;
	240	}
	241	}
	242	return NULL;
	243	}
	244
51004dcb A	245	DictionaryMatcher *
51004dcb A	246	ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
73c04bcf	247	UErrorCode status = U_ZERO_ERROR;
51004dcb	248	// open root from brkitr tree.
73c04bcf A	249	UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
73c04bcf A	250	b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
73c04bcf	251	int32_t dictnlength = 0;
51004dcb A	252	const UChar *dictfname =
	253	ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
	254	if (U_FAILURE(status)) {
	255	ures_close(b);
	256	return NULL;
73c04bcf	257	}
51004dcb A	258	CharString dictnbuf;
	259	CharString ext;
	260	const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot
	261	if (extStart != NULL) {
	262	int32_t len = (int32_t)(extStart - dictfname);
	263	ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
	264	dictnlength = len;
73c04bcf	265	}
51004dcb	266	dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
73c04bcf	267	ures_close(b);
51004dcb A	268
51004dcb A	269	UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
73c04bcf	270	if (U_SUCCESS(status)) {
51004dcb A	271	// build trie
	272	const uint8_t data = (const uint8_t )udata_getMemory(file);
	273	const int32_t indexes = (const int32_t )data;
	274	const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
	275	const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
	276	DictionaryMatcher *m = NULL;
	277	if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
	278	const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
	279	const char characters = (const char )(data + offset);
	280	m = new BytesDictionaryMatcher(characters, transform, file);
	281	}
	282	else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
	283	const UChar characters = (const UChar )(data + offset);
	284	m = new UCharsDictionaryMatcher(characters, file);
73c04bcf	285	}
51004dcb A	286	if (m == NULL) {
	287	// no matcher exists to take ownership - either we are an invalid
	288	// type or memory allocation failed
	289	udata_close(file);
73c04bcf	290	}
51004dcb A	291	return m;
	292	} else if (dictfname != NULL) {
	293	// we don't have a dictionary matcher.
	294	// returning NULL here will cause us to fail to find a dictionary break engine, as expected
	295	status = U_ZERO_ERROR;
	296	return NULL;
73c04bcf A	297	}
	298	return NULL;
	299	}
	300
	301	U_NAMESPACE_END
	302
	303	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */