[apple/icu.git] / icuSources / common / brkeng.cpp

/*
 ************************************************************************************
 * Copyright (C) 2006-2016, International Business Machines Corporation
 * and others. All Rights Reserved.
 ************************************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "brkeng.h"
#include "cmemory.h"
#include "dictbe.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/chariter.h"
#include "unicode/ures.h"
#include "unicode/udata.h"
#include "unicode/putil.h"
#include "unicode/ustring.h"
#include "unicode/uscript.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "charstr.h"
#include "dictionarydata.h"
#include "mutex.h"
#include "uvector.h"
#include "umutex.h"
#include "uresimp.h"
#include "ubrkimpl.h"

U_NAMESPACE_BEGIN

/*
 ******************************************************************
 */

/**
 * Default constructor. The body performs no work.
 */
LanguageBreakEngine::LanguageBreakEngine() {}

/**
 * Destructor. The body performs no work.
 */
LanguageBreakEngine::~LanguageBreakEngine() {}

/*
 ******************************************************************
 */

/**
 * Default constructor. The body performs no work.
 */
LanguageBreakFactory::LanguageBreakFactory() {}

/**
 * Destructor. The body performs no work.
 */
LanguageBreakFactory::~LanguageBreakFactory() {}

/*
 ******************************************************************
 */

/**
 * Constructs an engine that handles no characters yet: every per-break-type
 * slot of fHandled is cleared to null.
 *
 * The status parameter is accepted but never read or written here.
 */
UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
    const int32_t slotCount = UPRV_LENGTHOF(fHandled);
    int32_t slot = 0;
    while (slot < slotCount) {
        fHandled[slot] = NULL;
        ++slot;
    }
}

/**
 * Releases every set that handleCharacter() allocated into fHandled.
 */
UnhandledEngine::~UnhandledEngine() {
    const int32_t slotCount = UPRV_LENGTHOF(fHandled);
    int32_t slot = 0;
    while (slot < slotCount) {
        // Deleting a null pointer is a no-op, so unused slots need no test.
        delete fHandled[slot];
        ++slot;
    }
}

/**
 * Reports whether this engine has registered the given character for the
 * given break type.
 *
 * @param c          The code point to test.
 * @param breakType  Index into fHandled; out-of-range values yield FALSE.
 * @return TRUE only if a set exists for breakType and it contains c.
 */
UBool
UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
    if (breakType < 0 || breakType >= UPRV_LENGTHOF(fHandled)) {
        return FALSE;
    }
    const UnicodeSet *handled = fHandled[breakType];
    return handled != NULL && handled->contains(c);
}

/**
 * Skips over a run of characters that this engine has registered, without
 * reporting any break positions.
 *
 * The iteration position of text is moved past the run (forwards, bounded by
 * endPos) or before it (backwards, bounded by startPos). Nothing is pushed
 * onto the foundBreaks stack.
 *
 * If breakType is out of range, or no character has ever been registered for
 * that break type (the fHandled slot is still null), the text is left
 * untouched. handles() applies the same null check; without it here the loop
 * conditions below would dereference a null set.
 *
 * @param text       The text to scan; its current index is the start point.
 * @param startPos   Lower bound for the native index when scanning in reverse.
 * @param endPos     Upper bound for the native index when scanning forwards.
 * @param reverse    TRUE to scan backwards, FALSE to scan forwards.
 * @param breakType  Index into fHandled selecting the set of handled characters.
 * @return Always 0.
 */
int32_t
UnhandledEngine::findBreaks( UText *text,
                                 int32_t startPos,
                                 int32_t endPos,
                                 UBool reverse,
                                 int32_t breakType,
                                 UStack &/*foundBreaks*/ ) const {
    if (breakType < 0 || breakType >= UPRV_LENGTHOF(fHandled)) {
        return 0;
    }
    const UnicodeSet *handled = fHandled[breakType];
    if (handled == NULL) {
        // Nothing has been registered for this break type, so there is no run to skip.
        return 0;
    }

    UChar32 c = utext_current32(text);
    if (reverse) {
        while((int32_t)utext_getNativeIndex(text) > startPos && handled->contains(c)) {
            c = utext_previous32(text);
        }
    }
    else {
        while((int32_t)utext_getNativeIndex(text) < endPos && handled->contains(c)) {
            utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
            c = utext_current32(text);
        }
    }
    return 0;
}

/**
 * Registers a character (and the rest of its script) as handled by this
 * engine for the given break type.
 *
 * The set for breakType is allocated on first use. If that allocation fails,
 * or breakType is out of range, the call silently does nothing. The error
 * code produced while applying the script property is not reported.
 *
 * @param c          The code point to register.
 * @param breakType  Index into fHandled.
 */
void
UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
    if (breakType < 0 || breakType >= UPRV_LENGTHOF(fHandled)) {
        return;
    }
    if (fHandled[breakType] == NULL) {
        fHandled[breakType] = new UnicodeSet();
    }
    UnicodeSet *handled = fHandled[breakType];
    if (handled == NULL || handled->contains(c)) {
        // Either the allocation failed or the character is already covered.
        return;
    }

    // Apply the entire script of the character.
    // NOTE(review): applyIntPropertyValue appears to replace the set's
    // contents rather than add to them - confirm that dropping previously
    // registered scripts is intended.
    UErrorCode status = U_ZERO_ERROR;
    const int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
    handled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
}

/*
 ******************************************************************
 */

/**
 * Constructs a factory with no engine stack; getEngineFor() creates the
 * stack on its first call.
 *
 * The status parameter is accepted but never read or written here.
 */
ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
    fEngines = NULL;
}

/**
 * Destroys the engine stack, if one was ever created.
 */
ICULanguageBreakFactory::~ICULanguageBreakFactory() {
    // Deleting a null pointer is a no-op, so no explicit test is required.
    delete fEngines;
}

U_NAMESPACE_END
U_CDECL_BEGIN
/* Element deleter handed to the UStack of engines: destroys one LanguageBreakEngine. */
static void U_CALLCONV _deleteEngine(void *obj) {
    const icu::LanguageBreakEngine *engine = static_cast<const icu::LanguageBreakEngine *>(obj);
    delete engine;
}
U_CDECL_END
U_NAMESPACE_BEGIN

// File-scope mutex held by ICULanguageBreakFactory::getEngineFor() for its whole body.
// Being a single static object, it is shared by every factory instance.
static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER;

/**
 * Returns a break engine that handles the given character and break type,
 * creating and caching one if none of the cached engines does.
 *
 * The whole lookup runs while holding gBreakEngineMutex. Cached engines are
 * searched from the most recently pushed to the oldest. A newly loaded
 * engine is pushed onto fEngines, which is created with _deleteEngine as its
 * element deleter.
 *
 * @param c          The code point an engine is needed for.
 * @param breakType  The break type an engine is needed for.
 * @return A matching engine, or NULL if the engine stack could not be
 *         created or loadEngineFor() produced no engine.
 */
const LanguageBreakEngine *
ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
    UErrorCode status = U_ZERO_ERROR;

    Mutex lock(&gBreakEngineMutex);

    if (fEngines != NULL) {
        // Look through the engines cached so far, newest first.
        for (int32_t idx = fEngines->size() - 1; idx >= 0; --idx) {
            const LanguageBreakEngine *candidate =
                static_cast<const LanguageBreakEngine *>(fEngines->elementAt(idx));
            if (candidate != NULL && candidate->handles(c, breakType)) {
                return candidate;
            }
        }
    } else {
        // First call: set up the stack that will hold the engines.
        UStack *fresh = new UStack(_deleteEngine, NULL, status);
        if (fresh == NULL || U_FAILURE(status)) {
            // Note: no way to return error code to caller.
            delete fresh;
            return NULL;
        }
        fEngines = fresh;
    }

    // We didn't find an engine. Create one.
    const LanguageBreakEngine *created = loadEngineFor(c, breakType);
    if (created != NULL) {
        // NOTE(review): the outcome of push() is not inspected; if it fails the
        // engine is still returned - confirm who releases it in that case.
        fEngines->push(const_cast<LanguageBreakEngine *>(created), status);
    }
    return created;
}

/**
 * Creates a dictionary-based break engine for the script of a character.
 *
 * The script of c selects both the dictionary (via loadDictionaryMatcherFor)
 * and the engine class. Scripts without a case in the switch get no engine.
 *
 * Cleanup: when no engine object was created the matcher is deleted here.
 * When an engine was created but its constructor reported failure, only the
 * engine is deleted - presumably the engine has adopted the matcher by then.
 *
 * @param c          The code point whose script selects the engine.
 * @param breakType  Forwarded to loadDictionaryMatcherFor().
 * @return A newly allocated engine, or NULL on any failure or for a script
 *         that has no engine.
 */
const LanguageBreakEngine *
ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
    UErrorCode status = U_ZERO_ERROR;
    const UScriptCode code = uscript_getScript(c, &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    DictionaryMatcher *matcher = loadDictionaryMatcherFor(code, breakType);
    if (matcher == NULL) {
        return NULL;
    }

    const LanguageBreakEngine *engine = NULL;
    switch(code) {
    case USCRIPT_THAI:
        engine = new ThaiBreakEngine(matcher, status);
        break;
    case USCRIPT_LAO:
        engine = new LaoBreakEngine(matcher, status);
        break;
    case USCRIPT_MYANMAR:
        engine = new BurmeseBreakEngine(matcher, status);
        break;
    case USCRIPT_KHMER:
        engine = new KhmerBreakEngine(matcher, status);
        break;

#if !UCONFIG_NO_NORMALIZATION
        // CJK not available w/o normalization
    case USCRIPT_HANGUL:
        engine = new CjkBreakEngine(matcher, kKorean, status);
        break;

    // use same BreakEngine and dictionary for both Chinese and Japanese
    case USCRIPT_HIRAGANA:
    case USCRIPT_KATAKANA:
    case USCRIPT_HAN:
        engine = new CjkBreakEngine(matcher, kChineseJapanese, status);
        break;
#if 0
    // TODO: Have to get some characters with script=common handled
    // by CjkBreakEngine (e.g. U+309B). Simply subjecting
    // them to CjkBreakEngine does not work. The engine has to
    // special-case them.
    case USCRIPT_COMMON:
    {
        UBlockCode block = ublock_getCode(code);
        if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
           engine = new CjkBreakEngine(dict, kChineseJapanese, status);
        break;
    }
#endif
#endif

    default:
        break;
    }

    if (engine == NULL) {
        // No engine exists to own the matcher, so release it here.
        delete matcher;
        return NULL;
    }
    if (U_FAILURE(status)) {
        delete engine;
        return NULL;
    }
    return engine;
}

/**
 * Loads the dictionary registered for a script and wraps it in a matcher.
 *
 * The dictionary file name is read from the "dictionaries" table of the root
 * break-iterator resource bundle, keyed by the script's short name. The name
 * is split at its last dot into item name and type, and the data item is
 * opened with udata_open(). The trie type stored in the data's index array
 * selects a bytes-based or UChars-based matcher.
 *
 * @param script  The script whose dictionary is wanted.
 * @return A newly allocated matcher that is handed the opened data, or NULL
 *         if the resource lookup or data open failed, the trie type is not
 *         recognised, or allocation failed.
 */
DictionaryMatcher *
ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
    UErrorCode status = U_ZERO_ERROR;

    // open root from brkitr tree.
    UResourceBundle *bundle = ures_open(U_ICUDATA_BRKITR, "", &status);
    bundle = ures_getByKeyWithFallback(bundle, "dictionaries", bundle, &status);

    int32_t nameLength = 0;
    const UChar *fileName =
        ures_getStringByKeyWithFallback(bundle, uscript_getShortName(script), &nameLength, &status);
    if (U_FAILURE(status)) {
        ures_close(bundle);
        return NULL;
    }

    // Split "name.ext" at the last dot; without a dot the type stays empty.
    CharString itemName;
    CharString itemType;
    const UChar *lastDot = u_memrchr(fileName, 0x002e, nameLength);
    if (lastDot != NULL) {
        const int32_t baseLength = (int32_t)(lastDot - fileName);
        itemType.appendInvariantChars(
            UnicodeString(FALSE, lastDot + 1, nameLength - baseLength - 1), status);
        nameLength = baseLength;
    }
    itemName.appendInvariantChars(UnicodeString(FALSE, fileName, nameLength), status);
    ures_close(bundle);

    UDataMemory *file = udata_open(U_ICUDATA_BRKITR, itemType.data(), itemName.data(), &status);
    if (U_FAILURE(status)) {
        // we don't have a dictionary matcher.
        // returning NULL here will cause us to fail to find a dictionary break engine, as expected
        return NULL;
    }

    // build trie
    const uint8_t *data = (const uint8_t *)udata_getMemory(file);
    const int32_t *indexes = (const int32_t *)data;
    const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
    const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;

    DictionaryMatcher *matcher = NULL;
    if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
        const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
        matcher = new BytesDictionaryMatcher((const char *)(data + offset), transform, file);
    }
    else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
        matcher = new UCharsDictionaryMatcher((const UChar *)(data + offset), file);
    }

    if (matcher == NULL) {
        // no matcher exists to take ownership - either we are an invalid
        // type or memory allocation failed
        udata_close(file);
    }
    return matcher;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
Commit	Line	Data
4388f060	1	/*
46f4442e	2	************************************************************************************
2ca993e8	3	* Copyright (C) 2006-2016, International Business Machines Corporation
4388f060	4	* and others. All Rights Reserved.
46f4442e	5	************************************************************************************
73c04bcf A	6	*/
	7
	8	#include "unicode/utypes.h"
	9
	10	#if !UCONFIG_NO_BREAK_ITERATION
	11
	12	#include "brkeng.h"
2ca993e8	13	#include "cmemory.h"
73c04bcf	14	#include "dictbe.h"
73c04bcf A	15	#include "unicode/uchar.h"
	16	#include "unicode/uniset.h"
	17	#include "unicode/chariter.h"
	18	#include "unicode/ures.h"
	19	#include "unicode/udata.h"
	20	#include "unicode/putil.h"
	21	#include "unicode/ustring.h"
	22	#include "unicode/uscript.h"
51004dcb A	23	#include "unicode/ucharstrie.h"
	24	#include "unicode/bytestrie.h"
	25	#include "charstr.h"
	26	#include "dictionarydata.h"
2ca993e8	27	#include "mutex.h"
73c04bcf	28	#include "uvector.h"
46f4442e	29	#include "umutex.h"
73c04bcf A	30	#include "uresimp.h"
	31	#include "ubrkimpl.h"
	32
	33	U_NAMESPACE_BEGIN
	34
	35	/*
	36	******************************************************************
	37	*/
	38
	39	LanguageBreakEngine::LanguageBreakEngine() {
	40	}
	41
	42	LanguageBreakEngine::~LanguageBreakEngine() {
	43	}
	44
	45	/*
	46	******************************************************************
	47	*/
	48
	49	LanguageBreakFactory::LanguageBreakFactory() {
	50	}
	51
	52	LanguageBreakFactory::~LanguageBreakFactory() {
	53	}
	54
	55	/*
	56	******************************************************************
	57	*/
	58
	59	UnhandledEngine::UnhandledEngine(UErrorCode &/status/) {
2ca993e8	60	for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) {
73c04bcf A	61	fHandled[i] = 0;
	62	}
	63	}
	64
	65	UnhandledEngine::~UnhandledEngine() {
2ca993e8	66	for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) {
73c04bcf A	67	if (fHandled[i] != 0) {
	68	delete fHandled[i];
	69	}
	70	}
	71	}
	72
	73	UBool
	74	UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
2ca993e8	75	return (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)
73c04bcf A	76	&& fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
	77	}
	78
	79	int32_t
	80	UnhandledEngine::findBreaks( UText *text,
	81	int32_t startPos,
	82	int32_t endPos,
	83	UBool reverse,
	84	int32_t breakType,
	85	UStack &/foundBreaks/ ) const {
2ca993e8	86	if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
73c04bcf A	87	UChar32 c = utext_current32(text);
	88	if (reverse) {
	89	while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
	90	c = utext_previous32(text);
	91	}
	92	}
	93	else {
	94	while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
	95	utext_next32(text); // TODO: recast loop to work with post-increment operations.
	96	c = utext_current32(text);
	97	}
	98	}
	99	}
	100	return 0;
	101	}
	102
	103	void
	104	UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
2ca993e8	105	if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
73c04bcf A	106	if (fHandled[breakType] == 0) {
	107	fHandled[breakType] = new UnicodeSet();
	108	if (fHandled[breakType] == 0) {
	109	return;
	110	}
	111	}
	112	if (!fHandled[breakType]->contains(c)) {
	113	UErrorCode status = U_ZERO_ERROR;
	114	// Apply the entire script of the character.
	115	int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
	116	fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
	117	}
	118	}
	119	}
	120
	121	/*
	122	******************************************************************
	123	*/
	124
	125	ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/status/) {
	126	fEngines = 0;
	127	}
	128
	129	ICULanguageBreakFactory::~ICULanguageBreakFactory() {
	130	if (fEngines != 0) {
	131	delete fEngines;
	132	}
	133	}
	134
	135	U_NAMESPACE_END
	136	U_CDECL_BEGIN
	137	static void U_CALLCONV _deleteEngine(void *obj) {
4388f060	138	delete (const icu::LanguageBreakEngine *) obj;
73c04bcf A	139	}
	140	U_CDECL_END
	141	U_NAMESPACE_BEGIN
	142
2ca993e8 A	143	static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER;
2ca993e8 A	144
73c04bcf A	145	const LanguageBreakEngine *
73c04bcf A	146	ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
73c04bcf A	147	const LanguageBreakEngine *lbe = NULL;
	148	UErrorCode status = U_ZERO_ERROR;
	149
2ca993e8 A	150	Mutex m(&gBreakEngineMutex);
	151
	152	if (fEngines == NULL) {
	153	UStack *engines = new UStack(_deleteEngine, NULL, status);
	154	if (U_FAILURE(status) \|\| engines == NULL) {
	155	// Note: no way to return error code to caller.
	156	delete engines;
	157	return NULL;
	158	}
	159	fEngines = engines;
	160	} else {
	161	int32_t i = fEngines->size();
73c04bcf A	162	while (--i >= 0) {
	163	lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
	164	if (lbe != NULL && lbe->handles(c, breakType)) {
2ca993e8	165	return lbe;
73c04bcf	166	}
73c04bcf A	167	}
73c04bcf A	168	}
73c04bcf	169
2ca993e8 A	170	// We didn't find an engine. Create one.
2ca993e8 A	171	lbe = loadEngineFor(c, breakType);
73c04bcf	172	if (lbe != NULL) {
2ca993e8	173	fEngines->push((void *)lbe, status);
73c04bcf	174	}
73c04bcf A	175	return lbe;
	176	}
	177
	178	const LanguageBreakEngine *
	179	ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
	180	UErrorCode status = U_ZERO_ERROR;
	181	UScriptCode code = uscript_getScript(c, &status);
	182	if (U_SUCCESS(status)) {
51004dcb A	183	DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
51004dcb A	184	if (m != NULL) {
73c04bcf A	185	const LanguageBreakEngine *engine = NULL;
	186	switch(code) {
	187	case USCRIPT_THAI:
51004dcb	188	engine = new ThaiBreakEngine(m, status);
73c04bcf	189	break;
57a6839d A	190	case USCRIPT_LAO:
	191	engine = new LaoBreakEngine(m, status);
	192	break;
b331163b A	193	case USCRIPT_MYANMAR:
	194	engine = new BurmeseBreakEngine(m, status);
	195	break;
4388f060	196	case USCRIPT_KHMER:
51004dcb	197	engine = new KhmerBreakEngine(m, status);
4388f060	198	break;
51004dcb A	199
	200	#if !UCONFIG_NO_NORMALIZATION
	201	// CJK not available w/o normalization
	202	case USCRIPT_HANGUL:
	203	engine = new CjkBreakEngine(m, kKorean, status);
	204	break;
	205
	206	// use same BreakEngine and dictionary for both Chinese and Japanese
	207	case USCRIPT_HIRAGANA:
	208	case USCRIPT_KATAKANA:
	209	case USCRIPT_HAN:
	210	engine = new CjkBreakEngine(m, kChineseJapanese, status);
	211	break;
	212	#if 0
	213	// TODO: Have to get some characters with script=common handled
	214	// by CjkBreakEngine (e.g. U+309B). Simply subjecting
	215	// them to CjkBreakEngine does not work. The engine has to
	216	// special-case them.
	217	case USCRIPT_COMMON:
	218	{
	219	UBlockCode block = ublock_getCode(code);
	220	if (block == UBLOCK_HIRAGANA \|\| block == UBLOCK_KATAKANA)
	221	engine = new CjkBreakEngine(dict, kChineseJapanese, status);
	222	break;
	223	}
	224	#endif
	225	#endif
	226
73c04bcf A	227	default:
	228	break;
	229	}
	230	if (engine == NULL) {
51004dcb	231	delete m;
73c04bcf A	232	}
	233	else if (U_FAILURE(status)) {
	234	delete engine;
	235	engine = NULL;
	236	}
	237	return engine;
	238	}
	239	}
	240	return NULL;
	241	}
	242
51004dcb A	243	DictionaryMatcher *
51004dcb A	244	ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
73c04bcf	245	UErrorCode status = U_ZERO_ERROR;
51004dcb	246	// open root from brkitr tree.
73c04bcf A	247	UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
73c04bcf A	248	b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
73c04bcf	249	int32_t dictnlength = 0;
51004dcb A	250	const UChar *dictfname =
	251	ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
	252	if (U_FAILURE(status)) {
	253	ures_close(b);
	254	return NULL;
73c04bcf	255	}
51004dcb A	256	CharString dictnbuf;
	257	CharString ext;
	258	const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot
	259	if (extStart != NULL) {
	260	int32_t len = (int32_t)(extStart - dictfname);
	261	ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
	262	dictnlength = len;
73c04bcf	263	}
51004dcb	264	dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
73c04bcf	265	ures_close(b);
51004dcb A	266
51004dcb A	267	UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
73c04bcf	268	if (U_SUCCESS(status)) {
51004dcb A	269	// build trie
	270	const uint8_t data = (const uint8_t )udata_getMemory(file);
	271	const int32_t indexes = (const int32_t )data;
	272	const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
	273	const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
	274	DictionaryMatcher *m = NULL;
	275	if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
	276	const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
	277	const char characters = (const char )(data + offset);
	278	m = new BytesDictionaryMatcher(characters, transform, file);
	279	}
	280	else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
	281	const UChar characters = (const UChar )(data + offset);
	282	m = new UCharsDictionaryMatcher(characters, file);
73c04bcf	283	}
51004dcb A	284	if (m == NULL) {
	285	// no matcher exists to take ownership - either we are an invalid
	286	// type or memory allocation failed
	287	udata_close(file);
73c04bcf	288	}
51004dcb A	289	return m;
	290	} else if (dictfname != NULL) {
	291	// we don't have a dictionary matcher.
	292	// returning NULL here will cause us to fail to find a dictionary break engine, as expected
	293	status = U_ZERO_ERROR;
	294	return NULL;
73c04bcf A	295	}
	296	return NULL;
	297	}
	298
	299	U_NAMESPACE_END
	300
	301	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */