git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	// © 2016 and later: Unicode, Inc. and others.
	2	// License & terms of use: http://www.unicode.org/copyright.html
	3	/*
	4	************************************************************************************
	5	* Copyright (C) 2006-2016, International Business Machines Corporation
	6	* and others. All Rights Reserved.
	7	************************************************************************************
	8	*/
	9
	10	#include "unicode/utypes.h"
	11
	12	#if !UCONFIG_NO_BREAK_ITERATION
	13
	14	#include "unicode/uchar.h"
	15	#include "unicode/uniset.h"
	16	#include "unicode/chariter.h"
	17	#include "unicode/ures.h"
	18	#include "unicode/udata.h"
	19	#include "unicode/putil.h"
	20	#include "unicode/ustring.h"
	21	#include "unicode/uscript.h"
	22	#include "unicode/ucharstrie.h"
	23	#include "unicode/bytestrie.h"
	24
	25	#include "brkeng.h"
	26	#include "cmemory.h"
	27	#include "dictbe.h"
	28	#include "charstr.h"
	29	#include "dictionarydata.h"
	30	#include "mutex.h"
	31	#include "uvector.h"
	32	#include "umutex.h"
	33	#include "uresimp.h"
	34	#include "ubrkimpl.h"
	35
	36	U_NAMESPACE_BEGIN
	37
	38	/*
	39	******************************************************************
	40	*/
	41
	42	LanguageBreakEngine::LanguageBreakEngine() {
	43	}
	44
	45	LanguageBreakEngine::~LanguageBreakEngine() {
	46	}
	47
	48	/*
	49	******************************************************************
	50	*/
	51
	52	LanguageBreakFactory::LanguageBreakFactory() {
	53	}
	54
	55	LanguageBreakFactory::~LanguageBreakFactory() {
	56	}
	57
	58	/*
	59	******************************************************************
	60	*/
	61
	62	UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) {
	63	(void)status;
	64	}
	65
	66	UnhandledEngine::~UnhandledEngine() {
	67	delete fHandled;
	68	fHandled = nullptr;
	69	}
	70
	71	UBool
	72	UnhandledEngine::handles(UChar32 c) const {
	73	return fHandled && fHandled->contains(c);
	74	}
	75
	76	int32_t
	77	UnhandledEngine::findBreaks( UText *text,
	78	int32_t /* startPos */,
	79	int32_t endPos,
	80	UVector32 &/foundBreaks/ ) const {
	81	UChar32 c = utext_current32(text);
	82	while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
	83	utext_next32(text); // TODO: recast loop to work with post-increment operations.
	84	c = utext_current32(text);
	85	}
	86	return 0;
	87	}
	88
	89	void
	90	UnhandledEngine::handleCharacter(UChar32 c) {
	91	if (fHandled == nullptr) {
	92	fHandled = new UnicodeSet();
	93	if (fHandled == nullptr) {
	94	return;
	95	}
	96	}
	97	if (!fHandled->contains(c)) {
	98	UErrorCode status = U_ZERO_ERROR;
	99	// Apply the entire script of the character.
	100	int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
	101	fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
	102	}
	103	}
	104
	105	/*
	106	******************************************************************
	107	*/
	108
	109	ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/status/) {
	110	fEngines = 0;
	111	}
	112
	113	ICULanguageBreakFactory::~ICULanguageBreakFactory() {
	114	if (fEngines != 0) {
	115	delete fEngines;
	116	}
	117	}
	118
	119	U_NAMESPACE_END
	120	U_CDECL_BEGIN
	121	static void U_CALLCONV _deleteEngine(void *obj) {
	122	delete (const icu::LanguageBreakEngine *) obj;
	123	}
	124	U_CDECL_END
	125	U_NAMESPACE_BEGIN
	126
	127	const LanguageBreakEngine *
	128	ICULanguageBreakFactory::getEngineFor(UChar32 c) {
	129	const LanguageBreakEngine *lbe = NULL;
	130	UErrorCode status = U_ZERO_ERROR;
	131
	132	static UMutex *gBreakEngineMutex = STATIC_NEW(UMutex);
	133	Mutex m(gBreakEngineMutex);
	134
	135	if (fEngines == NULL) {
	136	UStack *engines = new UStack(_deleteEngine, NULL, status);
	137	if (U_FAILURE(status) \|\| engines == NULL) {
	138	// Note: no way to return error code to caller.
	139	delete engines;
	140	return NULL;
	141	}
	142	fEngines = engines;
	143	} else {
	144	int32_t i = fEngines->size();
	145	while (--i >= 0) {
	146	lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
	147	if (lbe != NULL && lbe->handles(c)) {
	148	return lbe;
	149	}
	150	}
	151	}
	152
	153	// We didn't find an engine. Create one.
	154	lbe = loadEngineFor(c);
	155	if (lbe != NULL) {
	156	fEngines->push((void *)lbe, status);
	157	}
	158	return lbe;
	159	}
	160
	161	const LanguageBreakEngine *
	162	ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
	163	UErrorCode status = U_ZERO_ERROR;
	164	UScriptCode code = uscript_getScript(c, &status);
	165	if (U_SUCCESS(status)) {
	166	DictionaryMatcher *m = loadDictionaryMatcherFor(code);
	167	if (m != NULL) {
	168	const LanguageBreakEngine *engine = NULL;
	169	switch(code) {
	170	case USCRIPT_THAI:
	171	engine = new ThaiBreakEngine(m, status);
	172	break;
	173	case USCRIPT_LAO:
	174	engine = new LaoBreakEngine(m, status);
	175	break;
	176	case USCRIPT_MYANMAR:
	177	engine = new BurmeseBreakEngine(m, status);
	178	break;
	179	case USCRIPT_KHMER:
	180	engine = new KhmerBreakEngine(m, status);
	181	break;
	182
	183	#if !UCONFIG_NO_NORMALIZATION
	184	// CJK not available w/o normalization
	185	case USCRIPT_HANGUL:
	186	engine = new CjkBreakEngine(m, kKorean, status);
	187	break;
	188
	189	// use same BreakEngine and dictionary for both Chinese and Japanese
	190	case USCRIPT_HIRAGANA:
	191	case USCRIPT_KATAKANA:
	192	case USCRIPT_HAN:
	193	engine = new CjkBreakEngine(m, kChineseJapanese, status);
	194	break;
	195	#if 0
	196	// TODO: Have to get some characters with script=common handled
	197	// by CjkBreakEngine (e.g. U+309B). Simply subjecting
	198	// them to CjkBreakEngine does not work. The engine has to
	199	// special-case them.
	200	case USCRIPT_COMMON:
	201	{
	202	UBlockCode block = ublock_getCode(code);
	203	if (block == UBLOCK_HIRAGANA \|\| block == UBLOCK_KATAKANA)
	204	engine = new CjkBreakEngine(dict, kChineseJapanese, status);
	205	break;
	206	}
	207	#endif
	208	#endif
	209
	210	default:
	211	break;
	212	}
	213	if (engine == NULL) {
	214	delete m;
	215	}
	216	else if (U_FAILURE(status)) {
	217	delete engine;
	218	engine = NULL;
	219	}
	220	return engine;
	221	}
	222	}
	223	return NULL;
	224	}
	225
	226	DictionaryMatcher *
	227	ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
	228	UErrorCode status = U_ZERO_ERROR;
	229	// open root from brkitr tree.
	230	UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
	231	b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
	232	int32_t dictnlength = 0;
	233	const UChar *dictfname =
	234	ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
	235	if (U_FAILURE(status)) {
	236	ures_close(b);
	237	return NULL;
	238	}
	239	CharString dictnbuf;
	240	CharString ext;
	241	const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot
	242	if (extStart != NULL) {
	243	int32_t len = (int32_t)(extStart - dictfname);
	244	ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
	245	dictnlength = len;
	246	}
	247	dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
	248	ures_close(b);
	249
	250	UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
	251	if (U_SUCCESS(status)) {
	252	// build trie
	253	const uint8_t data = (const uint8_t )udata_getMemory(file);
	254	const int32_t indexes = (const int32_t )data;
	255	const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
	256	const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
	257	DictionaryMatcher *m = NULL;
	258	if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
	259	const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
	260	const char characters = (const char )(data + offset);
	261	m = new BytesDictionaryMatcher(characters, transform, file);
	262	}
	263	else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
	264	const UChar characters = (const UChar )(data + offset);
	265	m = new UCharsDictionaryMatcher(characters, file);
	266	}
	267	if (m == NULL) {
	268	// no matcher exists to take ownership - either we are an invalid
	269	// type or memory allocation failed
	270	udata_close(file);
	271	}
	272	return m;
	273	} else if (dictfname != NULL) {
	274	// we don't have a dictionary matcher.
	275	// returning NULL here will cause us to fail to find a dictionary break engine, as expected
	276	status = U_ZERO_ERROR;
	277	return NULL;
	278	}
	279	return NULL;
	280	}
	281
	282	U_NAMESPACE_END
	283
	284	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */