git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	// © 2016 and later: Unicode, Inc. and others.
	2	// License & terms of use: http://www.unicode.org/copyright.html
	3	/**
	4	*******************************************************************************
	5	* Copyright (C) 2006-2016, International Business Machines Corporation
	6	* and others. All Rights Reserved.
	7	*******************************************************************************
	8	*/
	9
	10	#include <utility>
	11
	12	#include "unicode/utypes.h"
	13
	14	#if !UCONFIG_NO_BREAK_ITERATION
	15
	16	#include "brkeng.h"
	17	#include "dictbe.h"
	18	#include "unicode/uniset.h"
	19	#include "unicode/chariter.h"
	20	#include "unicode/ubrk.h"
	21	#include "uvectr32.h"
	22	#include "uvector.h"
	23	#include "uassert.h"
	24	#include "unicode/normlzr.h"
	25	#include "cmemory.h"
	26	#include "dictionarydata.h"
	27
	28	U_NAMESPACE_BEGIN
	29
	30	/*
	31	******************************************************************
	32	*/
	33
	34	DictionaryBreakEngine::DictionaryBreakEngine() {
	35	}
	36
	37	DictionaryBreakEngine::~DictionaryBreakEngine() {
	38	}
	39
	40	UBool
	41	DictionaryBreakEngine::handles(UChar32 c) const {
	42	return fSet.contains(c);
	43	}
	44
	45	int32_t
	46	DictionaryBreakEngine::findBreaks( UText *text,
	47	int32_t startPos,
	48	int32_t endPos,
	49	UVector32 &foundBreaks ) const {
	50	(void)startPos; // TODO: remove this param?
	51	int32_t result = 0;
	52
	53	// Find the span of characters included in the set.
	54	// The span to break begins at the current position in the text, and
	55	// extends towards the start or end of the text, depending on 'reverse'.
	56
	57	int32_t start = (int32_t)utext_getNativeIndex(text);
	58	int32_t current;
	59	int32_t rangeStart;
	60	int32_t rangeEnd;
	61	UChar32 c = utext_current32(text);
	62	while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
	63	utext_next32(text); // TODO: recast loop for postincrement
	64	c = utext_current32(text);
	65	}
	66	rangeStart = start;
	67	rangeEnd = current;
	68	result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
	69	utext_setNativeIndex(text, current);
	70
	71	return result;
	72	}
	73
	74	void
	75	DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
	76	fSet = set;
	77	// Compact for caching
	78	fSet.compact();
	79	}
	80
	81	/*
	82	******************************************************************
	83	* PossibleWord
	84	*/
	85
	86	// Helper class for improving readability of the Thai/Lao/Khmer word break
	87	// algorithm. The implementation is completely inline.
	88
	89	// List size, limited by the maximum number of words in the dictionary
	90	// that form a nested sequence.
	91	static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
	92
	93	class PossibleWord {
	94	private:
	95	// list of word candidate lengths, in increasing length order
	96	// TODO: bytes would be sufficient for word lengths.
	97	int32_t count; // Count of candidates
	98	int32_t prefix; // The longest match with a dictionary word
	99	int32_t offset; // Offset in the text of these candidates
	100	int32_t mark; // The preferred candidate's offset
	101	int32_t current; // The candidate we're currently looking at
	102	int32_t cuLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code units.
	103	int32_t cpLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code points.
	104
	105	public:
	106	PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {}
	107	~PossibleWord() {}
	108
	109	// Fill the list of candidates if needed, select the longest, and return the number found
	110	int32_t candidates( UText text, DictionaryMatcher dict, int32_t rangeEnd );
	111
	112	// Select the currently marked candidate, point after it in the text, and invalidate self
	113	int32_t acceptMarked( UText *text );
	114
	115	// Back up from the current candidate to the next shorter one; return TRUE if that exists
	116	// and point the text after it
	117	UBool backUp( UText *text );
	118
	119	// Return the longest prefix this candidate location shares with a dictionary word
	120	// Return value is in code points.
	121	int32_t longestPrefix() { return prefix; }
	122
	123	// Mark the current candidate as the one we like
	124	void markCurrent() { mark = current; }
	125
	126	// Get length in code points of the marked word.
	127	int32_t markedCPLength() { return cpLengths[mark]; }
	128	};
	129
	130
	131	int32_t PossibleWord::candidates( UText text, DictionaryMatcher dict, int32_t rangeEnd ) {
	132	// TODO: If getIndex is too slow, use offset < 0 and add discardAll()
	133	int32_t start = (int32_t)utext_getNativeIndex(text);
	134	if (start != offset) {
	135	offset = start;
	136	count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
	137	// Dictionary leaves text after longest prefix, not longest word. Back up.
	138	if (count <= 0) {
	139	utext_setNativeIndex(text, start);
	140	}
	141	}
	142	if (count > 0) {
	143	utext_setNativeIndex(text, start+cuLengths[count-1]);
	144	}
	145	current = count-1;
	146	mark = current;
	147	return count;
	148	}
	149
	150	int32_t
	151	PossibleWord::acceptMarked( UText *text ) {
	152	utext_setNativeIndex(text, offset + cuLengths[mark]);
	153	return cuLengths[mark];
	154	}
	155
	156
	157	UBool
	158	PossibleWord::backUp( UText *text ) {
	159	if (current > 0) {
	160	utext_setNativeIndex(text, offset + cuLengths[--current]);
	161	return TRUE;
	162	}
	163	return FALSE;
	164	}
	165
	166	/*
	167	******************************************************************
	168	* ThaiBreakEngine
	169	*/
	170
	171	// How many words in a row are "good enough"?
	172	static const int32_t THAI_LOOKAHEAD = 3;
	173
	174	// Will not combine a non-word with a preceding dictionary word longer than this
	175	static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
	176
	177	// Will not combine a non-word that shares at least this much prefix with a
	178	// dictionary word, with a preceding word
	179	static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
	180
	181	// Ellision character
	182	static const int32_t THAI_PAIYANNOI = 0x0E2F;
	183
	184	// Repeat character
	185	static const int32_t THAI_MAIYAMOK = 0x0E46;
	186
	187	// Minimum word size
	188	static const int32_t THAI_MIN_WORD = 2;
	189
	190	// Minimum number of characters for two words
	191	static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
	192
	193	ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
	194	: DictionaryBreakEngine(),
	195	fDictionary(adoptDictionary)
	196	{
	197	fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
	198	if (U_SUCCESS(status)) {
	199	setCharacters(fThaiWordSet);
	200	}
	201	fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
	202	fMarkSet.add(0x0020);
	203	fEndWordSet = fThaiWordSet;
	204	fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
	205	fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
	206	fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
	207	fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
	208	fSuffixSet.add(THAI_PAIYANNOI);
	209	fSuffixSet.add(THAI_MAIYAMOK);
	210
	211	// Compact for caching.
	212	fMarkSet.compact();
	213	fEndWordSet.compact();
	214	fBeginWordSet.compact();
	215	fSuffixSet.compact();
	216	}
	217
	218	ThaiBreakEngine::~ThaiBreakEngine() {
	219	delete fDictionary;
	220	}
	221
	222	int32_t
	223	ThaiBreakEngine::divideUpDictionaryRange( UText *text,
	224	int32_t rangeStart,
	225	int32_t rangeEnd,
	226	UVector32 &foundBreaks ) const {
	227	utext_setNativeIndex(text, rangeStart);
	228	utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
	229	if (utext_getNativeIndex(text) >= rangeEnd) {
	230	return 0; // Not enough characters for two words
	231	}
	232	utext_setNativeIndex(text, rangeStart);
	233
	234
	235	uint32_t wordsFound = 0;
	236	int32_t cpWordLength = 0; // Word Length in Code Points.
	237	int32_t cuWordLength = 0; // Word length in code units (UText native indexing)
	238	int32_t current;
	239	UErrorCode status = U_ZERO_ERROR;
	240	PossibleWord words[THAI_LOOKAHEAD];
	241
	242	utext_setNativeIndex(text, rangeStart);
	243
	244	while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
	245	cpWordLength = 0;
	246	cuWordLength = 0;
	247
	248	// Look for candidate words at the current position
	249	int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
	250
	251	// If we found exactly one, use that
	252	if (candidates == 1) {
	253	cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
	254	cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
	255	wordsFound += 1;
	256	}
	257	// If there was more than one, see which one can take us forward the most words
	258	else if (candidates > 1) {
	259	// If we're already at the end of the range, we're done
	260	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
	261	goto foundBest;
	262	}
	263	do {
	264	int32_t wordsMatched = 1;
	265	if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
	266	if (wordsMatched < 2) {
	267	// Followed by another dictionary word; mark first word as a good candidate
	268	words[wordsFound%THAI_LOOKAHEAD].markCurrent();
	269	wordsMatched = 2;
	270	}
	271
	272	// If we're already at the end of the range, we're done
	273	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
	274	goto foundBest;
	275	}
	276
	277	// See if any of the possible second words is followed by a third word
	278	do {
	279	// If we find a third word, stop right away
	280	if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
	281	words[wordsFound % THAI_LOOKAHEAD].markCurrent();
	282	goto foundBest;
	283	}
	284	}
	285	while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
	286	}
	287	}
	288	while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
	289	foundBest:
	290	// Set UText position to after the accepted word.
	291	cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
	292	cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
	293	wordsFound += 1;
	294	}
	295
	296	// We come here after having either found a word or not. We look ahead to the
	297	// next word. If it's not a dictionary word, we will combine it with the word we
	298	// just found (if there is one), but only if the preceding word does not exceed
	299	// the threshold.
	300	// The text iterator should now be positioned at the end of the word we found.
	301
	302	UChar32 uc = 0;
	303	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
	304	// if it is a dictionary word, do nothing. If it isn't, then if there is
	305	// no preceding word, or the non-word shares less than the minimum threshold
	306	// of characters with a dictionary word, then scan to resynchronize
	307	if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
	308	&& (cuWordLength == 0
	309	\|\| words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
	310	// Look for a plausible word boundary
	311	int32_t remaining = rangeEnd - (current+cuWordLength);
	312	UChar32 pc;
	313	int32_t chars = 0;
	314	for (;;) {
	315	int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
	316	pc = utext_next32(text);
	317	int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
	318	chars += pcSize;
	319	remaining -= pcSize;
	320	if (remaining <= 0) {
	321	break;
	322	}
	323	uc = utext_current32(text);
	324	if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
	325	// Maybe. See if it's in the dictionary.
	326	// NOTE: In the original Apple code, checked that the next
	327	// two characters after uc were not 0x0E4C THANTHAKHAT before
	328	// checking the dictionary. That is just a performance filter,
	329	// but it's not clear it's faster than checking the trie.
	330	int32_t num_candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
	331	utext_setNativeIndex(text, current + cuWordLength + chars);
	332	if (num_candidates > 0) {
	333	break;
	334	}
	335	}
	336	}
	337
	338	// Bump the word count if there wasn't already one
	339	if (cuWordLength <= 0) {
	340	wordsFound += 1;
	341	}
	342
	343	// Update the length with the passed-over characters
	344	cuWordLength += chars;
	345	}
	346	else {
	347	// Back up to where we were for next iteration
	348	utext_setNativeIndex(text, current+cuWordLength);
	349	}
	350	}
	351
	352	// Never stop before a combining mark.
	353	int32_t currPos;
	354	while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
	355	utext_next32(text);
	356	cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
	357	}
	358
	359	// Look ahead for possible suffixes if a dictionary word does not follow.
	360	// We do this in code rather than using a rule so that the heuristic
	361	// resynch continues to function. For example, one of the suffix characters
	362	// could be a typo in the middle of a word.
	363	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
	364	if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
	365	&& fSuffixSet.contains(uc = utext_current32(text))) {
	366	if (uc == THAI_PAIYANNOI) {
	367	if (!fSuffixSet.contains(utext_previous32(text))) {
	368	// Skip over previous end and PAIYANNOI
	369	utext_next32(text);
	370	int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
	371	utext_next32(text);
	372	cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex; // Add PAIYANNOI to word
	373	uc = utext_current32(text); // Fetch next character
	374	}
	375	else {
	376	// Restore prior position
	377	utext_next32(text);
	378	}
	379	}
	380	if (uc == THAI_MAIYAMOK) {
	381	if (utext_previous32(text) != THAI_MAIYAMOK) {
	382	// Skip over previous end and MAIYAMOK
	383	utext_next32(text);
	384	int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
	385	utext_next32(text);
	386	cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex; // Add MAIYAMOK to word
	387	}
	388	else {
	389	// Restore prior position
	390	utext_next32(text);
	391	}
	392	}
	393	}
	394	else {
	395	utext_setNativeIndex(text, current+cuWordLength);
	396	}
	397	}
	398
	399	// Did we find a word on this iteration? If so, push it on the break stack
	400	if (cuWordLength > 0) {
	401	foundBreaks.push((current+cuWordLength), status);
	402	}
	403	}
	404
	405	// Don't return a break for the end of the dictionary range if there is one there.
	406	if (foundBreaks.peeki() >= rangeEnd) {
	407	(void) foundBreaks.popi();
	408	wordsFound -= 1;
	409	}
	410
	411	return wordsFound;
	412	}
	413
	414	/*
	415	******************************************************************
	416	* LaoBreakEngine
	417	*/
	418
	419	// How many words in a row are "good enough"?
	420	static const int32_t LAO_LOOKAHEAD = 3;
	421
	422	// Will not combine a non-word with a preceding dictionary word longer than this
	423	static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
	424
	425	// Will not combine a non-word that shares at least this much prefix with a
	426	// dictionary word, with a preceding word
	427	static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
	428
	429	// Minimum word size
	430	static const int32_t LAO_MIN_WORD = 2;
	431
	432	// Minimum number of characters for two words
	433	static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
	434
	435	LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
	436	: DictionaryBreakEngine(),
	437	fDictionary(adoptDictionary)
	438	{
	439	fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
	440	if (U_SUCCESS(status)) {
	441	setCharacters(fLaoWordSet);
	442	}
	443	fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
	444	fMarkSet.add(0x0020);
	445	fEndWordSet = fLaoWordSet;
	446	fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels
	447	fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters)
	448	fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)
	449	fBeginWordSet.add(0x0EC0, 0x0EC4); // prefix vowels
	450
	451	// Compact for caching.
	452	fMarkSet.compact();
	453	fEndWordSet.compact();
	454	fBeginWordSet.compact();
	455	}
	456
	457	LaoBreakEngine::~LaoBreakEngine() {
	458	delete fDictionary;
	459	}
	460
	461	int32_t
	462	LaoBreakEngine::divideUpDictionaryRange( UText *text,
	463	int32_t rangeStart,
	464	int32_t rangeEnd,
	465	UVector32 &foundBreaks ) const {
	466	if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
	467	return 0; // Not enough characters for two words
	468	}
	469
	470	uint32_t wordsFound = 0;
	471	int32_t cpWordLength = 0;
	472	int32_t cuWordLength = 0;
	473	int32_t current;
	474	UErrorCode status = U_ZERO_ERROR;
	475	PossibleWord words[LAO_LOOKAHEAD];
	476
	477	utext_setNativeIndex(text, rangeStart);
	478
	479	while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
	480	cuWordLength = 0;
	481	cpWordLength = 0;
	482
	483	// Look for candidate words at the current position
	484	int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
	485
	486	// If we found exactly one, use that
	487	if (candidates == 1) {
	488	cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
	489	cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
	490	wordsFound += 1;
	491	}
	492	// If there was more than one, see which one can take us forward the most words
	493	else if (candidates > 1) {
	494	// If we're already at the end of the range, we're done
	495	if (utext_getNativeIndex(text) >= rangeEnd) {
	496	goto foundBest;
	497	}
	498	do {
	499	int32_t wordsMatched = 1;
	500	if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
	501	if (wordsMatched < 2) {
	502	// Followed by another dictionary word; mark first word as a good candidate
	503	words[wordsFound%LAO_LOOKAHEAD].markCurrent();
	504	wordsMatched = 2;
	505	}
	506
	507	// If we're already at the end of the range, we're done
	508	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
	509	goto foundBest;
	510	}
	511
	512	// See if any of the possible second words is followed by a third word
	513	do {
	514	// If we find a third word, stop right away
	515	if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
	516	words[wordsFound % LAO_LOOKAHEAD].markCurrent();
	517	goto foundBest;
	518	}
	519	}
	520	while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
	521	}
	522	}
	523	while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
	524	foundBest:
	525	cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
	526	cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
	527	wordsFound += 1;
	528	}
	529
	530	// We come here after having either found a word or not. We look ahead to the
	531	// next word. If it's not a dictionary word, we will combine it withe the word we
	532	// just found (if there is one), but only if the preceding word does not exceed
	533	// the threshold.
	534	// The text iterator should now be positioned at the end of the word we found.
	535	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
	536	// if it is a dictionary word, do nothing. If it isn't, then if there is
	537	// no preceding word, or the non-word shares less than the minimum threshold
	538	// of characters with a dictionary word, then scan to resynchronize
	539	if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
	540	&& (cuWordLength == 0
	541	\|\| words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
	542	// Look for a plausible word boundary
	543	int32_t remaining = rangeEnd - (current + cuWordLength);
	544	UChar32 pc;
	545	UChar32 uc;
	546	int32_t chars = 0;
	547	for (;;) {
	548	int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
	549	pc = utext_next32(text);
	550	int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
	551	chars += pcSize;
	552	remaining -= pcSize;
	553	if (remaining <= 0) {
	554	break;
	555	}
	556	uc = utext_current32(text);
	557	if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
	558	// Maybe. See if it's in the dictionary.
	559	// TODO: this looks iffy; compare with old code.
	560	int32_t num_candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
	561	utext_setNativeIndex(text, current + cuWordLength + chars);
	562	if (num_candidates > 0) {
	563	break;
	564	}
	565	}
	566	}
	567
	568	// Bump the word count if there wasn't already one
	569	if (cuWordLength <= 0) {
	570	wordsFound += 1;
	571	}
	572
	573	// Update the length with the passed-over characters
	574	cuWordLength += chars;
	575	}
	576	else {
	577	// Back up to where we were for next iteration
	578	utext_setNativeIndex(text, current + cuWordLength);
	579	}
	580	}
	581
	582	// Never stop before a combining mark.
	583	int32_t currPos;
	584	while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
	585	utext_next32(text);
	586	cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
	587	}
	588
	589	// Look ahead for possible suffixes if a dictionary word does not follow.
	590	// We do this in code rather than using a rule so that the heuristic
	591	// resynch continues to function. For example, one of the suffix characters
	592	// could be a typo in the middle of a word.
	593	// NOT CURRENTLY APPLICABLE TO LAO
	594
	595	// Did we find a word on this iteration? If so, push it on the break stack
	596	if (cuWordLength > 0) {
	597	foundBreaks.push((current+cuWordLength), status);
	598	}
	599	}
	600
	601	// Don't return a break for the end of the dictionary range if there is one there.
	602	if (foundBreaks.peeki() >= rangeEnd) {
	603	(void) foundBreaks.popi();
	604	wordsFound -= 1;
	605	}
	606
	607	return wordsFound;
	608	}
	609
	610	/*
	611	******************************************************************
	612	* BurmeseBreakEngine
	613	*/
	614
	615	// How many words in a row are "good enough"?
	616	static const int32_t BURMESE_LOOKAHEAD = 3;
	617
	618	// Will not combine a non-word with a preceding dictionary word longer than this
	619	static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
	620
	621	// Will not combine a non-word that shares at least this much prefix with a
	622	// dictionary word, with a preceding word
	623	static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
	624
	625	// Minimum word size
	626	static const int32_t BURMESE_MIN_WORD = 2;
	627
	628	// Minimum number of characters for two words
	629	static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
	630
	631	BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
	632	: DictionaryBreakEngine(),
	633	fDictionary(adoptDictionary)
	634	{
	635	fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
	636	if (U_SUCCESS(status)) {
	637	setCharacters(fBurmeseWordSet);
	638	}
	639	fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
	640	fMarkSet.add(0x0020);
	641	fEndWordSet = fBurmeseWordSet;
	642	fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels
	643
	644	// Compact for caching.
	645	fMarkSet.compact();
	646	fEndWordSet.compact();
	647	fBeginWordSet.compact();
	648	}
	649
	650	BurmeseBreakEngine::~BurmeseBreakEngine() {
	651	delete fDictionary;
	652	}
	653
	654	int32_t
	655	BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
	656	int32_t rangeStart,
	657	int32_t rangeEnd,
	658	UVector32 &foundBreaks ) const {
	659	if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
	660	return 0; // Not enough characters for two words
	661	}
	662
	663	uint32_t wordsFound = 0;
	664	int32_t cpWordLength = 0;
	665	int32_t cuWordLength = 0;
	666	int32_t current;
	667	UErrorCode status = U_ZERO_ERROR;
	668	PossibleWord words[BURMESE_LOOKAHEAD];
	669
	670	utext_setNativeIndex(text, rangeStart);
	671
	672	while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
	673	cuWordLength = 0;
	674	cpWordLength = 0;
	675
	676	// Look for candidate words at the current position
	677	int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
	678
	679	// If we found exactly one, use that
	680	if (candidates == 1) {
	681	cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
	682	cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
	683	wordsFound += 1;
	684	}
	685	// If there was more than one, see which one can take us forward the most words
	686	else if (candidates > 1) {
	687	// If we're already at the end of the range, we're done
	688	if (utext_getNativeIndex(text) >= rangeEnd) {
	689	goto foundBest;
	690	}
	691	do {
	692	int32_t wordsMatched = 1;
	693	if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
	694	if (wordsMatched < 2) {
	695	// Followed by another dictionary word; mark first word as a good candidate
	696	words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
	697	wordsMatched = 2;
	698	}
	699
	700	// If we're already at the end of the range, we're done
	701	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
	702	goto foundBest;
	703	}
	704
	705	// See if any of the possible second words is followed by a third word
	706	do {
	707	// If we find a third word, stop right away
	708	if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
	709	words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
	710	goto foundBest;
	711	}
	712	}
	713	while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
	714	}
	715	}
	716	while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
	717	foundBest:
	718	cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
	719	cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
	720	wordsFound += 1;
	721	}
	722
	723	// We come here after having either found a word or not. We look ahead to the
	724	// next word. If it's not a dictionary word, we will combine it withe the word we
	725	// just found (if there is one), but only if the preceding word does not exceed
	726	// the threshold.
	727	// The text iterator should now be positioned at the end of the word we found.
	728	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
	729	// if it is a dictionary word, do nothing. If it isn't, then if there is
	730	// no preceding word, or the non-word shares less than the minimum threshold
	731	// of characters with a dictionary word, then scan to resynchronize
	732	if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
	733	&& (cuWordLength == 0
	734	\|\| words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
	735	// Look for a plausible word boundary
	736	int32_t remaining = rangeEnd - (current + cuWordLength);
	737	UChar32 pc;
	738	UChar32 uc;
	739	int32_t chars = 0;
	740	for (;;) {
	741	int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
	742	pc = utext_next32(text);
	743	int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
	744	chars += pcSize;
	745	remaining -= pcSize;
	746	if (remaining <= 0) {
	747	break;
	748	}
	749	uc = utext_current32(text);
	750	if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
	751	// Maybe. See if it's in the dictionary.
	752	// TODO: this looks iffy; compare with old code.
	753	int32_t num_candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
	754	utext_setNativeIndex(text, current + cuWordLength + chars);
	755	if (num_candidates > 0) {
	756	break;
	757	}
	758	}
	759	}
	760
	761	// Bump the word count if there wasn't already one
	762	if (cuWordLength <= 0) {
	763	wordsFound += 1;
	764	}
	765
	766	// Update the length with the passed-over characters
	767	cuWordLength += chars;
	768	}
	769	else {
	770	// Back up to where we were for next iteration
	771	utext_setNativeIndex(text, current + cuWordLength);
	772	}
	773	}
	774
	775	// Never stop before a combining mark.
	776	int32_t currPos;
	777	while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
	778	utext_next32(text);
	779	cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
	780	}
	781
	782	// Look ahead for possible suffixes if a dictionary word does not follow.
	783	// We do this in code rather than using a rule so that the heuristic
	784	// resynch continues to function. For example, one of the suffix characters
	785	// could be a typo in the middle of a word.
	786	// NOT CURRENTLY APPLICABLE TO BURMESE
	787
	788	// Did we find a word on this iteration? If so, push it on the break stack
	789	if (cuWordLength > 0) {
	790	foundBreaks.push((current+cuWordLength), status);
	791	}
	792	}
	793
	794	// Don't return a break for the end of the dictionary range if there is one there.
	795	if (foundBreaks.peeki() >= rangeEnd) {
	796	(void) foundBreaks.popi();
	797	wordsFound -= 1;
	798	}
	799
	800	return wordsFound;
	801	}
	802
	803	/*
	804	******************************************************************
	805	* KhmerBreakEngine
	806	*/
	807
	808	// How many words in a row are "good enough"?
	809	static const int32_t KHMER_LOOKAHEAD = 3;
	810
	811	// Will not combine a non-word with a preceding dictionary word longer than this
	812	static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
	813
	814	// Will not combine a non-word that shares at least this much prefix with a
	815	// dictionary word, with a preceding word
	816	static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
	817
	818	// Minimum word size
	819	static const int32_t KHMER_MIN_WORD = 2;
	820
	821	// Minimum number of characters for two words
	822	static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
	823
	824	KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
	825	: DictionaryBreakEngine(),
	826	fDictionary(adoptDictionary)
	827	{
	828	fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
	829	if (U_SUCCESS(status)) {
	830	setCharacters(fKhmerWordSet);
	831	}
	832	fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
	833	fMarkSet.add(0x0020);
	834	fEndWordSet = fKhmerWordSet;
	835	fBeginWordSet.add(0x1780, 0x17B3);
	836	//fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels
	837	//fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word
	838	//fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word
	839	fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters
	840	//fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels
	841	// fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
	842	// fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
	843	// fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
	844	// fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
	845	// fSuffixSet.add(THAI_PAIYANNOI);
	846	// fSuffixSet.add(THAI_MAIYAMOK);
	847
	848	// Compact for caching.
	849	fMarkSet.compact();
	850	fEndWordSet.compact();
	851	fBeginWordSet.compact();
	852	// fSuffixSet.compact();
	853	}
	854
	855	KhmerBreakEngine::~KhmerBreakEngine() {
	856	delete fDictionary;
	857	}
	858
	859	int32_t
	860	KhmerBreakEngine::divideUpDictionaryRange( UText *text,
	861	int32_t rangeStart,
	862	int32_t rangeEnd,
	863	UVector32 &foundBreaks ) const {
	864	if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
	865	return 0; // Not enough characters for two words
	866	}
	867
	868	uint32_t wordsFound = 0;
	869	int32_t cpWordLength = 0;
	870	int32_t cuWordLength = 0;
	871	int32_t current;
	872	UErrorCode status = U_ZERO_ERROR;
	873	PossibleWord words[KHMER_LOOKAHEAD];
	874
	875	utext_setNativeIndex(text, rangeStart);
	876
	877	while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
	878	cuWordLength = 0;
	879	cpWordLength = 0;
	880
	881	// Look for candidate words at the current position
	882	int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
	883
	884	// If we found exactly one, use that
	885	if (candidates == 1) {
	886	cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
	887	cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
	888	wordsFound += 1;
	889	}
	890
	891	// If there was more than one, see which one can take us forward the most words
	892	else if (candidates > 1) {
	893	// If we're already at the end of the range, we're done
	894	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
	895	goto foundBest;
	896	}
	897	do {
	898	int32_t wordsMatched = 1;
	899	if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
	900	if (wordsMatched < 2) {
	901	// Followed by another dictionary word; mark first word as a good candidate
	902	words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
	903	wordsMatched = 2;
	904	}
	905
	906	// If we're already at the end of the range, we're done
	907	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
	908	goto foundBest;
	909	}
	910
	911	// See if any of the possible second words is followed by a third word
	912	do {
	913	// If we find a third word, stop right away
	914	if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
	915	words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
	916	goto foundBest;
	917	}
	918	}
	919	while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
	920	}
	921	}
	922	while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
	923	foundBest:
	924	cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
	925	cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
	926	wordsFound += 1;
	927	}
	928
	929	// We come here after having either found a word or not. We look ahead to the
	930	// next word. If it's not a dictionary word, we will combine it with the word we
	931	// just found (if there is one), but only if the preceding word does not exceed
	932	// the threshold.
	933	// The text iterator should now be positioned at the end of the word we found.
	934	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
	935	// if it is a dictionary word, do nothing. If it isn't, then if there is
	936	// no preceding word, or the non-word shares less than the minimum threshold
	937	// of characters with a dictionary word, then scan to resynchronize
	938	if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
	939	&& (cuWordLength == 0
	940	\|\| words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
	941	// Look for a plausible word boundary
	942	int32_t remaining = rangeEnd - (current+cuWordLength);
	943	UChar32 pc;
	944	UChar32 uc;
	945	int32_t chars = 0;
	946	for (;;) {
	947	int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
	948	pc = utext_next32(text);
	949	int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
	950	chars += pcSize;
	951	remaining -= pcSize;
	952	if (remaining <= 0) {
	953	break;
	954	}
	955	uc = utext_current32(text);
	956	if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
	957	// Maybe. See if it's in the dictionary.
	958	int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
	959	utext_setNativeIndex(text, current+cuWordLength+chars);
	960	if (num_candidates > 0) {
	961	break;
	962	}
	963	}
	964	}
	965
	966	// Bump the word count if there wasn't already one
	967	if (cuWordLength <= 0) {
	968	wordsFound += 1;
	969	}
	970
	971	// Update the length with the passed-over characters
	972	cuWordLength += chars;
	973	}
	974	else {
	975	// Back up to where we were for next iteration
	976	utext_setNativeIndex(text, current+cuWordLength);
	977	}
	978	}
	979
	980	// Never stop before a combining mark.
	981	int32_t currPos;
	982	while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
	983	utext_next32(text);
	984	cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
	985	}
	986
	987	// Look ahead for possible suffixes if a dictionary word does not follow.
	988	// We do this in code rather than using a rule so that the heuristic
	989	// resynch continues to function. For example, one of the suffix characters
	990	// could be a typo in the middle of a word.
	991	// if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
	992	// if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
	993	// && fSuffixSet.contains(uc = utext_current32(text))) {
	994	// if (uc == KHMER_PAIYANNOI) {
	995	// if (!fSuffixSet.contains(utext_previous32(text))) {
	996	// // Skip over previous end and PAIYANNOI
	997	// utext_next32(text);
	998	// utext_next32(text);
	999	// wordLength += 1; // Add PAIYANNOI to word
	1000	// uc = utext_current32(text); // Fetch next character
	1001	// }
	1002	// else {
	1003	// // Restore prior position
	1004	// utext_next32(text);
	1005	// }
	1006	// }
	1007	// if (uc == KHMER_MAIYAMOK) {
	1008	// if (utext_previous32(text) != KHMER_MAIYAMOK) {
	1009	// // Skip over previous end and MAIYAMOK
	1010	// utext_next32(text);
	1011	// utext_next32(text);
	1012	// wordLength += 1; // Add MAIYAMOK to word
	1013	// }
	1014	// else {
	1015	// // Restore prior position
	1016	// utext_next32(text);
	1017	// }
	1018	// }
	1019	// }
	1020	// else {
	1021	// utext_setNativeIndex(text, current+wordLength);
	1022	// }
	1023	// }
	1024
	1025	// Did we find a word on this iteration? If so, push it on the break stack
	1026	if (cuWordLength > 0) {
	1027	foundBreaks.push((current+cuWordLength), status);
	1028	}
	1029	}
	1030
	1031	// Don't return a break for the end of the dictionary range if there is one there.
	1032	if (foundBreaks.peeki() >= rangeEnd) {
	1033	(void) foundBreaks.popi();
	1034	wordsFound -= 1;
	1035	}
	1036
	1037	return wordsFound;
	1038	}
	1039
	1040	#if !UCONFIG_NO_NORMALIZATION
	1041	/*
	1042	******************************************************************
	1043	* CjkBreakEngine
	1044	*/
	1045	static const uint32_t kuint32max = 0xFFFFFFFF;
	1046	CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
	1047	: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
	1048	// Korean dictionary only includes Hangul syllables
	1049	fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
	1050	fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
	1051	fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
	1052	fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
	1053	nfkcNorm2 = Normalizer2::getNFKCInstance(status);
	1054
	1055	if (U_SUCCESS(status)) {
	1056	// handle Korean and Japanese/Chinese using different dictionaries
	1057	if (type == kKorean) {
	1058	setCharacters(fHangulWordSet);
	1059	} else { //Chinese and Japanese
	1060	UnicodeSet cjSet;
	1061	cjSet.addAll(fHanWordSet);
	1062	cjSet.addAll(fKatakanaWordSet);
	1063	cjSet.addAll(fHiraganaWordSet);
	1064	cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
	1065	cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
	1066	setCharacters(cjSet);
	1067	}
	1068	}
	1069	}
	1070
	1071	CjkBreakEngine::~CjkBreakEngine(){
	1072	delete fDictionary;
	1073	}
	1074
	1075	// The katakanaCost values below are based on the length frequencies of all
	1076	// katakana phrases in the dictionary
	1077	static const int32_t kMaxKatakanaLength = 8;
	1078	static const int32_t kMaxKatakanaGroupLength = 20;
	1079	static const uint32_t maxSnlp = 255;
	1080
	1081	static inline uint32_t getKatakanaCost(int32_t wordLength){
	1082	//TODO: fill array with actual values from dictionary!
	1083	static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
	1084	= {8192, 984, 408, 240, 204, 252, 300, 372, 480};
	1085	return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
	1086	}
	1087
	1088	static inline bool isKatakana(UChar32 value) {
	1089	return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) \|\|
	1090	(value >= 0xFF66 && value <= 0xFF9f);
	1091	}
	1092
	1093
	1094	// Function for accessing internal utext flags.
	1095	// Replicates an internal UText function.
	1096
	1097	static inline int32_t utext_i32_flag(int32_t bitIndex) {
	1098	return (int32_t)1 << bitIndex;
	1099	}
	1100
	1101
	1102	/*
	1103	* @param text A UText representing the text
	1104	* @param rangeStart The start of the range of dictionary characters
	1105	* @param rangeEnd The end of the range of dictionary characters
	1106	* @param foundBreaks vector<int32> to receive the break positions
	1107	* @return The number of breaks found
	1108	*/
	1109	int32_t
	1110	CjkBreakEngine::divideUpDictionaryRange( UText *inText,
	1111	int32_t rangeStart,
	1112	int32_t rangeEnd,
	1113	UVector32 &foundBreaks ) const {
	1114	if (rangeStart >= rangeEnd) {
	1115	return 0;
	1116	}
	1117
	1118	// UnicodeString version of input UText, NFKC normalized if necessary.
	1119	UnicodeString inString;
	1120
	1121	// inputMap[inStringIndex] = corresponding native index from UText inText.
	1122	// If NULL then mapping is 1:1
	1123	LocalPointer<UVector32> inputMap;
	1124
	1125	UErrorCode status = U_ZERO_ERROR;
	1126
	1127
	1128	// if UText has the input string as one contiguous UTF-16 chunk
	1129	if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
	1130	inText->chunkNativeStart <= rangeStart &&
	1131	inText->chunkNativeLimit >= rangeEnd &&
	1132	inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
	1133
	1134	// Input UText is in one contiguous UTF-16 chunk.
	1135	// Use Read-only aliasing UnicodeString.
	1136	inString.setTo(FALSE,
	1137	inText->chunkContents + rangeStart - inText->chunkNativeStart,
	1138	rangeEnd - rangeStart);
	1139	} else {
	1140	// Copy the text from the original inText (UText) to inString (UnicodeString).
	1141	// Create a map from UnicodeString indices -> UText offsets.
	1142	utext_setNativeIndex(inText, rangeStart);
	1143	int32_t limit = rangeEnd;
	1144	U_ASSERT(limit <= utext_nativeLength(inText));
	1145	if (limit > utext_nativeLength(inText)) {
	1146	limit = (int32_t)utext_nativeLength(inText);
	1147	}
	1148	inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
	1149	if (U_FAILURE(status)) {
	1150	return 0;
	1151	}
	1152	while (utext_getNativeIndex(inText) < limit) {
	1153	int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
	1154	UChar32 c = utext_next32(inText);
	1155	U_ASSERT(c != U_SENTINEL);
	1156	inString.append(c);
	1157	while (inputMap->size() < inString.length()) {
	1158	inputMap->addElement(nativePosition, status);
	1159	}
	1160	}
	1161	inputMap->addElement(limit, status);
	1162	}
	1163
	1164
	1165	if (!nfkcNorm2->isNormalized(inString, status)) {
	1166	UnicodeString normalizedInput;
	1167	// normalizedMap[normalizedInput position] == original UText position.
	1168	LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
	1169	if (U_FAILURE(status)) {
	1170	return 0;
	1171	}
	1172
	1173	UnicodeString fragment;
	1174	UnicodeString normalizedFragment;
	1175	for (int32_t srcI = 0; srcI < inString.length();) { // Once per normalization chunk
	1176	fragment.remove();
	1177	int32_t fragmentStartI = srcI;
	1178	UChar32 c = inString.char32At(srcI);
	1179	for (;;) {
	1180	fragment.append(c);
	1181	srcI = inString.moveIndex32(srcI, 1);
	1182	if (srcI == inString.length()) {
	1183	break;
	1184	}
	1185	c = inString.char32At(srcI);
	1186	if (nfkcNorm2->hasBoundaryBefore(c)) {
	1187	break;
	1188	}
	1189	}
	1190	nfkcNorm2->normalize(fragment, normalizedFragment, status);
	1191	normalizedInput.append(normalizedFragment);
	1192
	1193	// Map every position in the normalized chunk to the start of the chunk
	1194	// in the original input.
	1195	int32_t fragmentOriginalStart = inputMap.isValid() ?
	1196	inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
	1197	while (normalizedMap->size() < normalizedInput.length()) {
	1198	normalizedMap->addElement(fragmentOriginalStart, status);
	1199	if (U_FAILURE(status)) {
	1200	break;
	1201	}
	1202	}
	1203	}
	1204	U_ASSERT(normalizedMap->size() == normalizedInput.length());
	1205	int32_t nativeEnd = inputMap.isValid() ?
	1206	inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
	1207	normalizedMap->addElement(nativeEnd, status);
	1208
	1209	inputMap = std::move(normalizedMap);
	1210	inString = std::move(normalizedInput);
	1211	}
	1212
	1213	int32_t numCodePts = inString.countChar32();
	1214	if (numCodePts != inString.length()) {
	1215	// There are supplementary characters in the input.
	1216	// The dictionary will produce boundary positions in terms of code point indexes,
	1217	// not in terms of code unit string indexes.
	1218	// Use the inputMap mechanism to take care of this in addition to indexing differences
	1219	// from normalization and/or UTF-8 input.
	1220	UBool hadExistingMap = inputMap.isValid();
	1221	if (!hadExistingMap) {
	1222	inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
	1223	if (U_FAILURE(status)) {
	1224	return 0;
	1225	}
	1226	}
	1227	int32_t cpIdx = 0;
	1228	for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
	1229	U_ASSERT(cuIdx >= cpIdx);
	1230	if (hadExistingMap) {
	1231	inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
	1232	} else {
	1233	inputMap->addElement(cuIdx+rangeStart, status);
	1234	}
	1235	cpIdx++;
	1236	if (cuIdx == inString.length()) {
	1237	break;
	1238	}
	1239	}
	1240	}
	1241
	1242	// bestSnlp[i] is the snlp of the best segmentation of the first i
	1243	// code points in the range to be matched.
	1244	UVector32 bestSnlp(numCodePts + 1, status);
	1245	bestSnlp.addElement(0, status);
	1246	for(int32_t i = 1; i <= numCodePts; i++) {
	1247	bestSnlp.addElement(kuint32max, status);
	1248	}
	1249
	1250
	1251	// prev[i] is the index of the last CJK code point in the previous word in
	1252	// the best segmentation of the first i characters.
	1253	UVector32 prev(numCodePts + 1, status);
	1254	for(int32_t i = 0; i <= numCodePts; i++){
	1255	prev.addElement(-1, status);
	1256	}
	1257
	1258	const int32_t maxWordSize = 20;
	1259	UVector32 values(numCodePts, status);
	1260	values.setSize(numCodePts);
	1261	UVector32 lengths(numCodePts, status);
	1262	lengths.setSize(numCodePts);
	1263
	1264	UText fu = UTEXT_INITIALIZER;
	1265	utext_openUnicodeString(&fu, &inString, &status);
	1266
	1267	// Dynamic programming to find the best segmentation.
	1268
	1269	// In outer loop, i is the code point index,
	1270	// ix is the corresponding string (code unit) index.
	1271	// They differ when the string contains supplementary characters.
	1272	int32_t ix = 0;
	1273	bool is_prev_katakana = false;
	1274	for (int32_t i = 0; i < numCodePts; ++i, ix = inString.moveIndex32(ix, 1)) {
	1275	if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
	1276	continue;
	1277	}
	1278
	1279	int32_t count;
	1280	utext_setNativeIndex(&fu, ix);
	1281	count = fDictionary->matches(&fu, maxWordSize, numCodePts,
	1282	NULL, lengths.getBuffer(), values.getBuffer(), NULL);
	1283	// Note: lengths is filled with code point lengths
	1284	// The NULL parameter is the ignored code unit lengths.
	1285
	1286	// if there are no single character matches found in the dictionary
	1287	// starting with this character, treat character as a 1-character word
	1288	// with the highest value possible, i.e. the least likely to occur.
	1289	// Exclude Korean characters from this treatment, as they should be left
	1290	// together by default.
	1291	if ((count == 0 \|\| lengths.elementAti(0) != 1) &&
	1292	!fHangulWordSet.contains(inString.char32At(ix))) {
	1293	values.setElementAt(maxSnlp, count); // 255
	1294	lengths.setElementAt(1, count++);
	1295	}
	1296
	1297	for (int32_t j = 0; j < count; j++) {
	1298	uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
	1299	int32_t ln_j_i = lengths.elementAti(j) + i;
	1300	if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
	1301	bestSnlp.setElementAt(newSnlp, ln_j_i);
	1302	prev.setElementAt(i, ln_j_i);
	1303	}
	1304	}
	1305
	1306	// In Japanese,
	1307	// Katakana word in single character is pretty rare. So we apply
	1308	// the following heuristic to Katakana: any continuous run of Katakana
	1309	// characters is considered a candidate word with a default cost
	1310	// specified in the katakanaCost table according to its length.
	1311
	1312	bool is_katakana = isKatakana(inString.char32At(ix));
	1313	int32_t katakanaRunLength = 1;
	1314	if (!is_prev_katakana && is_katakana) {
	1315	int32_t j = inString.moveIndex32(ix, 1);
	1316	// Find the end of the continuous run of Katakana characters
	1317	while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
	1318	isKatakana(inString.char32At(j))) {
	1319	j = inString.moveIndex32(j, 1);
	1320	katakanaRunLength++;
	1321	}
	1322	if (katakanaRunLength < kMaxKatakanaGroupLength) {
	1323	uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
	1324	if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
	1325	bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
	1326	prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i;
	1327	}
	1328	}
	1329	}
	1330	is_prev_katakana = is_katakana;
	1331	}
	1332	utext_close(&fu);
	1333
	1334	// Start pushing the optimal offset index into t_boundary (t for tentative).
	1335	// prev[numCodePts] is guaranteed to be meaningful.
	1336	// We'll first push in the reverse order, i.e.,
	1337	// t_boundary[0] = numCodePts, and afterwards do a swap.
	1338	UVector32 t_boundary(numCodePts+1, status);
	1339
	1340	int32_t numBreaks = 0;
	1341	// No segmentation found, set boundary to end of range
	1342	if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
	1343	t_boundary.addElement(numCodePts, status);
	1344	numBreaks++;
	1345	} else {
	1346	for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
	1347	t_boundary.addElement(i, status);
	1348	numBreaks++;
	1349	}
	1350	U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
	1351	}
	1352
	1353	// Add a break for the start of the dictionary range if there is not one
	1354	// there already.
	1355	if (foundBreaks.size() == 0 \|\| foundBreaks.peeki() < rangeStart) {
	1356	t_boundary.addElement(0, status);
	1357	numBreaks++;
	1358	}
	1359
	1360	// Now that we're done, convert positions in t_boundary[] (indices in
	1361	// the normalized input string) back to indices in the original input UText
	1362	// while reversing t_boundary and pushing values to foundBreaks.
	1363	int32_t prevCPPos = -1;
	1364	int32_t prevUTextPos = -1;
	1365	for (int32_t i = numBreaks-1; i >= 0; i--) {
	1366	int32_t cpPos = t_boundary.elementAti(i);
	1367	U_ASSERT(cpPos > prevCPPos);
	1368	int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
	1369	U_ASSERT(utextPos >= prevUTextPos);
	1370	if (utextPos > prevUTextPos) {
	1371	// Boundaries are added to foundBreaks output in ascending order.
	1372	U_ASSERT(foundBreaks.size() == 0 \|\| foundBreaks.peeki() < utextPos);
	1373	foundBreaks.push(utextPos, status);
	1374	} else {
	1375	// Normalization expanded the input text, the dictionary found a boundary
	1376	// within the expansion, giving two boundaries with the same index in the
	1377	// original text. Ignore the second. See ticket #12918.
	1378	--numBreaks;
	1379	}
	1380	prevCPPos = cpPos;
	1381	prevUTextPos = utextPos;
	1382	}
	1383	(void)prevCPPos; // suppress compiler warnings about unused variable
	1384
	1385	// inString goes out of scope
	1386	// inputMap goes out of scope
	1387	return numBreaks;
	1388	}
	1389	#endif
	1390
	1391	U_NAMESPACE_END
	1392
	1393	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
	1394