git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	// © 2016 and later: Unicode, Inc. and others.
	2	// License & terms of use: http://www.unicode.org/copyright.html
	3	/*
	4	***************************************************************************
	5	* Copyright (C) 1999-2016 International Business Machines Corporation
	6	* and others. All rights reserved.
	7	***************************************************************************
	8
	9	**********************************************************************
	10	* Legacy version of RuleBasedBreakIterator from ICU 57,
	11	* only for use by Apple RuleBasedTokenizer
	12	**********************************************************************
	13	*/
	14
	15	#include "utypeinfo.h" // for 'typeid' to work
	16
	17	#include "unicode/utypes.h"
	18
	19	#if !UCONFIG_NO_BREAK_ITERATION
	20
	21	#include "unicode/schriter.h"
	22	#include "unicode/uchriter.h"
	23	#include "unicode/udata.h"
	24	#include "unicode/uclean.h"
	25	#include "unicode/utext.h"
	26	#include "rbbidata57.h"
	27	#include "rbbirb57.h"
	28	#include "rbbi57.h"
	29	#include "cmemory.h"
	30	#include "cstring.h"
	31	#include "umutex.h"
	32	#include "ucln_cmn.h"
	33	#include "brkeng.h"
	34	#include "utrie.h"
	35
	36	#include "uassert.h"
	37	#include "uvectr32.h"
	38
	39	// if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
	40	#if U_LOCAL_SERVICE_HOOK
	41	#include "localsvc.h"
	42	#endif
	43
	44	#ifdef RBBI_DEBUG
	45	static UBool fTrace = FALSE; // set to TRUE by init() when the U_RBBIDEBUG environment variable contains "trace"
	46	#endif
	47
	48	U_NAMESPACE_BEGIN
	49
	50	// The state number of the starting state of the rule state machine.
	51	#define START_STATE 1
	52
	53	// The state-transition value indicating "stop".
	54	#define STOP_STATE 0
	55
	56
	57	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator57)
	58
	59
	60	//=======================================================================
	61	// constructors
	62	//=======================================================================
	63
	64	/**
	65	* Constructs a RuleBasedBreakIterator57 that uses the already-created
	66	* tables object that is passed in as a parameter.
	67	*/
	68	RuleBasedBreakIterator57::RuleBasedBreakIterator57(RBBIDataHeader57* data, UErrorCode &status)
	69	{
	70	init();
	71	fData = new RBBIDataWrapper57(data, status); // status checked in constructor
	72	if (U_FAILURE(status)) {return;}
	73	if(fData == 0) {
	74	status = U_MEMORY_ALLOCATION_ERROR;
	75	return;
	76	}
	77	}
	78
	79	/**
	80	* Same as above but does not adopt memory
	81	*/
	82	RuleBasedBreakIterator57::RuleBasedBreakIterator57(const RBBIDataHeader57* data, enum EDontAdopt, UErrorCode &status)
	83	{
	84	init();
	85	fData = new RBBIDataWrapper57(data, RBBIDataWrapper57::kDontAdopt, status); // status checked in constructor
	86	if (U_FAILURE(status)) {return;}
	87	if(fData == 0) {
	88	status = U_MEMORY_ALLOCATION_ERROR;
	89	return;
	90	}
	91	}
	92
	93
	94	#if 0
	95	// not used by rbtok.cpp
	96
	97	//
	98	// Construct from precompiled binary rules (tables). This constructor is public API,
	99	// taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
	100	//
	101	RuleBasedBreakIterator57::RuleBasedBreakIterator57(const uint8_t *compiledRules,
	102	uint32_t ruleLength,
	103	UErrorCode &status) {
	104	init();
	105	if (U_FAILURE(status)) {
	106	return;
	107	}
	108	if (compiledRules == NULL \|\| ruleLength < sizeof(RBBIDataHeader57)) {
	109	status = U_ILLEGAL_ARGUMENT_ERROR;
	110	return;
	111	}
	112	const RBBIDataHeader57 data = (const RBBIDataHeader57 )compiledRules;
	113	if (data->fLength > ruleLength) {
	114	status = U_ILLEGAL_ARGUMENT_ERROR;
	115	return;
	116	}
	117	fData = new RBBIDataWrapper57(data, RBBIDataWrapper57::kDontAdopt, status);
	118	if (U_FAILURE(status)) {return;}
	119	if(fData == 0) {
	120	status = U_MEMORY_ALLOCATION_ERROR;
	121	return;
	122	}
	123	}
	124
	125
	126	//-------------------------------------------------------------------------------
	127	//
	128	// Constructor from a UDataMemory handle to precompiled break rules
	129	// stored in an ICU data file.
	130	//
	131	//-------------------------------------------------------------------------------
	132	RuleBasedBreakIterator57::RuleBasedBreakIterator57(UDataMemory* udm, UErrorCode &status)
	133	{
	134	init();
	135	fData = new RBBIDataWrapper57(udm, status); // status checked in constructor
	136	if (U_FAILURE(status)) {return;}
	137	if(fData == 0) {
	138	status = U_MEMORY_ALLOCATION_ERROR;
	139	return;
	140	}
	141	}
	142	#endif
	143
	144
	145
	146	//-------------------------------------------------------------------------------
	147	//
	148	// Constructor from a set of rules supplied as a string.
	149	//
	150	//-------------------------------------------------------------------------------
	151	RuleBasedBreakIterator57::RuleBasedBreakIterator57( const UnicodeString &rules,
	152	UParseError &parseError,
	153	UErrorCode &status)
	154	{
	155	init();
	156	if (U_FAILURE(status)) {return;}
	157	RuleBasedBreakIterator57 bi = (RuleBasedBreakIterator57 )
	158	RBBIRuleBuilder57::createRuleBasedBreakIterator(rules, &parseError, status);
	159	// Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
	160	// creates and returns a complete RBBI. From here, in a constructor, we
	161	// can't just return the object created by the builder factory, hence
	162	// the assignment of the factory created object to "this".
	163	if (U_SUCCESS(status)) {
	164	this = bi;
	165	delete bi;
	166	}
	167	}
	168
	169
	170	//-------------------------------------------------------------------------------
	171	//
	172	// Default Constructor. Create an empty shell that can be set up later.
	173	// Used when creating a RuleBasedBreakIterator57 from a set
	174	// of rules.
	175	//-------------------------------------------------------------------------------
	176	RuleBasedBreakIterator57::RuleBasedBreakIterator57() {
	177	init();
	178	}
	179
	180
	181	//-------------------------------------------------------------------------------
	182	//
	183	// Copy constructor. Will produce a break iterator with the same behavior,
	184	// and which iterates over the same text, as the one passed in.
	185	//
	186	//-------------------------------------------------------------------------------
	187	RuleBasedBreakIterator57::RuleBasedBreakIterator57(const RuleBasedBreakIterator57& other)
	188	: BreakIterator(other)
	189	{
	190	this->init();
	191	*this = other;
	192	}
	193
	194
	195	/**
	196	* Destructor
	197	*/
	198	RuleBasedBreakIterator57::~RuleBasedBreakIterator57() {
	199	if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
	200	// fCharIter was adopted from the outside.
	201	delete fCharIter;
	202	}
	203	fCharIter = NULL;
	204	delete fSCharIter;
	205	fCharIter = NULL;
	206	delete fDCharIter;
	207	fDCharIter = NULL;
	208
	209	utext_close(fText);
	210
	211	if (fData != NULL) {
	212	fData->removeReference();
	213	fData = NULL;
	214	}
	215	if (fCachedBreakPositions) {
	216	uprv_free(fCachedBreakPositions);
	217	fCachedBreakPositions = NULL;
	218	}
	219	if (fLanguageBreakEngines) {
	220	delete fLanguageBreakEngines;
	221	fLanguageBreakEngines = NULL;
	222	}
	223	if (fUnhandledBreakEngine) {
	224	delete fUnhandledBreakEngine;
	225	fUnhandledBreakEngine = NULL;
	226	}
	227	}
	228
	229	/**
	230	* Assignment operator. Sets this iterator to have the same behavior,
	231	* and iterate over the same text, as the one passed in.
	232	*/
	233	RuleBasedBreakIterator57&
	234	RuleBasedBreakIterator57::operator=(const RuleBasedBreakIterator57& that) {
	235	if (this == &that) {
	236	return *this;
	237	}
	238	fLineWordOpts = that.fLineWordOpts;
	239	reset(); // Delete break cache information
	240	fBreakType = that.fBreakType;
	241	if (fLanguageBreakEngines != NULL) {
	242	delete fLanguageBreakEngines;
	243	fLanguageBreakEngines = NULL; // Just rebuild for now
	244	}
	245	// TODO: clone fLanguageBreakEngines from "that"
	246	UErrorCode status = U_ZERO_ERROR;
	247	fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
	248
	249	if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
	250	delete fCharIter;
	251	}
	252	fCharIter = NULL;
	253
	254	if (that.fCharIter != NULL ) {
	255	// This is a little bit tricky - it will intially appear that
	256	// this->fCharIter is adopted, even if that->fCharIter was
	257	// not adopted. That's ok.
	258	fCharIter = that.fCharIter->clone();
	259	}
	260
	261	if (fData != NULL) {
	262	fData->removeReference();
	263	fData = NULL;
	264	}
	265	if (that.fData != NULL) {
	266	fData = that.fData->addReference();
	267	}
	268
	269	return *this;
	270	}
	271
	272
	273
	274	//-----------------------------------------------------------------------------
	275	//
	276	// init() Shared initialization routine. Used by all the constructors.
	277	// Initializes all fields, leaving the object in a consistent state.
	278	//
	279	//-----------------------------------------------------------------------------
	280	void RuleBasedBreakIterator57::init() {
	281	UErrorCode status = U_ZERO_ERROR;
	282	fText = utext_openUChars(NULL, NULL, 0, &status);
	283	fCharIter = NULL;
	284	fSCharIter = NULL;
	285	fDCharIter = NULL;
	286	fData = NULL;
	287	fLastRuleStatusIndex = 0;
	288	fLastStatusIndexValid = TRUE;
	289	fDictionaryCharCount = 0;
	290	fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable
	291	// dictionary behavior for Break Iterators that are
	292	// built from rules. Even better would be the ability to
	293	// declare the type in the rules.
	294
	295	fCachedBreakPositions = NULL;
	296	fLanguageBreakEngines = NULL;
	297	fUnhandledBreakEngine = NULL;
	298	fNumCachedBreakPositions = 0;
	299	fPositionInCache = 0;
	300
	301	#ifdef RBBI_DEBUG
	302	static UBool debugInitDone = FALSE;
	303	if (debugInitDone == FALSE) {
	304	char *debugEnv = getenv("U_RBBIDEBUG");
	305	if (debugEnv && uprv_strstr(debugEnv, "trace")) {
	306	fTrace = TRUE;
	307	}
	308	debugInitDone = TRUE;
	309	}
	310	#endif
	311	}
	312
	313
	314
	315	//-----------------------------------------------------------------------------
	316	//
	317	// clone - Returns a newly-constructed RuleBasedBreakIterator57 with the same
	318	// behavior, and iterating over the same text, as this one.
	319	// Virtual function: does the right thing with subclasses.
	320	//
	321	//-----------------------------------------------------------------------------
	322	BreakIterator*
	323	RuleBasedBreakIterator57::clone(void) const {
	324	return new RuleBasedBreakIterator57(*this);
	325	}
	326
	327	/**
	328	* Equality operator. Returns TRUE if both BreakIterators are of the
	329	* same class, have the same behavior, and iterate over the same text.
	330	*/
	331	UBool
	332	RuleBasedBreakIterator57::operator==(const BreakIterator& that) const {
	333	if (typeid(*this) != typeid(that)) {
	334	return FALSE;
	335	}
	336
	337	const RuleBasedBreakIterator57& that2 = (const RuleBasedBreakIterator57&) that;
	338	if (that2.fLineWordOpts != fLineWordOpts) {
	339	return FALSE;
	340	}
	341
	342	if (!utext_equals(fText, that2.fText)) {
	343	// The two break iterators are operating on different text,
	344	// or have a different interation position.
	345	return FALSE;
	346	};
	347
	348	// TODO: need a check for when in a dictionary region at different offsets.
	349
	350	if (that2.fData == fData \|\|
	351	(fData != NULL && that2.fData != NULL && that2.fData == fData)) {
	352	// The two break iterators are using the same rules.
	353	return TRUE;
	354	}
	355	return FALSE;
	356	}
	357
	358	/**
	359	* Compute a hash code for this BreakIterator
	360	* @return A hash code
	361	*/
	362	int32_t
	363	RuleBasedBreakIterator57::hashCode(void) const {
	364	int32_t hash = 0;
	365	if (fData != NULL) {
	366	hash = fData->hashCode();
	367	}
	368	return hash;
	369	}
	370
	371
	372	void RuleBasedBreakIterator57::setText(UText *ut, UErrorCode &status) {
	373	if (U_FAILURE(status)) {
	374	return;
	375	}
	376	reset();
	377	fText = utext_clone(fText, ut, FALSE, TRUE, &status);
	378
	379	// Set up a dummy CharacterIterator to be returned if anyone
	380	// calls getText(). With input from UText, there is no reasonable
	381	// way to return a characterIterator over the actual input text.
	382	// Return one over an empty string instead - this is the closest
	383	// we can come to signaling a failure.
	384	// (GetText() is obsolete, this failure is sort of OK)
	385	if (fDCharIter == NULL) {
	386	static const UChar c = 0;
	387	fDCharIter = new UCharCharacterIterator(&c, 0);
	388	if (fDCharIter == NULL) {
	389	status = U_MEMORY_ALLOCATION_ERROR;
	390	return;
	391	}
	392	}
	393
	394	if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
	395	// existing fCharIter was adopted from the outside. Delete it now.
	396	delete fCharIter;
	397	}
	398	fCharIter = fDCharIter;
	399
	400	this->first();
	401	}
	402
	403
	404	UText RuleBasedBreakIterator57::getUText(UText fillIn, UErrorCode &status) const {
	405	UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
	406	return result;
	407	}
	408
	409
	410
	411	#if 0
	412	// not used by rbtok.cpp
	413	/**
	414	* Returns the description used to create this iterator
	415	*/
	416	const UnicodeString&
	417	RuleBasedBreakIterator57::getRules() const {
	418	if (fData != NULL) {
	419	return fData->getRuleSourceString();
	420	} else {
	421	static const UnicodeString *s;
	422	if (s == NULL) {
	423	// TODO: something more elegant here.
	424	// perhaps API should return the string by value.
	425	// Note: thread unsafe init & leak are semi-ok, better than
	426	// what was before. Sould be cleaned up, though.
	427	s = new UnicodeString;
	428	}
	429	return *s;
	430	}
	431	}
	432	#endif
	433
	434	//=======================================================================
	435	// BreakIterator overrides
	436	//=======================================================================
	437
	438	/**
	439	* Return a CharacterIterator over the text being analyzed.
	440	*/
	441	CharacterIterator&
	442	RuleBasedBreakIterator57::getText() const {
	443	return *fCharIter;
	444	}
	445
	446	/**
	447	* Set the iterator to analyze a new piece of text. This function resets
	448	* the current iteration position to the beginning of the text.
	449	* @param newText An iterator over the text to analyze.
	450	*/
	451	void
	452	RuleBasedBreakIterator57::adoptText(CharacterIterator* newText) {
	453	// If we are holding a CharacterIterator adopted from a
	454	// previous call to this function, delete it now.
	455	if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
	456	delete fCharIter;
	457	}
	458
	459	fCharIter = newText;
	460	UErrorCode status = U_ZERO_ERROR;
	461	reset();
	462	if (newText==NULL \|\| newText->startIndex() != 0) {
	463	// startIndex !=0 wants to be an error, but there's no way to report it.
	464	// Make the iterator text be an empty string.
	465	fText = utext_openUChars(fText, NULL, 0, &status);
	466	} else {
	467	fText = utext_openCharacterIterator(fText, newText, &status);
	468	}
	469	this->first();
	470	}
	471
	472	/**
	473	* Set the iterator to analyze a new piece of text. This function resets
	474	* the current iteration position to the beginning of the text.
	475	* @param newText An iterator over the text to analyze.
	476	*/
	477	void
	478	RuleBasedBreakIterator57::setText(const UnicodeString& newText) {
	479	UErrorCode status = U_ZERO_ERROR;
	480	reset();
	481	fText = utext_openConstUnicodeString(fText, &newText, &status);
	482
	483	// Set up a character iterator on the string.
	484	// Needed in case someone calls getText().
	485	// Can not, unfortunately, do this lazily on the (probably never)
	486	// call to getText(), because getText is const.
	487	if (fSCharIter == NULL) {
	488	fSCharIter = new StringCharacterIterator(newText);
	489	} else {
	490	fSCharIter->setText(newText);
	491	}
	492
	493	if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
	494	// old fCharIter was adopted from the outside. Delete it.
	495	delete fCharIter;
	496	}
	497	fCharIter = fSCharIter;
	498
	499	this->first();
	500	}
	501
	502
	503	/**
	504	* Provide a new UText for the input text. Must reference text with contents identical
	505	* to the original.
	506	* Intended for use with text data originating in Java (garbage collected) environments
	507	* where the data may be moved in memory at arbitrary times.
	508	*/
	509	RuleBasedBreakIterator57 &RuleBasedBreakIterator57::refreshInputText(UText *input, UErrorCode &status) {
	510	if (U_FAILURE(status)) {
	511	return *this;
	512	}
	513	if (input == NULL) {
	514	status = U_ILLEGAL_ARGUMENT_ERROR;
	515	return *this;
	516	}
	517	int64_t pos = utext_getNativeIndex(fText);
	518	// Shallow read-only clone of the new UText into the existing input UText
	519	fText = utext_clone(fText, input, FALSE, TRUE, &status);
	520	if (U_FAILURE(status)) {
	521	return *this;
	522	}
	523	utext_setNativeIndex(fText, pos);
	524	if (utext_getNativeIndex(fText) != pos) {
	525	// Sanity check. The new input utext is supposed to have the exact same
	526	// contents as the old. If we can't set to the same position, it doesn't.
	527	// The contents underlying the old utext might be invalid at this point,
	528	// so it's not safe to check directly.
	529	status = U_ILLEGAL_ARGUMENT_ERROR;
	530	}
	531	return *this;
	532	}
	533
	534
	535	/**
	536	* Sets the current iteration position to the beginning of the text, position zero.
	537	* @return The new iterator position, which is zero.
	538	*/
	539	int32_t RuleBasedBreakIterator57::first(void) {
	540	reset();
	541	fLastRuleStatusIndex = 0;
	542	fLastStatusIndexValid = TRUE;
	543	//if (fText == NULL)
	544	// return BreakIterator::DONE;
	545
	546	utext_setNativeIndex(fText, 0);
	547	return 0;
	548	}
	549
	550	/**
	551	* Sets the current iteration position to the end of the text.
	552	* @return The text's past-the-end offset.
	553	*/
	554	int32_t RuleBasedBreakIterator57::last(void) {
	555	reset();
	556	if (fText == NULL) {
	557	fLastRuleStatusIndex = 0;
	558	fLastStatusIndexValid = TRUE;
	559	return BreakIterator::DONE;
	560	}
	561
	562	fLastStatusIndexValid = FALSE;
	563	int32_t pos = (int32_t)utext_nativeLength(fText);
	564	utext_setNativeIndex(fText, pos);
	565	return pos;
	566	}
	567
	568	/**
	569	* Advances the iterator either forward or backward the specified number of steps.
	570	* Negative values move backward, and positive values move forward. This is
	571	* equivalent to repeatedly calling next() or previous().
	572	* @param n The number of steps to move. The sign indicates the direction
	573	* (negative is backwards, and positive is forwards).
	574	* @return The character offset of the boundary position n boundaries away from
	575	* the current one.
	576	*/
	577	int32_t RuleBasedBreakIterator57::next(int32_t n) {
	578	int32_t result = current();
	579	while (n > 0) {
	580	result = next();
	581	--n;
	582	}
	583	while (n < 0) {
	584	result = previous();
	585	++n;
	586	}
	587	return result;
	588	}
	589
	590	/**
	591	* Advances the iterator to the next boundary position.
	592	* @return The position of the first boundary after this one.
	593	*/
	594	int32_t RuleBasedBreakIterator57::next(void) {
	595	// if we have cached break positions and we're still in the range
	596	// covered by them, just move one step forward in the cache
	597	if (fCachedBreakPositions != NULL) {
	598	if (fPositionInCache < fNumCachedBreakPositions - 1) {
	599	++fPositionInCache;
	600	int32_t pos = fCachedBreakPositions[fPositionInCache];
	601	utext_setNativeIndex(fText, pos);
	602	return pos;
	603	}
	604	else {
	605	reset();
	606	}
	607	}
	608
	609	int32_t startPos = current();
	610	fDictionaryCharCount = 0;
	611	int32_t result = handleNext(fData->fForwardTable);
	612	while (fLineWordOpts != UBRK_LINEWORD_NORMAL) {
	613	UChar32 prevChr = utext_char32At(fText, result-1);
	614	UChar32 currChr = utext_char32At(fText, result);
	615	if (currChr == U_SENTINEL \|\| prevChr == U_SENTINEL \|\| !u_isalpha(currChr) \|\| !u_isalpha(prevChr)) {
	616	break;
	617	}
	618	int32_t nextResult = handleNext(fData->fForwardTable);
	619	if (nextResult <= result) {
	620	break;
	621	}
	622	result = nextResult;
	623	}
	624	if (fDictionaryCharCount > 0) {
	625	result = checkDictionary(startPos, result, FALSE);
	626	}
	627	return result;
	628	}
	629
	630	/**
	631	* Advances the iterator backwards, to the last boundary preceding this one.
	632	* @return The position of the last boundary position preceding this one.
	633	*/
	634	int32_t RuleBasedBreakIterator57::previous(void) {
	635	int32_t result;
	636	int32_t startPos;
	637
	638	// if we have cached break positions and we're still in the range
	639	// covered by them, just move one step backward in the cache
	640	if (fCachedBreakPositions != NULL) {
	641	if (fPositionInCache > 0) {
	642	--fPositionInCache;
	643	// If we're at the beginning of the cache, need to reevaluate the
	644	// rule status
	645	if (fPositionInCache <= 0) {
	646	fLastStatusIndexValid = FALSE;
	647	}
	648	int32_t pos = fCachedBreakPositions[fPositionInCache];
	649	utext_setNativeIndex(fText, pos);
	650	return pos;
	651	}
	652	else {
	653	reset();
	654	}
	655	}
	656
	657	// if we're already sitting at the beginning of the text, return DONE
	658	if (fText == NULL \|\| (startPos = current()) == 0) {
	659	fLastRuleStatusIndex = 0;
	660	fLastStatusIndexValid = TRUE;
	661	return BreakIterator::DONE;
	662	}
	663
	664	if (fData->fSafeRevTable != NULL \|\| fData->fSafeFwdTable != NULL) {
	665	result = handlePrevious(fData->fReverseTable);
	666	while (fLineWordOpts != UBRK_LINEWORD_NORMAL) {
	667	UChar32 prevChr = utext_char32At(fText, result-1);
	668	UChar32 currChr = utext_char32At(fText, result);
	669	if (currChr == U_SENTINEL \|\| prevChr == U_SENTINEL \|\| !u_isalpha(currChr) \|\| !u_isalpha(prevChr)) {
	670	break;
	671	}
	672	int32_t prevResult = handlePrevious(fData->fReverseTable);
	673	if (prevResult >= result) {
	674	break;
	675	}
	676	result = prevResult;
	677	}
	678	if (fDictionaryCharCount > 0) {
	679	result = checkDictionary(result, startPos, TRUE);
	680	}
	681	return result;
	682	}
	683
	684	// old rule syntax
	685	// set things up. handlePrevious() will back us up to some valid
	686	// break position before the current position (we back our internal
	687	// iterator up one step to prevent handlePrevious() from returning
	688	// the current position), but not necessarily the last one before
	689	// where we started
	690
	691	int32_t start = current();
	692
	693	(void)UTEXT_PREVIOUS32(fText);
	694	int32_t lastResult = handlePrevious(fData->fReverseTable);
	695	if (lastResult == UBRK_DONE) {
	696	lastResult = 0;
	697	utext_setNativeIndex(fText, 0);
	698	}
	699	result = lastResult;
	700	int32_t lastTag = 0;
	701	UBool breakTagValid = FALSE;
	702
	703	// iterate forward from the known break position until we pass our
	704	// starting point. The last break position before the starting
	705	// point is our return value
	706
	707	for (;;) {
	708	result = next();
	709	if (result == BreakIterator::DONE \|\| result >= start) {
	710	break;
	711	}
	712	lastResult = result;
	713	lastTag = fLastRuleStatusIndex;
	714	breakTagValid = TRUE;
	715	}
	716
	717	// fLastBreakTag wants to have the value for section of text preceding
	718	// the result position that we are to return (in lastResult.) If
	719	// the backwards rules overshot and the above loop had to do two or more
	720	// next()s to move up to the desired return position, we will have a valid
	721	// tag value. But, if handlePrevious() took us to exactly the correct result position,
	722	// we wont have a tag value for that position, which is only set by handleNext().
	723
	724	// Set the current iteration position to be the last break position
	725	// before where we started, and then return that value.
	726	utext_setNativeIndex(fText, lastResult);
	727	fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
	728	fLastStatusIndexValid = breakTagValid;
	729
	730	// No need to check the dictionary; it will have been handled by
	731	// next()
	732
	733	return lastResult;
	734	}
	735
	736	/**
	737	* Sets the iterator to refer to the first boundary position following
	738	* the specified position.
	739	* @offset The position from which to begin searching for a break position.
	740	* @return The position of the first break after the current position.
	741	*/
	742	int32_t RuleBasedBreakIterator57::following(int32_t offset) {
	743	// if the offset passed in is already past the end of the text,
	744	// just return DONE; if it's before the beginning, return the
	745	// text's starting offset
	746	if (fText == NULL \|\| offset >= utext_nativeLength(fText)) {
	747	last();
	748	return next();
	749	}
	750	else if (offset < 0) {
	751	return first();
	752	}
	753
	754	// Move requested offset to a code point start. It might be on a trail surrogate,
	755	// or on a trail byte if the input is UTF-8.
	756	utext_setNativeIndex(fText, offset);
	757	offset = (int32_t)utext_getNativeIndex(fText);
	758
	759	// if we have cached break positions and offset is in the range
	760	// covered by them, use them
	761	// TODO: could use binary search
	762	// TODO: what if offset is outside range, but break is not?
	763	if (fCachedBreakPositions != NULL) {
	764	if (offset >= fCachedBreakPositions[0]
	765	&& offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
	766	fPositionInCache = 0;
	767	// We are guaranteed not to leave the array due to range test above
	768	while (offset >= fCachedBreakPositions[fPositionInCache]) {
	769	++fPositionInCache;
	770	}
	771	int32_t pos = fCachedBreakPositions[fPositionInCache];
	772	utext_setNativeIndex(fText, pos);
	773	return pos;
	774	}
	775	else {
	776	reset();
	777	}
	778	}
	779
	780	// Set our internal iteration position (temporarily)
	781	// to the position passed in. If this is the _beginning_ position,
	782	// then we can just use next() to get our return value
	783
	784	int32_t result = 0;
	785
	786	if (fData->fSafeRevTable != NULL) {
	787	// new rule syntax
	788	utext_setNativeIndex(fText, offset);
	789	// move forward one codepoint to prepare for moving back to a
	790	// safe point.
	791	// this handles offset being between a supplementary character
	792	// TODO: is this still needed, with move to code point boundary handled above?
	793	(void)UTEXT_NEXT32(fText);
	794	// handlePrevious will move most of the time to < 1 boundary away
	795	handlePrevious(fData->fSafeRevTable);
	796	int32_t result = next();
	797	while (result <= offset) {
	798	result = next();
	799	}
	800	return result;
	801	}
	802	if (fData->fSafeFwdTable != NULL) {
	803	// backup plan if forward safe table is not available
	804	utext_setNativeIndex(fText, offset);
	805	(void)UTEXT_PREVIOUS32(fText);
	806	// handle next will give result >= offset
	807	handleNext(fData->fSafeFwdTable);
	808	// previous will give result 0 or 1 boundary away from offset,
	809	// most of the time
	810	// we have to
	811	int32_t oldresult = previous();
	812	while (oldresult > offset) {
	813	int32_t result = previous();
	814	if (result <= offset) {
	815	return oldresult;
	816	}
	817	oldresult = result;
	818	}
	819	int32_t result = next();
	820	if (result <= offset) {
	821	return next();
	822	}
	823	return result;
	824	}
	825	// otherwise, we have to sync up first. Use handlePrevious() to back
	826	// up to a known break position before the specified position (if
	827	// we can determine that the specified position is a break position,
	828	// we don't back up at all). This may or may not be the last break
	829	// position at or before our starting position. Advance forward
	830	// from here until we've passed the starting position. The position
	831	// we stop on will be the first break position after the specified one.
	832	// old rule syntax
	833
	834	utext_setNativeIndex(fText, offset);
	835	if (offset==0 \|\|
	836	(offset==1 && utext_getNativeIndex(fText)==0)) {
	837	return next();
	838	}
	839	result = previous();
	840
	841	while (result != BreakIterator::DONE && result <= offset) {
	842	result = next();
	843	}
	844
	845	return result;
	846	}
	847
	848	/**
	849	* Sets the iterator to refer to the last boundary position before the
	850	* specified position.
	851	* @offset The position to begin searching for a break from.
	852	* @return The position of the last boundary before the starting position.
	853	*/
	854	int32_t RuleBasedBreakIterator57::preceding(int32_t offset) {
	855	// if the offset passed in is already past the end of the text,
	856	// just return DONE; if it's before the beginning, return the
	857	// text's starting offset
	858	if (fText == NULL \|\| offset > utext_nativeLength(fText)) {
	859	return last();
	860	}
	861	else if (offset < 0) {
	862	return first();
	863	}
	864
	865	// Move requested offset to a code point start. It might be on a trail surrogate,
	866	// or on a trail byte if the input is UTF-8.
	867	utext_setNativeIndex(fText, offset);
	868	offset = (int32_t)utext_getNativeIndex(fText);
	869
	870	// if we have cached break positions and offset is in the range
	871	// covered by them, use them
	872	if (fCachedBreakPositions != NULL) {
	873	// TODO: binary search?
	874	// TODO: What if offset is outside range, but break is not?
	875	if (offset > fCachedBreakPositions[0]
	876	&& offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
	877	fPositionInCache = 0;
	878	while (fPositionInCache < fNumCachedBreakPositions
	879	&& offset > fCachedBreakPositions[fPositionInCache])
	880	++fPositionInCache;
	881	--fPositionInCache;
	882	// If we're at the beginning of the cache, need to reevaluate the
	883	// rule status
	884	if (fPositionInCache <= 0) {
	885	fLastStatusIndexValid = FALSE;
	886	}
	887	utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
	888	return fCachedBreakPositions[fPositionInCache];
	889	}
	890	else {
	891	reset();
	892	}
	893	}
	894
	895	// if we start by updating the current iteration position to the
	896	// position specified by the caller, we can just use previous()
	897	// to carry out this operation
	898
	899	if (fData->fSafeFwdTable != NULL) {
	900	// new rule syntax
	901	utext_setNativeIndex(fText, offset);
	902	int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
	903	if (newOffset != offset) {
	904	// Will come here if specified offset was not a code point boundary AND
	905	// the underlying implmentation is using UText, which snaps any non-code-point-boundary
	906	// indices to the containing code point.
	907	// For breakitereator::preceding only, these non-code-point indices need to be moved
	908	// up to refer to the following codepoint.
	909	(void)UTEXT_NEXT32(fText);
	910	offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
	911	}
	912
	913	// TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair,
	914	// rather than adjusting the position unconditionally?
	915	// (Change would interact with safe rules.)
	916	// TODO: change RBBI behavior for off-boundary indices to match that of UText?
	917	// affects only preceding(), seems cleaner, but is slightly different.
	918	(void)UTEXT_PREVIOUS32(fText);
	919	handleNext(fData->fSafeFwdTable);
	920	int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
	921	while (result >= offset) {
	922	result = previous();
	923	}
	924	return result;
	925	}
	926	if (fData->fSafeRevTable != NULL) {
	927	// backup plan if forward safe table is not available
	928	// TODO: check whether this path can be discarded
	929	// It's probably OK to say that rules must supply both safe tables
	930	// if they use safe tables at all. We have certainly never described
	931	// to anyone how to work with just one safe table.
	932	utext_setNativeIndex(fText, offset);
	933	(void)UTEXT_NEXT32(fText);
	934
	935	// handle previous will give result <= offset
	936	handlePrevious(fData->fSafeRevTable);
	937
	938	// next will give result 0 or 1 boundary away from offset,
	939	// most of the time
	940	// we have to
	941	int32_t oldresult = next();
	942	while (oldresult < offset) {
	943	int32_t result = next();
	944	if (result >= offset) {
	945	return oldresult;
	946	}
	947	oldresult = result;
	948	}
	949	int32_t result = previous();
	950	if (result >= offset) {
	951	return previous();
	952	}
	953	return result;
	954	}
	955
	956	// old rule syntax
	957	utext_setNativeIndex(fText, offset);
	958	return previous();
	959	}
	960
	961	/**
	962	* Returns true if the specfied position is a boundary position. As a side
	963	* effect, leaves the iterator pointing to the first boundary position at
	964	* or after "offset".
	965	* @param offset the offset to check.
	966	* @return True if "offset" is a boundary position.
	967	*/
	968	UBool RuleBasedBreakIterator57::isBoundary(int32_t offset) {
	969	// the beginning index of the iterator is always a boundary position by definition
	970	if (offset == 0) {
	971	first(); // For side effects on current position, tag values.
	972	return TRUE;
	973	}
	974
	975	if (offset == (int32_t)utext_nativeLength(fText)) {
	976	last(); // For side effects on current position, tag values.
	977	return TRUE;
	978	}
	979
	980	// out-of-range indexes are never boundary positions
	981	if (offset < 0) {
	982	first(); // For side effects on current position, tag values.
	983	return FALSE;
	984	}
	985
	986	if (offset > utext_nativeLength(fText)) {
	987	last(); // For side effects on current position, tag values.
	988	return FALSE;
	989	}
	990
	991	// otherwise, we can use following() on the position before the specified
	992	// one and return true if the position we get back is the one the user
	993	// specified
	994	utext_previous32From(fText, offset);
	995	int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
	996	UBool result = following(backOne) == offset;
	997	return result;
	998	}
	999
	1000	/**
	1001	* Returns the current iteration position.
	1002	* @return The current iteration position.
	1003	*/
	1004	int32_t RuleBasedBreakIterator57::current(void) const {
	1005	int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
	1006	return pos;
	1007	}
	1008
	1009	//=======================================================================
	1010	// implementation
	1011	//=======================================================================
	1012
	1013	//
	1014	// RBBIRunMode - the state machine runs an extra iteration at the beginning and end
	1015	// of user text. A variable with this enum type keeps track of where we
	1016	// are. The state machine only fetches user input while in the RUN mode.
	1017	//
	1018	enum RBBIRunMode {
	1019	RBBI_START, // state machine processing is before first char of input
	1020	RBBI_RUN, // state machine processing is in the user text
	1021	RBBI_END // state machine processing is after end of user text.
	1022	};
	1023
	1024
	1025	// Map from look-ahead break states (corresponds to rules) to boundary positions.
	1026	// Allows multiple lookahead break rules to be in flight at the same time.
	1027	//
	1028	// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
	1029	// in the state table be sequential, then we can just index an array. And the
	1030	// table could also tell us in advance how big that array needs to be.
	1031	//
	1032	// Before ICU 57 there was just a single simple variable for a look-ahead match that
	1033	// was in progress. Two rules at once did not work.
	1034
	1035	static const int32_t kMaxLookaheads = 8;
	1036	struct LookAheadResults {
	1037	int32_t fUsedSlotLimit;
	1038	int32_t fPositions[8];
	1039	int16_t fKeys[8];
	1040
	1041	LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {};
	1042
	1043	int32_t getPosition(int16_t key) {
	1044	for (int32_t i=0; i<fUsedSlotLimit; ++i) {
	1045	if (fKeys[i] == key) {
	1046	return fPositions[i];
	1047	}
	1048	}
	1049	U_ASSERT(FALSE);
	1050	return -1;
	1051	}
	1052
	1053	void setPosition(int16_t key, int32_t position) {
	1054	int32_t i;
	1055	for (i=0; i<fUsedSlotLimit; ++i) {
	1056	if (fKeys[i] == key) {
	1057	fPositions[i] = position;
	1058	return;
	1059	}
	1060	}
	1061	if (i >= kMaxLookaheads) {
	1062	U_ASSERT(FALSE);
	1063	i = kMaxLookaheads - 1;
	1064	}
	1065	fKeys[i] = key;
	1066	fPositions[i] = position;
	1067	U_ASSERT(fUsedSlotLimit == i);
	1068	fUsedSlotLimit = i + 1;
	1069	}
	1070	};
	1071
	1072
	1073	//-----------------------------------------------------------------------------------
	1074	//
	1075	// handleNext(stateTable)
	1076	// This method is the actual implementation of the rbbi next() method.
	1077	// This method initializes the state machine to state 1
	1078	// and advances through the text character by character until we reach the end
	1079	// of the text or the state machine transitions to state 0. We update our return
	1080	// value every time the state machine passes through an accepting state.
	1081	//
	1082	//-----------------------------------------------------------------------------------
	1083	int32_t RuleBasedBreakIterator57::handleNext(const RBBIStateTable *statetable) {
	1084	int32_t state;
	1085	uint16_t category = 0;
	1086	RBBIRunMode mode;
	1087
	1088	RBBIStateTableRow *row;
	1089	UChar32 c;
	1090	LookAheadResults lookAheadMatches;
	1091	int32_t result = 0;
	1092	int32_t initialPosition = 0;
	1093	const char *tableData = statetable->fTableData;
	1094	uint32_t tableRowLen = statetable->fRowLen;
	1095
	1096	#ifdef RBBI_DEBUG
	1097	if (fTrace) {
	1098	RBBIDebugPuts("Handle Next pos char state category");
	1099	}
	1100	#endif
	1101
	1102	// No matter what, handleNext alway correctly sets the break tag value.
	1103	fLastStatusIndexValid = TRUE;
	1104	fLastRuleStatusIndex = 0;
	1105
	1106	// if we're already at the end of the text, return DONE.
	1107	initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
	1108	result = initialPosition;
	1109	c = UTEXT_NEXT32(fText);
	1110	if (fData == NULL \|\| c==U_SENTINEL) {
	1111	return BreakIterator::DONE;
	1112	}
	1113
	1114	// Set the initial state for the state machine
	1115	state = START_STATE;
	1116	row = (RBBIStateTableRow *)
	1117	//(statetable->fTableData + (statetable->fRowLen * state));
	1118	(tableData + tableRowLen * state);
	1119
	1120
	1121	mode = RBBI_RUN;
	1122	if (statetable->fFlags & RBBI_BOF_REQUIRED) {
	1123	category = 2;
	1124	mode = RBBI_START;
	1125	}
	1126
	1127
	1128	// loop until we reach the end of the text or transition to state 0
	1129	//
	1130	for (;;) {
	1131	if (c == U_SENTINEL) {
	1132	// Reached end of input string.
	1133	if (mode == RBBI_END) {
	1134	// We have already run the loop one last time with the
	1135	// character set to the psueudo {eof} value. Now it is time
	1136	// to unconditionally bail out.
	1137	break;
	1138	}
	1139	// Run the loop one last time with the fake end-of-input character category.
	1140	mode = RBBI_END;
	1141	category = 1;
	1142	}
	1143
	1144	//
	1145	// Get the char category. An incoming category of 1 or 2 means that
	1146	// we are preset for doing the beginning or end of input, and
	1147	// that we shouldn't get a category from an actual text input character.
	1148	//
	1149	if (mode == RBBI_RUN) {
	1150	// look up the current character's character category, which tells us
	1151	// which column in the state table to look at.
	1152	// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
	1153	// not the size of the character going in, which is a UChar32.
	1154	//
	1155	UTRIE_GET16(&fData->fTrie, c, category);
	1156
	1157	// Check the dictionary bit in the character's category.
	1158	// Counter is only used by dictionary based iterators (subclasses).
	1159	// Chars that need to be handled by a dictionary have a flag bit set
	1160	// in their category values.
	1161	//
	1162	if ((category & 0x4000) != 0) {
	1163	fDictionaryCharCount++;
	1164	// And off the dictionary flag bit.
	1165	category &= ~0x4000;
	1166	}
	1167	}
	1168
	1169	#ifdef RBBI_DEBUG
	1170	if (fTrace) {
	1171	RBBIDebugPrintf(" %4lld ", utext_getNativeIndex(fText));
	1172	if (0x20<=c && c<0x7f) {
	1173	RBBIDebugPrintf("\"%c\" ", c);
	1174	} else {
	1175	RBBIDebugPrintf("%5x ", c);
	1176	}
	1177	RBBIDebugPrintf("%3d %3d\n", state, category);
	1178	}
	1179	#endif
	1180
	1181	// State Transition - move machine to its next state
	1182	//
	1183
	1184	// Note: fNextState is defined as uint16_t[2], but we are casting
	1185	// a generated RBBI table to RBBIStateTableRow and some tables
	1186	// actually have more than 2 categories.
	1187	U_ASSERT(category<fData->fHeader->fCatCount);
	1188	state = row->fNextState[category]; /Not accessing beyond memory/
	1189	row = (RBBIStateTableRow *)
	1190	// (statetable->fTableData + (statetable->fRowLen * state));
	1191	(tableData + tableRowLen * state);
	1192
	1193
	1194	if (row->fAccepting == -1) {
	1195	// Match found, common case.
	1196	if (mode != RBBI_START) {
	1197	result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
	1198	}
	1199	fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
	1200	}
	1201
	1202	int16_t completedRule = row->fAccepting;
	1203	if (completedRule > 0) {
	1204	// Lookahead match is completed.
	1205	int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
	1206	if (lookaheadResult >= 0) {
	1207	fLastRuleStatusIndex = row->fTagIdx;
	1208	UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
	1209	return lookaheadResult;
	1210	}
	1211	}
	1212	int16_t rule = row->fLookAhead;
	1213	if (rule != 0) {
	1214	// At the position of a '/' in a look-ahead match. Record it.
	1215	int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
	1216	lookAheadMatches.setPosition(rule, pos);
	1217	}
	1218
	1219	if (state == STOP_STATE) {
	1220	// This is the normal exit from the lookup state machine.
	1221	// We have advanced through the string until it is certain that no
	1222	// longer match is possible, no matter what characters follow.
	1223	break;
	1224	}
	1225
	1226	// Advance to the next character.
	1227	// If this is a beginning-of-input loop iteration, don't advance
	1228	// the input position. The next iteration will be processing the
	1229	// first real input character.
	1230	if (mode == RBBI_RUN) {
	1231	c = UTEXT_NEXT32(fText);
	1232	} else {
	1233	if (mode == RBBI_START) {
	1234	mode = RBBI_RUN;
	1235	}
	1236	}
	1237
	1238
	1239	}
	1240
	1241	// The state machine is done. Check whether it found a match...
	1242
	1243	// If the iterator failed to advance in the match engine, force it ahead by one.
	1244	// (This really indicates a defect in the break rules. They should always match
	1245	// at least one character.)
	1246	if (result == initialPosition) {
	1247	UTEXT_SETNATIVEINDEX(fText, initialPosition);
	1248	UTEXT_NEXT32(fText);
	1249	result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
	1250	}
	1251
	1252	// Leave the iterator at our result position.
	1253	UTEXT_SETNATIVEINDEX(fText, result);
	1254	#ifdef RBBI_DEBUG
	1255	if (fTrace) {
	1256	RBBIDebugPrintf("result = %d\n\n", result);
	1257	}
	1258	#endif
	1259	return result;
	1260	}
	1261
	1262
	1263
	1264	//-----------------------------------------------------------------------------------
	1265	//
	1266	// handlePrevious()
	1267	//
	1268	// Iterate backwards, according to the logic of the reverse rules.
	1269	// This version handles the exact style backwards rules.
	1270	//
	1271	// The logic of this function is very similar to handleNext(), above.
	1272	//
	1273	//-----------------------------------------------------------------------------------
	1274	int32_t RuleBasedBreakIterator57::handlePrevious(const RBBIStateTable *statetable) {
	1275	int32_t state;
	1276	uint16_t category = 0;
	1277	RBBIRunMode mode;
	1278	RBBIStateTableRow *row;
	1279	UChar32 c;
	1280	LookAheadResults lookAheadMatches;
	1281	int32_t result = 0;
	1282	int32_t initialPosition = 0;
	1283
	1284	#ifdef RBBI_DEBUG
	1285	if (fTrace) {
	1286	RBBIDebugPuts("Handle Previous pos char state category");
	1287	}
	1288	#endif
	1289
	1290	// handlePrevious() never gets the rule status.
	1291	// Flag the status as invalid; if the user ever asks for status, we will need
	1292	// to back up, then re-find the break position using handleNext(), which does
	1293	// get the status value.
	1294	fLastStatusIndexValid = FALSE;
	1295	fLastRuleStatusIndex = 0;
	1296
	1297	// if we're already at the start of the text, return DONE.
	1298	if (fText == NULL \|\| fData == NULL \|\| UTEXT_GETNATIVEINDEX(fText)==0) {
	1299	return BreakIterator::DONE;
	1300	}
	1301
	1302	// Set up the starting char.
	1303	initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
	1304	result = initialPosition;
	1305	c = UTEXT_PREVIOUS32(fText);
	1306
	1307	// Set the initial state for the state machine
	1308	state = START_STATE;
	1309	row = (RBBIStateTableRow *)
	1310	(statetable->fTableData + (statetable->fRowLen * state));
	1311	category = 3;
	1312	mode = RBBI_RUN;
	1313	if (statetable->fFlags & RBBI_BOF_REQUIRED) {
	1314	category = 2;
	1315	mode = RBBI_START;
	1316	}
	1317
	1318
	1319	// loop until we reach the start of the text or transition to state 0
	1320	//
	1321	for (;;) {
	1322	if (c == U_SENTINEL) {
	1323	// Reached end of input string.
	1324	if (mode == RBBI_END) {
	1325	// We have already run the loop one last time with the
	1326	// character set to the psueudo {eof} value. Now it is time
	1327	// to unconditionally bail out.
	1328	if (result == initialPosition) {
	1329	// Ran off start, no match found.
	1330	// move one index one (towards the start, since we are doing a previous())
	1331	UTEXT_SETNATIVEINDEX(fText, initialPosition);
	1332	(void)UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check.
	1333	}
	1334	break;
	1335	}
	1336	// Run the loop one last time with the fake end-of-input character category.
	1337	mode = RBBI_END;
	1338	category = 1;
	1339	}
	1340
	1341	//
	1342	// Get the char category. An incoming category of 1 or 2 means that
	1343	// we are preset for doing the beginning or end of input, and
	1344	// that we shouldn't get a category from an actual text input character.
	1345	//
	1346	if (mode == RBBI_RUN) {
	1347	// look up the current character's character category, which tells us
	1348	// which column in the state table to look at.
	1349	// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
	1350	// not the size of the character going in, which is a UChar32.
	1351	//
	1352	UTRIE_GET16(&fData->fTrie, c, category);
	1353
	1354	// Check the dictionary bit in the character's category.
	1355	// Counter is only used by dictionary based iterators (subclasses).
	1356	// Chars that need to be handled by a dictionary have a flag bit set
	1357	// in their category values.
	1358	//
	1359	if ((category & 0x4000) != 0) {
	1360	fDictionaryCharCount++;
	1361	// And off the dictionary flag bit.
	1362	category &= ~0x4000;
	1363	}
	1364	}
	1365
	1366	#ifdef RBBI_DEBUG
	1367	if (fTrace) {
	1368	RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText));
	1369	if (0x20<=c && c<0x7f) {
	1370	RBBIDebugPrintf("\"%c\" ", c);
	1371	} else {
	1372	RBBIDebugPrintf("%5x ", c);
	1373	}
	1374	RBBIDebugPrintf("%3d %3d\n", state, category);
	1375	}
	1376	#endif
	1377
	1378	// State Transition - move machine to its next state
	1379	//
	1380
	1381	// Note: fNextState is defined as uint16_t[2], but we are casting
	1382	// a generated RBBI table to RBBIStateTableRow and some tables
	1383	// actually have more than 2 categories.
	1384	U_ASSERT(category<fData->fHeader->fCatCount);
	1385	state = row->fNextState[category]; /Not accessing beyond memory/
	1386	row = (RBBIStateTableRow *)
	1387	(statetable->fTableData + (statetable->fRowLen * state));
	1388
	1389	if (row->fAccepting == -1) {
	1390	// Match found, common case.
	1391	result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
	1392	}
	1393
	1394	int16_t completedRule = row->fAccepting;
	1395	if (completedRule > 0) {
	1396	// Lookahead match is completed.
	1397	int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
	1398	if (lookaheadResult >= 0) {
	1399	UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
	1400	return lookaheadResult;
	1401	}
	1402	}
	1403	int16_t rule = row->fLookAhead;
	1404	if (rule != 0) {
	1405	// At the position of a '/' in a look-ahead match. Record it.
	1406	int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
	1407	lookAheadMatches.setPosition(rule, pos);
	1408	}
	1409
	1410	if (state == STOP_STATE) {
	1411	// This is the normal exit from the lookup state machine.
	1412	// We have advanced through the string until it is certain that no
	1413	// longer match is possible, no matter what characters follow.
	1414	break;
	1415	}
	1416
	1417	// Move (backwards) to the next character to process.
	1418	// If this is a beginning-of-input loop iteration, don't advance
	1419	// the input position. The next iteration will be processing the
	1420	// first real input character.
	1421	if (mode == RBBI_RUN) {
	1422	c = UTEXT_PREVIOUS32(fText);
	1423	} else {
	1424	if (mode == RBBI_START) {
	1425	mode = RBBI_RUN;
	1426	}
	1427	}
	1428	}
	1429
	1430	// The state machine is done. Check whether it found a match...
	1431
	1432	// If the iterator failed to advance in the match engine, force it ahead by one.
	1433	// (This really indicates a defect in the break rules. They should always match
	1434	// at least one character.)
	1435	if (result == initialPosition) {
	1436	UTEXT_SETNATIVEINDEX(fText, initialPosition);
	1437	UTEXT_PREVIOUS32(fText);
	1438	result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
	1439	}
	1440
	1441	// Leave the iterator at our result position.
	1442	UTEXT_SETNATIVEINDEX(fText, result);
	1443	#ifdef RBBI_DEBUG
	1444	if (fTrace) {
	1445	RBBIDebugPrintf("result = %d\n\n", result);
	1446	}
	1447	#endif
	1448	return result;
	1449	}
	1450
	1451
	1452	void
	1453	RuleBasedBreakIterator57::reset()
	1454	{
	1455	if (fCachedBreakPositions) {
	1456	uprv_free(fCachedBreakPositions);
	1457	}
	1458	fCachedBreakPositions = NULL;
	1459	fNumCachedBreakPositions = 0;
	1460	fDictionaryCharCount = 0;
	1461	fPositionInCache = 0;
	1462	}
	1463
	1464
	1465
	1466	//-------------------------------------------------------------------------------
	1467	//
	1468	// getRuleStatus() Return the break rule tag associated with the current
	1469	// iterator position. If the iterator arrived at its current
	1470	// position by iterating forwards, the value will have been
	1471	// cached by the handleNext() function.
	1472	//
	1473	// If no cached status value is available, the status is
	1474	// found by doing a previous() followed by a next(), which
	1475	// leaves the iterator where it started, and computes the
	1476	// status while doing the next().
	1477	//
	1478	//-------------------------------------------------------------------------------
	1479	void RuleBasedBreakIterator57::makeRuleStatusValid() {
	1480	if (fLastStatusIndexValid == FALSE) {
	1481	// No cached status is available.
	1482	if (fText == NULL \|\| current() == 0) {
	1483	// At start of text, or there is no text. Status is always zero.
	1484	fLastRuleStatusIndex = 0;
	1485	fLastStatusIndexValid = TRUE;
	1486	} else {
	1487	// Not at start of text. Find status the tedious way.
	1488	int32_t pa = current();
	1489	previous();
	1490	if (fNumCachedBreakPositions > 0) {
	1491	reset(); // Blow off the dictionary cache
	1492	}
	1493	int32_t pb = next();
	1494	if (pa != pb) {
	1495	// note: the if (pa != pb) test is here only to eliminate warnings for
	1496	// unused local variables on gcc. Logically, it isn't needed.
	1497	U_ASSERT(pa == pb);
	1498	}
	1499	}
	1500	}
	1501	U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx);
	1502	}
	1503
	1504
	1505	int32_t RuleBasedBreakIterator57::getRuleStatus() const {
	1506	RuleBasedBreakIterator57 nonConstThis = (RuleBasedBreakIterator57 )this;
	1507	nonConstThis->makeRuleStatusValid();
	1508
	1509	// fLastRuleStatusIndex indexes to the start of the appropriate status record
	1510	// (the number of status values.)
	1511	// This function returns the last (largest) of the array of status values.
	1512	int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
	1513	int32_t tagVal = fData->fRuleStatusTable[idx];
	1514
	1515	return tagVal;
	1516	}
	1517
	1518
	1519
	1520
	1521	int32_t RuleBasedBreakIterator57::getRuleStatusVec(
	1522	int32_t *fillInVec, int32_t capacity, UErrorCode &status)
	1523	{
	1524	if (U_FAILURE(status)) {
	1525	return 0;
	1526	}
	1527
	1528	RuleBasedBreakIterator57 nonConstThis = (RuleBasedBreakIterator57 )this;
	1529	nonConstThis->makeRuleStatusValid();
	1530	int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
	1531	int32_t numValsToCopy = numVals;
	1532	if (numVals > capacity) {
	1533	status = U_BUFFER_OVERFLOW_ERROR;
	1534	numValsToCopy = capacity;
	1535	}
	1536	int i;
	1537	for (i=0; i<numValsToCopy; i++) {
	1538	fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
	1539	}
	1540	return numVals;
	1541	}
	1542
	1543
	1544
	1545	//-------------------------------------------------------------------------------
	1546	//
	1547	// getBinaryRules Access to the compiled form of the rules,
	1548	// for use by build system tools that save the data
	1549	// for standard iterator types.
	1550	//
	1551	//-------------------------------------------------------------------------------
	1552	const uint8_t *RuleBasedBreakIterator57::getBinaryRules(uint32_t &length) {
	1553	const uint8_t *retPtr = NULL;
	1554	length = 0;
	1555
	1556	if (fData != NULL) {
	1557	retPtr = (const uint8_t *)fData->fHeader;
	1558	length = fData->fHeader->fLength;
	1559	}
	1560	return retPtr;
	1561	}
	1562
	1563
	1564	BreakIterator * RuleBasedBreakIterator57::createBufferClone(void * /stackBuffer/,
	1565	int32_t &bufferSize,
	1566	UErrorCode &status)
	1567	{
	1568	if (U_FAILURE(status)){
	1569	return NULL;
	1570	}
	1571
	1572	if (bufferSize == 0) {
	1573	bufferSize = 1; // preflighting for deprecated functionality
	1574	return NULL;
	1575	}
	1576
	1577	BreakIterator *clonedBI = clone();
	1578	if (clonedBI == NULL) {
	1579	status = U_MEMORY_ALLOCATION_ERROR;
	1580	} else {
	1581	status = U_SAFECLONE_ALLOCATED_WARNING;
	1582	}
	1583	return (RuleBasedBreakIterator57 *)clonedBI;
	1584	}
	1585
	1586	//-------------------------------------------------------------------------------
	1587	//
	1588	// checkDictionary This function handles all processing of characters in
	1589	// the "dictionary" set. It will determine the appropriate
	1590	// course of action, and possibly set up a cache in the
	1591	// process.
	1592	//
	1593	//-------------------------------------------------------------------------------
	1594	int32_t RuleBasedBreakIterator57::checkDictionary(int32_t startPos,
	1595	int32_t endPos,
	1596	UBool reverse) {
	1597	// Reset the old break cache first.
	1598	reset();
	1599
	1600	// note: code segment below assumes that dictionary chars are in the
	1601	// startPos-endPos range
	1602	// value returned should be next character in sequence
	1603	if ((endPos - startPos) <= 1) {
	1604	return (reverse ? startPos : endPos);
	1605	}
	1606
	1607	// Starting from the starting point, scan towards the proposed result,
	1608	// looking for the first dictionary character (which may be the one
	1609	// we're on, if we're starting in the middle of a range).
	1610	utext_setNativeIndex(fText, reverse ? endPos : startPos);
	1611	if (reverse) {
	1612	UTEXT_PREVIOUS32(fText);
	1613	}
	1614
	1615	int32_t rangeStart = startPos;
	1616	int32_t rangeEnd = endPos;
	1617
	1618	uint16_t category;
	1619	int32_t current;
	1620	UErrorCode status = U_ZERO_ERROR;
	1621	UVector32 breaks(status); // changed from UStack in ICU 57
	1622	int32_t foundBreakCount = 0;
	1623	UChar32 c = utext_current32(fText);
	1624
	1625	UTRIE_GET16(&fData->fTrie, c, category);
	1626
	1627	// Is the character we're starting on a dictionary character? If so, we
	1628	// need to back up to include the entire run; otherwise the results of
	1629	// the break algorithm will differ depending on where we start. Since
	1630	// the result is cached and there is typically a non-dictionary break
	1631	// within a small number of words, there should be little performance impact.
	1632	if (category & 0x4000) {
	1633	if (reverse) {
	1634	do {
	1635	utext_next32(fText); // TODO: recast to work directly with postincrement.
	1636	c = utext_current32(fText);
	1637	UTRIE_GET16(&fData->fTrie, c, category);
	1638	} while (c != U_SENTINEL && (category & 0x4000));
	1639	// Back up to the last dictionary character
	1640	rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
	1641	if (c == U_SENTINEL) {
	1642	// c = fText->last32();
	1643	// TODO: why was this if needed?
	1644	c = UTEXT_PREVIOUS32(fText);
	1645	}
	1646	else {
	1647	c = UTEXT_PREVIOUS32(fText);
	1648	}
	1649	}
	1650	else {
	1651	do {
	1652	c = UTEXT_PREVIOUS32(fText);
	1653	UTRIE_GET16(&fData->fTrie, c, category);
	1654	}
	1655	while (c != U_SENTINEL && (category & 0x4000));
	1656	// Back up to the last dictionary character
	1657	if (c == U_SENTINEL) {
	1658	// c = fText->first32();
	1659	c = utext_current32(fText);
	1660	}
	1661	else {
	1662	utext_next32(fText);
	1663	c = utext_current32(fText);
	1664	}
	1665	rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
	1666	}
	1667	UTRIE_GET16(&fData->fTrie, c, category);
	1668	}
	1669
	1670	// Loop through the text, looking for ranges of dictionary characters.
	1671	// For each span, find the appropriate break engine, and ask it to find
	1672	// any breaks within the span.
	1673	// Note: we always do this in the forward direction, so that the break
	1674	// cache is built in the right order.
	1675	if (reverse) {
	1676	utext_setNativeIndex(fText, rangeStart);
	1677	c = utext_current32(fText);
	1678	UTRIE_GET16(&fData->fTrie, c, category);
	1679	}
	1680	while(U_SUCCESS(status)) {
	1681	while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
	1682	utext_next32(fText); // TODO: tweak for post-increment operation
	1683	c = utext_current32(fText);
	1684	UTRIE_GET16(&fData->fTrie, c, category);
	1685	}
	1686	if (current >= rangeEnd) {
	1687	break;
	1688	}
	1689
	1690	// We now have a dictionary character. Get the appropriate language object
	1691	// to deal with it.
	1692	const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
	1693
	1694	// Ask the language object if there are any breaks. It will leave the text
	1695	// pointer on the other side of its range, ready to search for the next one.
	1696	if (lbe != NULL) {
	1697	foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, breaks);
	1698	}
	1699
	1700	// Reload the loop variables for the next go-round
	1701	c = utext_current32(fText);
	1702	UTRIE_GET16(&fData->fTrie, c, category);
	1703	}
	1704
	1705	// If we found breaks, build a new break cache. The first and last entries must
	1706	// be the original starting and ending position.
	1707	if (foundBreakCount > 0) {
	1708	U_ASSERT(foundBreakCount == breaks.size());
	1709	int32_t totalBreaks = foundBreakCount;
	1710	if (startPos < breaks.elementAti(0)) {
	1711	totalBreaks += 1;
	1712	}
	1713	if (endPos > breaks.peeki()) {
	1714	totalBreaks += 1;
	1715	}
	1716	fCachedBreakPositions = (int32_t )uprv_malloc(totalBreaks sizeof(int32_t));
	1717	if (fCachedBreakPositions != NULL) {
	1718	int32_t out = 0;
	1719	fNumCachedBreakPositions = totalBreaks;
	1720	if (startPos < breaks.elementAti(0)) {
	1721	fCachedBreakPositions[out++] = startPos;
	1722	}
	1723	for (int32_t i = 0; i < foundBreakCount; ++i) {
	1724	fCachedBreakPositions[out++] = breaks.elementAti(i);
	1725	}
	1726	if (endPos > fCachedBreakPositions[out-1]) {
	1727	fCachedBreakPositions[out] = endPos;
	1728	}
	1729	// If there are breaks, then by definition, we are replacing the original
	1730	// proposed break by one of the breaks we found. Use following() and
	1731	// preceding() to do the work. They should never recurse in this case.
	1732	if (reverse) {
	1733	return preceding(endPos);
	1734	}
	1735	else {
	1736	return following(startPos);
	1737	}
	1738	}
	1739	// If the allocation failed, just fall through to the "no breaks found" case.
	1740	}
	1741
	1742	// If we get here, there were no language-based breaks. Set the text pointer
	1743	// to the original proposed break.
	1744	utext_setNativeIndex(fText, reverse ? startPos : endPos);
	1745	return (reverse ? startPos : endPos);
	1746	}
	1747
	1748	U_NAMESPACE_END
	1749
	1750
	1751	static icu::UStack *gLanguageBreakFactories = NULL;
	1752	static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;
	1753
	1754	/**
	1755	* Release all static memory held by breakiterator.
	1756	*/
	1757	U_CDECL_BEGIN
	1758	static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
	1759	if (gLanguageBreakFactories) {
	1760	delete gLanguageBreakFactories;
	1761	gLanguageBreakFactories = NULL;
	1762	}
	1763	gLanguageBreakFactoriesInitOnce.reset();
	1764	return TRUE;
	1765	}
	1766	U_CDECL_END
	1767
	1768	U_CDECL_BEGIN
	1769	static void U_CALLCONV _deleteFactory(void *obj) {
	1770	delete (icu::LanguageBreakFactory *) obj;
	1771	}
	1772	U_CDECL_END
	1773	U_NAMESPACE_BEGIN
	1774
	1775	static void U_CALLCONV initLanguageFactories() {
	1776	UErrorCode status = U_ZERO_ERROR;
	1777	U_ASSERT(gLanguageBreakFactories == NULL);
	1778	gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);
	1779	if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {
	1780	ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
	1781	gLanguageBreakFactories->push(builtIn, status);
	1782	#ifdef U_LOCAL_SERVICE_HOOK
	1783	LanguageBreakFactory extra = (LanguageBreakFactory )uprv_svc_hook("languageBreakFactory", &status);
	1784	if (extra != NULL) {
	1785	gLanguageBreakFactories->push(extra, status);
	1786	}
	1787	#endif
	1788	}
	1789	ucln_common_registerCleanup(UCLN_COMMON_RBBI57, breakiterator_cleanup_dict);
	1790	}
	1791
	1792
	1793	static const LanguageBreakEngine*
	1794	getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
	1795	{
	1796	umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
	1797	if (gLanguageBreakFactories == NULL) {
	1798	return NULL;
	1799	}
	1800
	1801	int32_t i = gLanguageBreakFactories->size();
	1802	const LanguageBreakEngine *lbe = NULL;
	1803	while (--i >= 0) {
	1804	LanguageBreakFactory factory = (LanguageBreakFactory )(gLanguageBreakFactories->elementAt(i));
	1805	lbe = factory->getEngineFor(c);
	1806	if (lbe != NULL) {
	1807	break;
	1808	}
	1809	}
	1810	return lbe;
	1811	}
	1812
	1813
	1814	//-------------------------------------------------------------------------------
	1815	//
	1816	// getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
	1817	// the character c.
	1818	//
	1819	//-------------------------------------------------------------------------------
	1820	const LanguageBreakEngine *
	1821	RuleBasedBreakIterator57::getLanguageBreakEngine(UChar32 c) {
	1822	const LanguageBreakEngine *lbe = NULL;
	1823	UErrorCode status = U_ZERO_ERROR;
	1824
	1825	if (fLanguageBreakEngines == NULL) {
	1826	fLanguageBreakEngines = new UStack(status);
	1827	if (fLanguageBreakEngines == NULL \|\| U_FAILURE(status)) {
	1828	delete fLanguageBreakEngines;
	1829	fLanguageBreakEngines = 0;
	1830	return NULL;
	1831	}
	1832	}
	1833
	1834	int32_t i = fLanguageBreakEngines->size();
	1835	while (--i >= 0) {
	1836	lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
	1837	if (lbe->handles(c)) {
	1838	return lbe;
	1839	}
	1840	}
	1841
	1842	// No existing dictionary took the character. See if a factory wants to
	1843	// give us a new LanguageBreakEngine for this character.
	1844	lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
	1845
	1846	// If we got one, use it and push it on our stack.
	1847	if (lbe != NULL) {
	1848	fLanguageBreakEngines->push((void *)lbe, status);
	1849	// Even if we can't remember it, we can keep looking it up, so
	1850	// return it even if the push fails.
	1851	return lbe;
	1852	}
	1853
	1854	// No engine is forthcoming for this character. Add it to the
	1855	// reject set. Create the reject break engine if needed.
	1856	if (fUnhandledBreakEngine == NULL) {
	1857	fUnhandledBreakEngine = new UnhandledEngine(status);
	1858	if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
	1859	status = U_MEMORY_ALLOCATION_ERROR;
	1860	}
	1861	// Put it last so that scripts for which we have an engine get tried
	1862	// first.
	1863	fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
	1864	// If we can't insert it, or creation failed, get rid of it
	1865	if (U_FAILURE(status)) {
	1866	delete fUnhandledBreakEngine;
	1867	fUnhandledBreakEngine = 0;
	1868	return NULL;
	1869	}
	1870	}
	1871
	1872	// Tell the reject engine about the character; at its discretion, it may
	1873	// add more than just the one character.
	1874	fUnhandledBreakEngine->handleCharacter(c);
	1875
	1876	return fUnhandledBreakEngine;
	1877	}
	1878
	1879	void RuleBasedBreakIterator57::setBreakType(int32_t type) {
	1880	fBreakType = type;
	1881	reset();
	1882	}
	1883
	1884	U_NAMESPACE_END
	1885
	1886	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */