[apple/icu.git] / icuSources / common / rbtok.cpp

/*
***************************************************************************
* Copyright (C) 2006-2008 Apple Inc. All Rights Reserved.                 *
***************************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "rbtok.h"
#include "unicode/ustring.h"
#include "unicode/utext.h"
#include "rbbidata.h"

U_NAMESPACE_BEGIN


// Compiler hint for the tokenizer's hot loop below; only emitted for GCC 4 or later.
// NOTE(review): "optimization_level" looks like an Apple GCC pragma extension that
// raises the optimization level for the functions that follow - confirm against the
// toolchain documentation. The matching "reset" appears after tokenize().
#if defined(__GNUC__) && (__GNUC__ >= 4)
#pragma GCC optimization_level 3
#endif

// Well-known state numbers in the state table.
// START_STATE is the row at which the state machine begins every token (init() uses it
// to precompute fStartRow); STOP_STATE is the next-state value that ends the scan of the
// current token in tokenize().
static const int16_t START_STATE = 1;     // The state number of the starting state
static const int16_t STOP_STATE  = 0;     // The state-transition value indicating "stop"

/**
 * Scan forward from the current text position and write out up to maxTokens tokens.
 *
 * Each token is found by running the forward state table from START_STATE over the
 * text until the machine reaches STOP_STATE or the text ends; the token ends at the
 * position recorded for the last accepting state seen.  The per-state flag word that
 * init() precomputed in fStateFlags decides what happens to the token:
 *   - a flag word of -1 means the token is skipped (nothing is written for it);
 *   - otherwise the range (and the flag word, if outTokenFlags is given) is written;
 *   - if bit 0x40000000 is set in the flag word, tokenizing stops after that token.
 *
 * On return the text iterator is left at the end of the last token examined.
 *
 * @param maxTokens       Capacity of outTokenRanges (and of outTokenFlags, if non-NULL).
 * @param outTokenRanges  Receives one {start offset, length} entry per emitted token.
 * @param outTokenFlags   Optional, may be NULL.  Receives the flag word of each emitted token.
 * @return                The number of tokens written to outTokenRanges.  Returns 0 when
 *                        there is no room for output or the tokenizer holds no usable data.
 */
int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags)
{
    #ifdef RBBI_DEBUG
        if (fTrace) {
            RBBIDebugPuts("Handle Next   pos   char  state category");
        }
    #endif

    fLastStatusIndexValid = FALSE;

    // Nothing can be produced without an output buffer, and nothing can be scanned
    // without the rule data and the tables that init() derives from it.
    if (maxTokens <= 0 || outTokenRanges == NULL ||
        fData == NULL || fStartRow == NULL || fLatin1Cat == NULL || fStateFlags == NULL) {
        return 0;
    }

    RuleBasedTokenRange *const outTokenLimit = outTokenRanges + maxTokens;
    RuleBasedTokenRange       *outTokenP     = outTokenRanges;

    const char     *const tableData   = fData->fForwardTable->fTableData;
    const uint32_t        tableRowLen = fData->fForwardTable->fRowLen;
    const UTrie    *const trie        = &fData->fTrie;
    UText          *const text        = fText;

    // Deliberately NOT reset per token: a token taken from the cached break positions
    // is reported with the flags of the most recent accepting state.
    int32_t     lastAcceptingState = 0;
    UChar32     c = 0;

    // prev is the start of the token being scanned; result is its end.
    // Invariant at the top of every iteration of the outer loop: result == prev.
    signed long prev   = (signed long)UTEXT_GETNATIVEINDEX(text);
    signed long result = prev;

    // One iteration per token: loop until the output is full or the text is exhausted.
    while (outTokenP < outTokenLimit) {
        c = UTEXT_NEXT32(text);
        if (c == U_SENTINEL) {
            // End of text: no further tokens.
            break;
        }

        // If we have cached break positions and we're still in the range
        // covered by them, just move one step forward in the cache.
        UBool haveCachedBreak = FALSE;
        if (fCachedBreakPositions != NULL) {
            if (fPositionInCache < fNumCachedBreakPositions - 1) {
                ++fPositionInCache;
                result = fCachedBreakPositions[fPositionInCache];
                haveCachedBreak = TRUE;
            }
            else {
                reset();
            }
        }

        if (!haveCachedBreak) {
            // Set the initial state for the state machine.
            int32_t                  state = START_STATE;
            const RBBIStateTableRow *row   = fStartRow;
            int16_t                  category;

            while (c != U_SENTINEL) {
                // Look up the current character's category, which selects the column
                // of the state table.  Characters below 0x100 use the table that
                // init() precomputed; everything else goes through the trie.
                // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
                //        not the size of the character going in, which is a UChar32.
                if (__builtin_expect((c < 0x100), 1))
                    category = fLatin1Cat[c];
                else
                    UTRIE_GET16(trie, c, category);

                // Check the dictionary bit in the character's category.
                //    Chars that need to be handled by a dictionary have a flag bit set
                //    in their category values; count them and strip the bit.
                if (__builtin_expect((category & 0x4000) != 0, 0))  {
                    fDictionaryCharCount++;
                    category &= ~0x4000;
                }

            #ifdef RBBI_DEBUG
                if (fTrace) {
                    // utext_getNativeIndex() returns a 64-bit value; narrow it to match %4d.
                    RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
                    if (0x20<=c && c<0x7f) {
                        RBBIDebugPrintf("\"%c\"  ", c);
                    } else {
                        RBBIDebugPrintf("%5x  ", c);
                    }
                    RBBIDebugPrintf("%3d  %3d\n", state, category);
                }
            #endif

                // State transition - move the machine to its next state.
                state = row->fNextState[category];
                row = (const RBBIStateTableRow *) (tableData + tableRowLen * state);

                if (row->fAccepting == -1) {
                    // Match found, common case.  Remember where it ends and which
                    // state accepted it; a longer match may still follow.
                    result = (signed long)UTEXT_GETNATIVEINDEX(text);
                    lastAcceptingState = state;
                }

                if (state == STOP_STATE) {
                    // This is the normal exit from the lookup state machine.
                    // We have advanced through the string until it is certain that no
                    //   longer match is possible, no matter what characters follow.
                    break;
                }

                // Advance to the next character.
                c = UTEXT_NEXT32(text);
            }

            if (result == prev) {
                // The state machine never reached an accepting state, so the rules
                // matched nothing here.  Force the position ahead by one code point;
                // otherwise the same text would be rescanned indefinitely and
                // zero-length tokens emitted.  The token is reported with the flags
                // of state 0, the same value lastAcceptingState starts out with.
                UTEXT_SETNATIVEINDEX(text, prev);
                (void)UTEXT_NEXT32(text);
                result = (signed long)UTEXT_GETNATIVEINDEX(text);
                lastAcceptingState = STOP_STATE;
            }

            if (fDictionaryCharCount > 0) {
                result = (signed long) checkDictionary(prev, (int32_t) result, FALSE);
            }
        }

        // The token boundary is known.  Leave the iterator at our result position.
        UTEXT_SETNATIVEINDEX(text, result);

        RuleBasedTokenRange range = {(signed long)prev, (signed long) (result-prev)};
        const int32_t flags = fStateFlags[lastAcceptingState];

        if (flags != -1) {
            // A flag word of -1 marks a token that is consumed but not reported.
            *outTokenP++ = range;
            if (outTokenFlags)
            {
                *outTokenFlags++ = (unsigned long) flags;
            }

            if (flags & 0x40000000) {
                // This token asks for tokenizing to stop once it has been reported.
                break;
            }
        }

        // The next token starts where this one ended.
        prev = result;
    }

    return (int32_t)(outTokenP - outTokenRanges);
}

// Counterpart of the "optimization_level 3" pragma placed before tokenize().
// NOTE(review): presumably restores the optimization level given on the command line
// for the remainder of the file - confirm against the toolchain documentation.
#if defined (__GNUC__) && (__GNUC__ >= 4)
#pragma GCC optimization_level reset
#endif

void
RuleBasedTokenizer::init()
{
    const RBBIStateTable *statetable = fData->fForwardTable;
    setBreakType(UBRK_WORD);
    fStartRow = (const RBBIStateTableRow *)
        (statetable->fTableData + (statetable->fRowLen * START_STATE));
    UChar i;
    const UTrie         *trie = &fData->fTrie;
    int16_t category;
    fLatin1Cat = new int16_t[256];
    for (i = 0; i < 256; ++i)
    {
        //UTRIE_GET16(trie, i, category);
        //fLatin1Cat[i] = category;
        fLatin1Cat[i] = _UTRIE_GET_RAW(trie, index, 0, i);
    }
    fStateFlags = new int32_t[statetable->fNumStates];
    for (i = 0; i < statetable->fNumStates; ++i)
    {
        const RBBIStateTableRow *row = (const RBBIStateTableRow *)
            (statetable->fTableData + (statetable->fRowLen * i));
        int32_t flags = 0;
        if (row->fAccepting == -1)
        {
            const int32_t *vals = (fData->fRuleStatusTable) + (row->fTagIdx);
            const int32_t *valLimit = vals + 1;
            valLimit += *vals++;
            while (vals < valLimit)
            {
                int32_t val = *vals++;
                if (val == 0)
                {
                    break;
                }
                else if (val > 0)
                {
                    flags |= val;
                }
                else
                {
                    flags = val;
                    break;
                }
            }
        }
        fStateFlags[i] = flags;
    }
}

// Construct a tokenizer from rule source text.  The rules, the parse-error
// receiver and the error code are handed straight to the base class constructor;
// afterwards init() sets up the tokenizer's own lookup tables.
RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &err)
    : RuleBasedBreakIterator(rules, parseErr, err)
{
    this->init();
}

// Construct a tokenizer from a binary rule image.  The byte pointer is
// reinterpreted as an RBBIDataHeader and passed, with the error code, to the base
// class constructor; afterwards init() sets up the tokenizer's own lookup tables.
RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data, UErrorCode &status)
    : RuleBasedBreakIterator(reinterpret_cast<RBBIDataHeader *>(data), status)
{
    this->init();
}

// Construct a tokenizer from a read-only binary rule image.  The unnamed
// EDontAdopt argument only selects this overload; the base class is always
// invoked with RuleBasedBreakIterator::kDontAdopt.  Afterwards init() sets up the
// tokenizer's own lookup tables.
RuleBasedTokenizer::RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt, UErrorCode &status)
    : RuleBasedBreakIterator(reinterpret_cast<const RBBIDataHeader *>(data),
                             RuleBasedBreakIterator::kDontAdopt,
                             status)
{
    this->init();
}

// Release the two lookup arrays that init() allocated with new[].
RuleBasedTokenizer::~RuleBasedTokenizer()
{
    delete [] fLatin1Cat;
    delete [] fStateFlags;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
Commit	Line	Data
73c04bcf A	1	/*
73c04bcf A	2	***************************************************************************
46f4442e	3	* Copyright (C) 2006-2008 Apple Inc. All Rights Reserved. *
73c04bcf	4	***************************************************************************
73c04bcf A	5	*/
	6
	7	#include "unicode/utypes.h"
	8
	9	#if !UCONFIG_NO_BREAK_ITERATION
	10
	11	#include "rbtok.h"
	12	#include "unicode/ustring.h"
	13	#include "unicode/utext.h"
	14	#include "rbbidata.h"
	15
	16	U_NAMESPACE_BEGIN
	17
	18
	19	#if defined(__GNUC__) && (__GNUC__ >= 4)
	20	#pragma GCC optimization_level 3
	21	#endif
	22
	23	static const int16_t START_STATE = 1; // The state number of the starting state
	24	static const int16_t STOP_STATE = 0; // The state-transition value indicating "stop"
	25
	26	int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens, RuleBasedTokenRange outTokenRanges, unsigned long outTokenFlags)
	27	{
	28	RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens;
	29	RuleBasedTokenRange *outTokenP = outTokenRanges;
	30	int32_t state;
	31	int16_t category;
	32
	33	const RBBIStateTableRow *row;
	34	const RBBIStateTableRow *const startRow = fStartRow;
	35
	36	int32_t lastAcceptingState = 0;
	37	UChar32 c = 0;
	38	signed long prev;
	39	signed long result;
	40	const char *const tableData = fData->fForwardTable->fTableData;
	41	const uint32_t tableRowLen = fData->fForwardTable->fRowLen;
	42	UText *text = fText;
	43
	44	#ifdef RBBI_DEBUG
	45	if (fTrace) {
	46	RBBIDebugPuts("Handle Next pos char state category");
	47	}
	48	#endif
	49
	50	fLastStatusIndexValid = FALSE;
	51
	52	// if we're already at the end of the text, return DONE.
	53	prev = (signed long)UTEXT_GETNATIVEINDEX(text);
	54
	55	// loop until we reach the end of the text or transition to state 0
	56	//
	57	const UTrie *trie = &fData->fTrie;
	58	while (outTokenP < outTokenLimit) {
	59	c = UTEXT_NEXT32(text);
	60	if (c == U_SENTINEL)
	61	{
	62	goto exitTokenizer;
	63	}
	64	// Set the initial state for the state machine
	65	state = START_STATE;
	66	row = startRow;
	67
	68	// if we have cached break positions and we're still in the range
69	// covered by them, just move one step forward in the cache
70	if (fCachedBreakPositions != NULL) {
71	if (fPositionInCache < fNumCachedBreakPositions - 1) {
72	++fPositionInCache;
73	result = fCachedBreakPositions[fPositionInCache];
74	goto emitToken;
75	}
76	else {
77	reset();
78	}
79	}
80
81	while (c != U_SENTINEL) {
82	//
83	// Get the char category. An incoming category of 1 or 2 means that
84	// we are preset for doing the beginning or end of input, and
85	// that we shouldn't get a category from an actual text input character.
86	//
87	// look up the current character's character category, which tells us
88	// which column in the state table to look at.
89	// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
90	// not the size of the character going in, which is a UChar32.
91	//
92	if (__builtin_expect((c < 0x100), 1))
93	category = fLatin1Cat[c];
94	else
95	UTRIE_GET16(trie, c, category);
96
97	// Check the dictionary bit in the character's category.
98	// Counter is only used by dictionary based iterators (subclasses).
99	// Chars that need to be handled by a dictionary have a flag bit set
100	// in their category values.
101	//
102	if (__builtin_expect((category & 0x4000) != 0, 0)) {
103	fDictionaryCharCount++;
104	// And off the dictionary flag bit.
105	category &= ~0x4000;
106	}
107
108	#ifdef RBBI_DEBUG
109	if (fTrace) {
110	RBBIDebugPrintf(" %4d ", utext_getNativeIndex(fText));
111	if (0x20<=c && c<0x7f) {
112	RBBIDebugPrintf("\"%c\" ", c);
113	} else {
114	RBBIDebugPrintf("%5x ", c);
115	}
116	RBBIDebugPrintf("%3d %3d\n", state, category);
117	}
118	#endif
119
120	// State Transition - move machine to its next state
121	//
122	state = row->fNextState[category];
123	row = (const RBBIStateTableRow ) (tableData + tableRowLen state);
124
125	if (row->fAccepting == -1) {
126	// Match found, common case.
127	result = (signed long)UTEXT_GETNATIVEINDEX(text);
128	//fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
129	//lastStatusRow = row;
130	lastAcceptingState = state;
131	}
132
133	if (state == STOP_STATE) {
134	// This is the normal exit from the lookup state machine.
135	// We have advanced through the string until it is certain that no
136	// longer match is possible, no matter what characters follow.
137	break;
138	}
139
140	// Advance to the next character.
141	// If this is a beginning-of-input loop iteration, don't advance
142	// the input position. The next iteration will be processing the
143	// first real input character.
144	c = UTEXT_NEXT32(text);
145	}
146
147	if (fDictionaryCharCount > 0) {
148	result = (signed long) checkDictionary(prev, (int32_t) result, FALSE);
149	}
150
151	emitToken:
152	// The state machine is done. Check whether it found a match...
153
154	// Leave the iterator at our result position.
155	UTEXT_SETNATIVEINDEX(text, result);
156
157	RuleBasedTokenRange range = {(signed long)prev, (signed long) (result-prev)};
158	int32_t flags = fStateFlags[lastAcceptingState];
159
160	if (flags == -1)
161	goto skipToken;
162
163	*outTokenP++ = range;
164	if (outTokenFlags)
165	{
166	*outTokenFlags++ = (unsigned long) flags;
167	}
168
169	if (flags & 0x40000000)
170	goto exitTokenizer;
171
172	skipToken:
173	prev = result;
174	}
175
176	exitTokenizer:
177	return (outTokenP - outTokenRanges);
178	}
179
180	#if defined (__GNUC__) && (__GNUC__ >= 4)
181	#pragma GCC optimization_level reset
182	#endif
183
184	void
185	RuleBasedTokenizer::init()
186	{
187	const RBBIStateTable *statetable = fData->fForwardTable;
188	setBreakType(UBRK_WORD);
189	fStartRow = (const RBBIStateTableRow *)
190	(statetable->fTableData + (statetable->fRowLen * START_STATE));
191	UChar i;
192	const UTrie *trie = &fData->fTrie;
193	int16_t category;
194	fLatin1Cat = new int16_t[256];
195	for (i = 0; i < 256; ++i)
196	{
197	//UTRIE_GET16(trie, i, category);
198	//fLatin1Cat[i] = category;
199	fLatin1Cat[i] = _UTRIE_GET_RAW(trie, index, 0, i);
200	}
201	fStateFlags = new int32_t[statetable->fNumStates];
202	for (i = 0; i < statetable->fNumStates; ++i)
203	{
204	const RBBIStateTableRow row = (const RBBIStateTableRow )
205	(statetable->fTableData + (statetable->fRowLen * i));
206	int32_t flags = 0;
207	if (row->fAccepting == -1)
208	{
209	const int32_t *vals = (fData->fRuleStatusTable) + (row->fTagIdx);
210	const int32_t *valLimit = vals + 1;
211	valLimit += *vals++;
212	while (vals < valLimit)
213	{
214	int32_t val = *vals++;
215	if (val == 0)
216	{
217	break;
218	}
219	else if (val > 0)
220	{
221	flags \|= val;
222	}
223	else
224	{
225	flags = val;
226	break;
227	}
228	}
229	}
230	fStateFlags[i] = flags;
231	}
232	}
233
234	RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &err)
235	: RuleBasedBreakIterator(rules, parseErr, err)
236	{
237	init();
238	}
239
240	RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data, UErrorCode &status)
241	: RuleBasedBreakIterator((RBBIDataHeader *)data, status)
242	{
243	init();
244	}
245
46f4442e A	246	RuleBasedTokenizer::RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt, UErrorCode &status)
	247	: RuleBasedBreakIterator((const RBBIDataHeader *)data, RuleBasedBreakIterator::kDontAdopt, status)
	248	{
	249	init();
	250	}
	251
73c04bcf A	252	RuleBasedTokenizer::~RuleBasedTokenizer() {
	253	delete [] fStateFlags;
	254	delete [] fLatin1Cat;
	255	}
	256
	257	U_NAMESPACE_END
	258
	259	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */