-
-//-------------------------------------------------------------------------------
-//
-// isDictionaryChar Return true if the category lookup for this char
-// indicates that it is in the set of dictionary lookup
-// chars.
-//
-// This function is intended for use by dictionary based
-// break iterators.
-//
-//-------------------------------------------------------------------------------
-/*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) {
- if (fData == NULL) {
- return FALSE;
- }
- uint16_t category;
- UTRIE_GET16(&fData->fTrie, c, category);
- return (category & 0x4000) != 0;
-}*/
-
-
-//-------------------------------------------------------------------------------
-// checkDictionary Handles all processing of characters in the "dictionary" set (trie
-// category bit 0x4000) between startPos and endPos; it may rebuild the break cache.
-// startPos, endPos: native text indexes bounding the proposed break. reverse: selects
-// backward operation. Returns following(startPos) / preceding(endPos) when a break
-// engine found breaks and the cache was allocated; otherwise returns the originally
-// proposed break (endPos forward, startPos in reverse) and leaves fText set to it.
-//-------------------------------------------------------------------------------
-int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
- int32_t endPos,
- UBool reverse) {
- // Reset the old break cache first.
- reset();
-
- // note: code segment below assumes that dictionary chars are in the
- // startPos-endPos range
- // value returned should be next character in sequence
- if ((endPos - startPos) <= 1) { // nothing to subdivide: hand back the proposed break unchanged
- return (reverse ? startPos : endPos);
- }
-
- // Starting from the starting point, scan towards the proposed result,
- // looking for the first dictionary character (which may be the one
- // we're on, if we're starting in the middle of a range).
- utext_setNativeIndex(fText, reverse ? endPos : startPos);
- if (reverse) {
- UTEXT_PREVIOUS32(fText); // step back so the current char is the one just before endPos
- }
-
- int32_t rangeStart = startPos; // range handed to the break engine; may be widened below
- int32_t rangeEnd = endPos;
-
- uint16_t category; // trie lookup result for c; bit 0x4000 is tested as the dictionary flag
- int32_t current; // native index sampled at the top of the inner scan loop
- UErrorCode status = U_ZERO_ERROR;
- UStack breaks(status); // receives the break positions reported by findBreaks()
- int32_t foundBreakCount = 0; // running sum of the counts returned by findBreaks()
- UChar32 c = utext_current32(fText);
-
- UTRIE_GET16(&fData->fTrie, c, category);
-
- // Is the character we're starting on a dictionary character? If so, we
- // need to back up to include the entire run; otherwise the results of
- // the break algorithm will differ depending on where we start. Since
- // the result is cached and there is typically a non-dictionary break
- // within a small number of words, there should be little performance impact.
- if (category & 0x4000) {
- if (reverse) {
- do { // reverse: walk forward to the end of the dictionary run (or end of text)
- utext_next32(fText); // TODO: recast to work directly with postincrement.
- c = utext_current32(fText);
- UTRIE_GET16(&fData->fTrie, c, category);
- } while (c != U_SENTINEL && (category & 0x4000));
- // Back up to the last dictionary character
- rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText); // index where the forward walk stopped
- if (c == U_SENTINEL) { // NOTE(review): both branches of this if/else execute the same statement
- // c = fText->last32();
- // TODO: why was this if needed?
- c = UTEXT_PREVIOUS32(fText);
- }
- else {
- c = UTEXT_PREVIOUS32(fText);
- }
- }
- else {
- do { // forward: walk backward to the start of the dictionary run (or start of text)
- c = UTEXT_PREVIOUS32(fText);
- UTRIE_GET16(&fData->fTrie, c, category);
- }
- while (c != U_SENTINEL && (category & 0x4000));
- // Back up to the last dictionary character
- if (c == U_SENTINEL) { // walked off the front of the text: use the char at the current index
- // c = fText->first32();
- c = utext_current32(fText);
- }
- else { // stopped on a non-dictionary char: step forward over it again
- utext_next32(fText);
- c = utext_current32(fText);
- }
- rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
- }
- UTRIE_GET16(&fData->fTrie, c, category); // refresh category for the char selected above
- }
-
- // Loop through the text, looking for ranges of dictionary characters.
- // For each span, find the appropriate break engine, and ask it to find
- // any breaks within the span.
- // Note: we always do this in the forward direction, so that the break
- // cache is built in the right order.
- if (reverse) { // reposition to the start of the range so the scan below runs forward
- utext_setNativeIndex(fText, rangeStart);
- c = utext_current32(fText);
- UTRIE_GET16(&fData->fTrie, c, category);
- }
- while(U_SUCCESS(status)) {
- while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) { // skip non-dictionary chars
- utext_next32(fText); // TODO: tweak for post-increment operation
- c = utext_current32(fText);
- UTRIE_GET16(&fData->fTrie, c, category);
- }
- if (current >= rangeEnd) { // reached the end of the range without another dictionary char
- break;
- }
-
- // We now have a dictionary character. Get the appropriate language object
- // to deal with it.
- const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
-
- // Ask the language object if there are any breaks. It will leave the text
- // pointer on the other side of its range, ready to search for the next one.
- if (lbe != NULL) {
- foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
- }
-
- // Reload the loop variables for the next go-round
- c = utext_current32(fText);
- UTRIE_GET16(&fData->fTrie, c, category);
- }
-
- // If we found breaks, build a new break cache. The first and last entries must
- // be the original starting and ending position.
- if (foundBreakCount > 0) {
- U_ASSERT(foundBreakCount == breaks.size());
- int32_t totalBreaks = foundBreakCount;
- if (startPos < breaks.elementAti(0)) { // reserve a leading slot for startPos
- totalBreaks += 1;
- }
- if (endPos > breaks.peeki()) { // reserve a trailing slot for endPos (peeki() presumably yields the last pushed break)
- totalBreaks += 1;
- }
- fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
- if (fCachedBreakPositions != NULL) {
- int32_t out = 0; // next write position in fCachedBreakPositions
- fNumCachedBreakPositions = totalBreaks;
- if (startPos < breaks.elementAti(0)) {
- fCachedBreakPositions[out++] = startPos;
- }
- for (int32_t i = 0; i < foundBreakCount; ++i) {
- fCachedBreakPositions[out++] = breaks.elementAti(i);
- }
- if (endPos > fCachedBreakPositions[out-1]) {
- fCachedBreakPositions[out] = endPos; // last slot; out is not advanced because nothing is written after it
- }
- // If there are breaks, then by definition, we are replacing the original
- // proposed break by one of the breaks we found. Use following() and
- // preceding() to do the work. They should never recurse in this case.
- if (reverse) {
- return preceding(endPos);
- }
- else {
- return following(startPos);
- }
- }
- // If the allocation failed, just fall through to the "no breaks found" case.
- }
-
- // If we get here, there were no language-based breaks. Set the text pointer
- // to the original proposed break.
- utext_setNativeIndex(fText, reverse ? startPos : endPos);
- return (reverse ? startPos : endPos);
-}
-