+}*/
+
+
+//-------------------------------------------------------------------------------
+//
+// checkDictionary This function handles all processing of characters in
+// the "dictionary" set. It will determine the appropriate
+// course of action, and possibly set up a cache in the
+// process.
+//
+//-------------------------------------------------------------------------------
+int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
+ int32_t endPos,
+ UBool reverse) {
+ // Reset the old break cache first.
+ reset();
+
+ // note: code segment below assumes that dictionary chars are in the
+ // startPos-endPos range
+ // value returned should be next character in sequence
+ if ((endPos - startPos) <= 1) {
+ return (reverse ? startPos : endPos);
+ }
+
+ // Bug 5532. The dictionary code will crash if the input text is UTF-8
+ // because native indexes are different from UTF-16 indexes.
+ // Temporary hack: skip dictionary lookup for UTF-8 encoded text.
+ // It won't give the right breaks, but it's better than a crash.
+ //
+ // Check the type of the UText by checking its pFuncs field, which
+ // is UText's function dispatch table. It will be the same for all
+ // UTF-8 UTexts and different for any other UText type.
+ //
+ // We have no other type of UText available with non-UTF-16 native indexing.
+ // This whole check will go away once the dictionary code is fixed.
+ static const void *utext_utf8Funcs;
+ if (utext_utf8Funcs == NULL) {
+ // Cache the UTF-8 UText function pointer value.
+ UErrorCode status = U_ZERO_ERROR;
+ UText tempUText = UTEXT_INITIALIZER;
+ utext_openUTF8(&tempUText, NULL, 0, &status);
+ utext_utf8Funcs = tempUText.pFuncs;
+ utext_close(&tempUText);
+ }
+ if (fText->pFuncs == utext_utf8Funcs) {
+ return (reverse ? startPos : endPos);
+ }
+
+ // Starting from the starting point, scan towards the proposed result,
+ // looking for the first dictionary character (which may be the one
+ // we're on, if we're starting in the middle of a range).
+ utext_setNativeIndex(fText, reverse ? endPos : startPos);
+ if (reverse) {
+ UTEXT_PREVIOUS32(fText);
+ }
+
+ int32_t rangeStart = startPos;
+ int32_t rangeEnd = endPos;
+
+ uint16_t category;
+ int32_t current;
+ UErrorCode status = U_ZERO_ERROR;
+ UStack breaks(status);
+ int32_t foundBreakCount = 0;
+ UChar32 c = utext_current32(fText);
+
+ UTRIE_GET16(&fData->fTrie, c, category);
+
+ // Is the character we're starting on a dictionary character? If so, we
+ // need to back up to include the entire run; otherwise the results of
+ // the break algorithm will differ depending on where we start. Since
+ // the result is cached and there is typically a non-dictionary break
+ // within a small number of words, there should be little performance impact.
+ if (category & 0x4000) {
+ if (reverse) {
+ do {
+ utext_next32(fText); // TODO: recast to work directly with postincrement.
+ c = utext_current32(fText);
+ UTRIE_GET16(&fData->fTrie, c, category);
+ } while (c != U_SENTINEL && (category & 0x4000));
+ // Back up to the last dictionary character
+ rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
+ if (c == U_SENTINEL) {
+ // c = fText->last32();
+ // TODO: why was this if needed?
+ c = UTEXT_PREVIOUS32(fText);
+ }
+ else {
+ c = UTEXT_PREVIOUS32(fText);
+ }
+ }
+ else {
+ do {
+ c = UTEXT_PREVIOUS32(fText);
+ UTRIE_GET16(&fData->fTrie, c, category);
+ }
+ while (c != U_SENTINEL && (category & 0x4000));
+ // Back up to the last dictionary character
+ if (c == U_SENTINEL) {
+ // c = fText->first32();
+ c = utext_current32(fText);
+ }
+ else {
+ utext_next32(fText);
+ c = utext_current32(fText);
+ }
+ rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
+ }
+ UTRIE_GET16(&fData->fTrie, c, category);
+ }
+
+ // Loop through the text, looking for ranges of dictionary characters.
+ // For each span, find the appropriate break engine, and ask it to find
+ // any breaks within the span.
+ // Note: we always do this in the forward direction, so that the break
+ // cache is built in the right order.
+ if (reverse) {
+ utext_setNativeIndex(fText, rangeStart);
+ c = utext_current32(fText);
+ UTRIE_GET16(&fData->fTrie, c, category);
+ }
+ while(U_SUCCESS(status)) {
+ while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
+ utext_next32(fText); // TODO: tweak for post-increment operation
+ c = utext_current32(fText);
+ UTRIE_GET16(&fData->fTrie, c, category);
+ }
+ if (current >= rangeEnd) {
+ break;
+ }
+
+ // We now have a dictionary character. Get the appropriate language object
+ // to deal with it.
+ const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
+
+ // Ask the language object if there are any breaks. It will leave the text
+ // pointer on the other side of its range, ready to search for the next one.
+ if (lbe != NULL) {
+ foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
+ }
+
+ // Reload the loop variables for the next go-round
+ c = utext_current32(fText);
+ UTRIE_GET16(&fData->fTrie, c, category);
+ }
+
+ // If we found breaks, build a new break cache. The first and last entries must
+ // be the original starting and ending position.
+ if (foundBreakCount > 0) {
+ U_ASSERT(foundBreakCount == breaks.size());
+ int32_t totalBreaks = foundBreakCount;
+ if (startPos < breaks.elementAti(0)) {
+ totalBreaks += 1;
+ }
+ if (endPos > breaks.peeki()) {
+ totalBreaks += 1;
+ }
+ fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
+ if (fCachedBreakPositions != NULL) {
+ int32_t out = 0;
+ fNumCachedBreakPositions = totalBreaks;
+ if (startPos < breaks.elementAti(0)) {
+ fCachedBreakPositions[out++] = startPos;
+ }
+ for (int32_t i = 0; i < foundBreakCount; ++i) {
+ fCachedBreakPositions[out++] = breaks.elementAti(i);
+ }
+ if (endPos > fCachedBreakPositions[out-1]) {
+ fCachedBreakPositions[out] = endPos;
+ }
+ // If there are breaks, then by definition, we are replacing the original
+ // proposed break by one of the breaks we found. Use following() and
+ // preceding() to do the work. They should never recurse in this case.
+ if (reverse) {
+ return preceding(endPos);
+ }
+ else {
+ return following(startPos);
+ }
+ }
+ // If the allocation failed, just fall through to the "no breaks found" case.
+ }
+
+ // If we get here, there were no language-based breaks. Set the text pointer
+ // to the original proposed break.
+ utext_setNativeIndex(fText, reverse ? startPos : endPos);
+ return (reverse ? startPos : endPos);