/*
******************************************************************************
-* Copyright (C) 2006-2008, 2017 Apple Inc. All Rights Reserved.
+* Copyright (C) 2006-2008, 2017-2018 Apple Inc. All Rights Reserved.
******************************************************************************
*/
#if !UCONFIG_NO_BREAK_ITERATION
-#include "unicode/utext.h"
#include "unicode/ubrk.h"
#include "unicode/parseerr.h"
/**
* The interfaces here are meant to extend the functionality of the standard
* ubrk_* interfaces in ubrk.h to allow for faster batch tokenization. This
- * was primarily intended for Spotlight and related processes. Note that
- * urbtok_tokenize here does not fully support all features of ICU break rules.
+ * was primarily intended for Spotlight and related processes. There are two
+ * versions of these:
+ *
+ * The versions prefixed urbtok_ extend the standard ICU RuleBasedBreakIterator
+ * class. These are intended to fully support all of the current rule syntax used
+ * by that class, and should urbtok_tokenize give results equivalent to a loop using a
+ * combination of the standard functions ubrk_next to get the next break (determining
+ * the length of the previous token) and ubrk_getRuleStatusVec to get a flag value
+ * formed as the bitwise OR of all of the values in the returnend vector, skipping all
+ * tokens whose flag value is -1. urbtok_tokenize is faster than such a loop since it
+ * assumes only one pass over the text in the forward direction, and shut skips caching
+ * of breaks positions and makes other simplifying assumptions. However, it may not be
+ * fast enough fo Spotlight.
+ *
+ * Thus we also include the versions prefixed by urbtok57_, which use a legacy ICU 57
+ * version of RuleBasedBreakIterator and an Apple subclass RuleBasedTokenizer. These
+ * versions do not support any RuleBasedBreakIterator rule sytax enhancements from
+ * later than ICU 57.
+ *
+ * The two different sets of functions should not be mixed; urbtok57_getBinaryRules
+ * should only be used with a UBreakIterator created using urbtok57_openRules;
+ * urbtok57_tokenize should only be used with a UBreakIterator created using
+ * urbtok57_openRules or urbtok_openBinaryRules[NoCopy], etc. Similarly, the
+ * urbtok_ functions should only be used with other urbtok_ functions.
+ */
+
+/**
+ * struct for returning token results
*/
-
typedef struct RuleBasedTokenRange {
signed long location;
signed long length;
} RuleBasedTokenRange;
+/**
+ * Open a new UBreakIterator for locating text boundaries for a specified locale.
+ * A UBreakIterator may be used for detecting character, line, word,
+ * and sentence breaks in text.
+ * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
+ * UBRK_LINE, UBRK_SENTENCE
+ * @param locale The locale specifying the text-breaking conventions. Note that
+ * locale keys such as "lb" and "ss" may be used to modify text break behavior,
+ * see general discussion of BreakIterator C API.
+ * @param status A UErrorCode to receive any errors.
+ * @return A UBreakIterator for the specified type and locale.
+ * @see ubrk_open
+ * @internal
+ */
+U_INTERNAL UBreakIterator* U_EXPORT2
+urbtok_open(UBreakIteratorType type,
+ const char *locale,
+ UErrorCode *status);
+
/**
* Open a new UBreakIterator for tokenizing text using specified breaking rules.
* The rule syntax is ... (TBD)
UErrorCode *status);
+
+/**
+ * Open a new UBreakIterator for tokenizing text using specified breaking rules.
+ * The rule syntax is ... (TBD)
+ * @param rules A set of rules specifying the text breaking conventions.
+ * @param rulesLength The number of characters in rules, or -1 if null-terminated.
+ * @param parseErr Receives position and context information for any syntax errors
+ * detected while parsing the rules.
+ * @param status A UErrorCode to receive any errors.
+ * @return A UBreakIterator for the specified rules.
+ * @see ubrk_open
+ * @internal
+ */
+U_INTERNAL UBreakIterator* U_EXPORT2
+urbtok57_openRules(const UChar *rules,
+ int32_t rulesLength,
+ UParseError *parseErr,
+ UErrorCode *status);
+
+/**
+ * Open a new UBreakIterator for tokenizing text using specified breaking rules.
+ * @param rules A set of rules specifying the text breaking conventions. The binary rules
+ * must be at least 32-bit aligned. Note: This version makes a copy of the
+ * rules, so after calling this function the caller can close or release
+ * the rules that were passed to this function. The copy created by this
+ * call will be freed when ubrk_close() is called on the UBreakIterator*.
+ * @param status A UErrorCode to receive any errors.
+ * @return A UBreakIterator for the specified rules.
+ * @see ubrk_open
+ * @internal
+ */
+U_INTERNAL UBreakIterator* U_EXPORT2
+urbtok57_openBinaryRules(const uint8_t *rules,
+ UErrorCode *status);
+
+/**
+ * Open a new UBreakIterator for tokenizing text using specified breaking rules.
+ * @param rules A set of rules specifying the text breaking conventions. The binary rules
+ * must be at least 32-bit aligned. Note: This version does NOT make a copy
+ * of the rules, so after calling this function the caller must not close or
+ * release the rules passed to this function until after they are finished
+ * with this UBreakIterator* (and any others created using the same rules)
+ * and have called ubrk_close() to close the UBreakIterator* (and any others
+ * using the same rules).
+ * @param status A UErrorCode to receive any errors.
+ * @return A UBreakIterator for the specified rules.
+ * @see ubrk_open
+ * @internal
+ */
+U_INTERNAL UBreakIterator* U_EXPORT2
+urbtok57_openBinaryRulesNoCopy(const uint8_t *rules,
+ UErrorCode *status);
+
+/**
+ * Get the (native-endian) binary break rules for this tokenizer.
+ * @param bi The tokenizer to use.
+ * @param buffer The output buffer for the rules. You can pass 0 to get the required size.
+ * @param buffSize The size of the output buffer.
+ * @param status A UErrorCode to receive any errors.
+ * @return The actual size of the binary rules, whether they fit the buffer or not.
+ * @internal
+ */
+U_INTERNAL uint32_t U_EXPORT2
+urbtok57_getBinaryRules(UBreakIterator *bi,
+ uint8_t *buffer,
+ uint32_t buffSize,
+ UErrorCode *status);
+
+/**
+ * Tokenize text using a rule-based tokenizer.
+ * This is primarily intended for speedy batch tokenization using very simple rules.
+ * It does not currently implement support for all of the features of ICU break rules
+ * (adding that would reduce performance). If you need support for all of the ICU rule
+ * features, please use the standard Apple urbtok_tokenize, or a loop with standard
+ * ICU interfaes ubrk_next and ubrk_getRuleStatusVec.
+ *
+ * @param bi The tokenizer to use.
+ * @param maxTokens The maximum number of tokens to return.
+ * @param outTokens An array of RuleBasedTokenRange to fill in with the tokens.
+ * @param outTokenFlags An (optional) array of uint32_t to fill in with token flags.
+ * @return The number of tokens returned, 0 if done.
+ * @internal
+ */
+U_INTERNAL int32_t U_EXPORT2
+urbtok57_tokenize(UBreakIterator *bi,
+ int32_t maxTokens,
+ RuleBasedTokenRange *outTokens,
+ unsigned long *outTokenFlags);
+
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif