icuSources/common/unicode/urbtok.h

   1 /*
   2 ******************************************************************************
   3 * Copyright (C) 2006-2008, 2017-2018 Apple Inc. All Rights Reserved.
   4 ******************************************************************************
   5 */
   6
   7 #ifndef URBTOK_H
   8 #define URBTOK_H
   9
  10 #include "unicode/utypes.h"
  11
  12 #if !UCONFIG_NO_BREAK_ITERATION
  13
  14 #include "unicode/ubrk.h"
  15 #include "unicode/parseerr.h"
  16
  17 /**
  18  * The interfaces here are meant to extend the functionality of the standard
  19  * ubrk_* interfaces in ubrk.h to allow for faster batch tokenization. This
  20  * was primarily intended for Spotlight and related processes. There are two
  21  * versions of these:
  22  *
  23  * The versions prefixed urbtok_ extend the standard ICU RuleBasedBreakIterator
  24  * class. These are intended to fully support all of the current rule syntax used
  25  * by that class, and urbtok_tokenize should give results equivalent to a loop using a
  26  * combination of the standard functions ubrk_next to get the next break (determining
  27  * the length of the previous token) and ubrk_getRuleStatusVec to get a flag value
  28  * formed as the bitwise OR of all of the values in the returned vector, skipping all
  29  * tokens whose flag value is -1. urbtok_tokenize is faster than such a loop since it
  30  * assumes only one pass over the text in the forward direction, and thus skips caching
  31  * of break positions and makes other simplifying assumptions. However, it may not be
  32  * fast enough for Spotlight.
  33  *
  34  * Thus we also include the versions prefixed by urbtok57_, which use a legacy ICU 57
  35  * version of RuleBasedBreakIterator and an Apple subclass RuleBasedTokenizer. These
  36  * versions do not support any RuleBasedBreakIterator rule syntax enhancements from
  37  * later than ICU 57.
  38  *
  39  * The two different sets of functions should not be mixed; urbtok57_getBinaryRules
  40  * should only be used with a UBreakIterator created using urbtok57_openRules;
  41  * urbtok57_tokenize should only be used with a UBreakIterator created using
  42  * urbtok57_openRules or urbtok57_openBinaryRules[NoCopy], etc. Similarly, the
  43  * urbtok_ functions should only be used with other urbtok_ functions.
  44  */
  45
  46 /**
  47  * Struct describing one token returned by urbtok_tokenize / urbtok57_tokenize.
  48  */
  49 typedef struct RuleBasedTokenRange {
  50     signed long location;   /* token start; presumably an offset into the tokenized text - confirm units */
  51     signed long length;     /* token length; presumably in the same units as location - confirm */
  52 } RuleBasedTokenRange;
  53
  54 /**
  55  * Open a new UBreakIterator, for use with the other urbtok_ functions, for a specified locale.
  56  * A UBreakIterator may be used for detecting character, line, word,
  57  * and sentence breaks in text.
  58  * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
  59  * UBRK_LINE, UBRK_SENTENCE
  60  * @param locale The locale specifying the text-breaking conventions. Note that
  61  * locale keys such as "lb" and "ss" may be used to modify text break behavior,
  62  * see general discussion of BreakIterator C API.
  63  * @param status A UErrorCode to receive any errors.
  64  * @return A UBreakIterator for the specified type and locale (presumably closed with ubrk_close() - confirm).
  65  * @see ubrk_open
  66  * @internal
  67  */
  68 U_INTERNAL UBreakIterator* U_EXPORT2
  69 urbtok_open(UBreakIteratorType type,
  70            const char *locale,
  71            UErrorCode *status);
  72
  73 /**
  74  * Open a new UBreakIterator for tokenizing text using specified breaking rules.
  75  * The rule syntax is intended to be that of the standard ICU RuleBasedBreakIterator class.
  76  * @param rules A set of rules specifying the text breaking conventions.
  77  * @param rulesLength The number of characters in rules, or -1 if null-terminated.
  78  * @param parseErr   Receives position and context information for any syntax errors
  79  *                   detected while parsing the rules.
  80  * @param status A UErrorCode to receive any errors.
  81  * @return A UBreakIterator for the specified rules; use it only with other urbtok_ functions.
  82  * @see ubrk_open
  83  * @internal
  84  */
  85 U_INTERNAL UBreakIterator* U_EXPORT2
  86 urbtok_openRules(const UChar     *rules,
  87                int32_t         rulesLength,
  88                UParseError     *parseErr,
  89                UErrorCode      *status);
  90
  91 /**
  92  * Open a new UBreakIterator for tokenizing text using specified binary breaking rules.
  93  * @param rules A set of binary rules specifying the text breaking conventions. The binary
  94  *              rules must be at least 32-bit aligned. Note: This version makes a copy of
  95  *              the rules, so after calling this function the caller can close or release
  96  *              the rules that were passed to this function. The copy created by this
  97  *              call will be freed when ubrk_close() is called on the UBreakIterator*.
  98  * @param status A UErrorCode to receive any errors.
  99  * @return A UBreakIterator for the specified rules.
 100  * @see ubrk_open
 101  * @internal
 102  */
 103 U_INTERNAL UBreakIterator* U_EXPORT2
 104 urbtok_openBinaryRules(const uint8_t *rules,
 105                UErrorCode      *status);
 106
 107 /**
 108  * Open a new UBreakIterator for tokenizing text using specified binary breaking rules.
 109  * @param rules A set of binary rules specifying the text breaking conventions. The binary
 110  *              rules must be at least 32-bit aligned. Note: This version does NOT make a
 111  *              copy of the rules, so after calling this function the caller must not close
 112  *              or release the rules passed to this function until after they are finished
 113  *              with this UBreakIterator* (and any others created using the same rules)
 114  *              and have called ubrk_close() to close the UBreakIterator* (and any others
 115  *              using the same rules).
 116  * @param status A UErrorCode to receive any errors.
 117  * @return A UBreakIterator for the specified rules.
 118  * @see ubrk_open
 119  * @internal
 120  */
 121 U_INTERNAL UBreakIterator* U_EXPORT2
 122 urbtok_openBinaryRulesNoCopy(const uint8_t *rules,
 123                UErrorCode      *status);
 124
 125 /**
 126  * Get the (native-endian) binary break rules for this tokenizer.
 127  * @param bi The tokenizer to use.
 128  * @param buffer The output buffer for the rules. Pass NULL (0) to just get the required size.
 129  * @param buffSize The size of the output buffer (presumably in bytes, as buffer is uint8_t* - confirm).
 130  * @param status A UErrorCode to receive any errors.
 131  * @return The actual size of the binary rules, whether they fit the buffer or not.
 132  * @internal
 133  */
 134 U_INTERNAL uint32_t U_EXPORT2
 135 urbtok_getBinaryRules(UBreakIterator      *bi,
 136                 uint8_t             *buffer,
 137                 uint32_t            buffSize,
 138                 UErrorCode          *status);
 139
 140 /**
 141  * Tokenize text using a rule-based tokenizer.
 142  * This is primarily intended for speedy batch tokenization using very simple rules.
 143  * It does not currently implement support for all of the features of ICU break rules
 144  * (adding that would reduce performance). If you need support for all of the ICU rule
 145  * features, please use the standard ubrk_* interfaces; instead of urbtok_tokenize,
 146  * use a loop with ubrk_next and ubrk_getRuleStatus.
 147  * NOTE(review): the overview in this file says urbtok_ aims to support the full rule syntax - confirm.
 148  * @param bi The tokenizer to use.
 149  * @param maxTokens The maximum number of tokens to return.
 150  * @param outTokens An array of RuleBasedTokenRange to fill in with the tokens.
 151  * @param outTokenFlags An (optional) array of unsigned long to fill in with token flags.
 152  * @return The number of tokens returned, 0 if done.
 153  * @internal
 154  */
 155 U_INTERNAL int32_t U_EXPORT2
 156 urbtok_tokenize(UBreakIterator      *bi,
 157                int32_t              maxTokens,
 158                RuleBasedTokenRange  *outTokens,
 159                unsigned long        *outTokenFlags);
 160
 161 /**
 162  * Swap the endianness of a set of binary break rules.
 163  * @param rules A set of binary rules which need swapping (no length is passed; presumably read from the data - confirm).
 164  * @param buffer The output buffer for the swapped rules, which must be the same
 165  *               size as the input rules buffer.
 166  * @param inIsBigEndian UBool indicating whether the input is big-endian
 167  * @param outIsBigEndian UBool indicating whether the output should be big-endian
 168  * @param status A UErrorCode to receive any errors.
 169  * @internal
 170  */
 171 U_INTERNAL void U_EXPORT2
 172 urbtok_swapBinaryRules(const uint8_t *rules,
 173                uint8_t          *buffer,
 174                UBool            inIsBigEndian,
 175                UBool            outIsBigEndian,
 176                UErrorCode       *status);
 177
 178
 179
 180 /**
 181  * Open a new UBreakIterator for tokenizing text using specified breaking rules.
 182  * The rule syntax is that of the legacy ICU 57 RuleBasedBreakIterator; later enhancements are not supported.
 183  * @param rules A set of rules specifying the text breaking conventions.
 184  * @param rulesLength The number of characters in rules, or -1 if null-terminated.
 185  * @param parseErr   Receives position and context information for any syntax errors
 186  *                   detected while parsing the rules.
 187  * @param status A UErrorCode to receive any errors.
 188  * @return A UBreakIterator for the specified rules; use it only with other urbtok57_ functions.
 189  * @see ubrk_open
 190  * @internal
 191  */
 192 U_INTERNAL UBreakIterator* U_EXPORT2
 193 urbtok57_openRules(const UChar     *rules,
 194                int32_t         rulesLength,
 195                UParseError     *parseErr,
 196                UErrorCode      *status);
 197
 198 /**
 199  * Open a new UBreakIterator for tokenizing text using specified binary breaking rules.
 200  * @param rules A set of binary rules specifying the text breaking conventions. The binary
 201  *              rules must be at least 32-bit aligned. Note: This version makes a copy of
 202  *              the rules, so after calling this function the caller can close or release
 203  *              the rules that were passed to this function. The copy created by this
 204  *              call will be freed when ubrk_close() is called on the UBreakIterator*.
 205  * @param status A UErrorCode to receive any errors.
 206  * @return A UBreakIterator for the specified rules.
 207  * @see ubrk_open
 208  * @internal
 209  */
 210 U_INTERNAL UBreakIterator* U_EXPORT2
 211 urbtok57_openBinaryRules(const uint8_t *rules,
 212                UErrorCode      *status);
 213
 214 /**
 215  * Open a new UBreakIterator for tokenizing text using specified binary breaking rules.
 216  * @param rules A set of binary rules specifying the text breaking conventions. The binary
 217  *              rules must be at least 32-bit aligned. Note: This version does NOT make a
 218  *              copy of the rules, so after calling this function the caller must not close
 219  *              or release the rules passed to this function until after they are finished
 220  *              with this UBreakIterator* (and any others created using the same rules)
 221  *              and have called ubrk_close() to close the UBreakIterator* (and any others
 222  *              using the same rules).
 223  * @param status A UErrorCode to receive any errors.
 224  * @return A UBreakIterator for the specified rules.
 225  * @see ubrk_open
 226  * @internal
 227  */
 228 U_INTERNAL UBreakIterator* U_EXPORT2
 229 urbtok57_openBinaryRulesNoCopy(const uint8_t *rules,
 230                UErrorCode      *status);
 231
 232 /**
 233  * Get the (native-endian) binary break rules for this tokenizer.
 234  * @param bi The tokenizer to use.
 235  * @param buffer The output buffer for the rules. Pass NULL (0) to just get the required size.
 236  * @param buffSize The size of the output buffer (presumably in bytes, as buffer is uint8_t* - confirm).
 237  * @param status A UErrorCode to receive any errors.
 238  * @return The actual size of the binary rules, whether they fit the buffer or not.
 239  * @internal
 240  */
 241 U_INTERNAL uint32_t U_EXPORT2
 242 urbtok57_getBinaryRules(UBreakIterator      *bi,
 243                 uint8_t             *buffer,
 244                 uint32_t            buffSize,
 245                 UErrorCode          *status);
 246
 247 /**
 248  * Tokenize text using a rule-based tokenizer.
 249  * This is primarily intended for speedy batch tokenization using very simple rules.
 250  * It does not currently implement support for all of the features of ICU break rules
 251  * (adding that would reduce performance). If you need support for all of the ICU rule
 252  * features, please use the standard Apple urbtok_tokenize, or a loop with standard
 253  * ICU interfaces ubrk_next and ubrk_getRuleStatusVec.
 254  *
 255  * @param bi The tokenizer to use.
 256  * @param maxTokens The maximum number of tokens to return.
 257  * @param outTokens An array of RuleBasedTokenRange to fill in with the tokens.
 258  * @param outTokenFlags An (optional) array of unsigned long to fill in with token flags.
 259  * @return The number of tokens returned, 0 if done.
 260  * @internal
 261  */
 262 U_INTERNAL int32_t U_EXPORT2
 263 urbtok57_tokenize(UBreakIterator      *bi,
 264                int32_t              maxTokens,
 265                RuleBasedTokenRange  *outTokens,
 266                unsigned long        *outTokenFlags);
 267
 268 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
 269
 270 #endif