]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/unicode/urbtok.h
ICU-66108.tar.gz
[apple/icu.git] / icuSources / common / unicode / urbtok.h
CommitLineData
73c04bcf
A
1/*
2******************************************************************************
0f5d89e8 3* Copyright (C) 2006-2008, 2017-2018 Apple Inc. All Rights Reserved.
73c04bcf
A
4******************************************************************************
5*/
6
7#ifndef URBTOK_H
8#define URBTOK_H
9
10#include "unicode/utypes.h"
11
12#if !UCONFIG_NO_BREAK_ITERATION
13
73c04bcf
A
14#include "unicode/ubrk.h"
15#include "unicode/parseerr.h"
16
f3c0d7a5
A
17/**
18 * The interfaces here are meant to extend the functionality of the standard
19 * ubrk_* interfaces in ubrk.h to allow for faster batch tokenization. This
0f5d89e8
A
20 * was primarily intended for Spotlight and related processes. There are two
21 * versions of these:
22 *
23 * The versions prefixed urbtok_ extend the standard ICU RuleBasedBreakIterator
24 * class. These are intended to fully support all of the current rule syntax used
25 * by that class, and urbtok_tokenize should give results equivalent to a loop using a
26 * combination of the standard functions ubrk_next to get the next break (determining
27 * the length of the previous token) and ubrk_getRuleStatusVec to get a flag value
28 * formed as the bitwise OR of all of the values in the returned vector, skipping all
29 * tokens whose flag value is -1. urbtok_tokenize is faster than such a loop since it
30 * assumes only one pass over the text in the forward direction, and thus skips caching
31 * of break positions and makes other simplifying assumptions. However, it may not be
32 * fast enough for Spotlight.
33 *
34 * Thus we also include the versions prefixed by urbtok57_, which use a legacy ICU 57
35 * version of RuleBasedBreakIterator and an Apple subclass RuleBasedTokenizer. These
36 * versions do not support any RuleBasedBreakIterator rule syntax enhancements from
37 * later than ICU 57.
38 *
39 * The two different sets of functions should not be mixed; urbtok57_getBinaryRules
40 * should only be used with a UBreakIterator created using urbtok57_openRules;
41 * urbtok57_tokenize should only be used with a UBreakIterator created using
42 * urbtok57_openRules or urbtok57_openBinaryRules[NoCopy], etc. Similarly, the
43 * urbtok_ functions should only be used with other urbtok_ functions.
44 */
45
46/**
47 * struct for returning token results
f3c0d7a5 48 */
73c04bcf
A
49typedef struct RuleBasedTokenRange { /* one token result; urbtok_tokenize / urbtok57_tokenize fill an array of these */
50 signed long location; /* NOTE(review): presumably the token's start offset in the text being tokenized; units not stated here -- confirm */
51 signed long length; /* NOTE(review): presumably the token's length, in the same units as location -- confirm */
52} RuleBasedTokenRange;
53
0f5d89e8
A
54/**
55 * Open a new UBreakIterator for locating text boundaries for a specified locale.
56 * A UBreakIterator may be used for detecting character, line, word,
57 * and sentence breaks in text.
58 * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
59 * UBRK_LINE, UBRK_SENTENCE
60 * @param locale The locale specifying the text-breaking conventions. Note that
61 * locale keys such as "lb" and "ss" may be used to modify text break behavior,
62 * see general discussion of BreakIterator C API.
63 * @param status A UErrorCode to receive any errors.
64 * @return A UBreakIterator for the specified type and locale.
65 * @see ubrk_open
66 * @internal
67 */
68U_INTERNAL UBreakIterator* U_EXPORT2
69urbtok_open(UBreakIteratorType type,
70 const char *locale,
71 UErrorCode *status);
72
73c04bcf
A
73/**
74 * Open a new UBreakIterator for tokenizing text using specified breaking rules.
75 * The rule syntax is ... (TBD)
76 * @param rules A set of rules specifying the text breaking conventions.
77 * @param rulesLength The number of characters in rules, or -1 if null-terminated.
78 * @param parseErr Receives position and context information for any syntax errors
79 * detected while parsing the rules.
80 * @param status A UErrorCode to receive any errors.
81 * @return A UBreakIterator for the specified rules.
82 * @see ubrk_open
83 * @internal
84 */
85U_INTERNAL UBreakIterator* U_EXPORT2
86urbtok_openRules(const UChar *rules,
87 int32_t rulesLength,
88 UParseError *parseErr,
89 UErrorCode *status);
90
91/**
92 * Open a new UBreakIterator for tokenizing text using specified breaking rules.
93 * @param rules A set of rules specifying the text breaking conventions. The binary rules
46f4442e
A
94 * must be at least 32-bit aligned. Note: This version makes a copy of the
95 * rules, so after calling this function the caller can close or release
96 * the rules that were passed to this function. The copy created by this
97 * call will be freed when ubrk_close() is called on the UBreakIterator*.
73c04bcf
A
98 * @param status A UErrorCode to receive any errors.
99 * @return A UBreakIterator for the specified rules.
100 * @see ubrk_open
101 * @internal
102 */
103U_INTERNAL UBreakIterator* U_EXPORT2
104urbtok_openBinaryRules(const uint8_t *rules,
46f4442e
A
105 UErrorCode *status);
106
107/**
108 * Open a new UBreakIterator for tokenizing text using specified breaking rules.
109 * @param rules A set of rules specifying the text breaking conventions. The binary rules
110 * must be at least 32-bit aligned. Note: This version does NOT make a copy
111 * of the rules, so after calling this function the caller must not close or
112 * release the rules passed to this function until after they are finished
113 * with this UBreakIterator* (and any others created using the same rules)
114 * and have called ubrk_close() to close the UBreakIterator* (and any others
115 * using the same rules).
116 * @param status A UErrorCode to receive any errors.
117 * @return A UBreakIterator for the specified rules.
118 * @see ubrk_open
119 * @internal
120 */
121U_INTERNAL UBreakIterator* U_EXPORT2
122urbtok_openBinaryRulesNoCopy(const uint8_t *rules,
73c04bcf
A
123 UErrorCode *status);
124
125/**
126 * Get the (native-endian) binary break rules for this tokenizer.
127 * @param bi The tokenizer to use.
128 * @param buffer The output buffer for the rules. You can pass 0 to get the required size.
129 * @param buffSize The size of the output buffer.
130 * @param status A UErrorCode to receive any errors.
131 * @return The actual size of the binary rules, whether they fit the buffer or not.
132 * @internal
133 */
134U_INTERNAL uint32_t U_EXPORT2
135urbtok_getBinaryRules(UBreakIterator *bi,
136 uint8_t *buffer,
137 uint32_t buffSize,
138 UErrorCode *status);
139
140/**
141 * Tokenize text using a rule-based tokenizer.
f3c0d7a5
A
142 * This is primarily intended for speedy batch tokenization using very simple rules.
143 * It does not currently implement support for all of the features of ICU break rules
144 * (adding that would reduce performance). If you need support for all of the ICU rule
145 * features, please use the standard ubrk_* interfaces; instead of urbtok_tokenize,
146 * use a loop with ubrk_next and ubrk_getRuleStatus.
147 *
73c04bcf
A
148 * @param bi The tokenizer to use.
149 * @param maxTokens The maximum number of tokens to return.
150 * @param outTokens An array of RuleBasedTokenRange to fill in with the tokens.
151 * @param outTokenFlags An (optional) array of unsigned long to fill in with token flags.
152 * @return The number of tokens returned, 0 if done.
153 * @internal
154 */
155U_INTERNAL int32_t U_EXPORT2
156urbtok_tokenize(UBreakIterator *bi,
157 int32_t maxTokens,
158 RuleBasedTokenRange *outTokens,
159 unsigned long *outTokenFlags);
160
161/**
162 * Swap the endianness of a set of binary break rules.
163 * @param rules A set of rules which need swapping.
164 * @param buffer The output buffer for the swapped rules, which must be the same
165 * size as the input rules buffer.
166 * @param inIsBigEndian UBool indicating whether the input is big-endian
167 * @param outIsBigEndian UBool indicating whether the output should be big-endian
168 * @param status A UErrorCode to receive any errors.
169 * @internal
170 */
171U_INTERNAL void U_EXPORT2
172urbtok_swapBinaryRules(const uint8_t *rules,
173 uint8_t *buffer,
174 UBool inIsBigEndian,
175 UBool outIsBigEndian,
176 UErrorCode *status);
177
178
0f5d89e8
A
179
180/**
181 * Open a new UBreakIterator for tokenizing text using specified breaking rules.
182 * The rule syntax is ... (TBD)
183 * @param rules A set of rules specifying the text breaking conventions.
184 * @param rulesLength The number of characters in rules, or -1 if null-terminated.
185 * @param parseErr Receives position and context information for any syntax errors
186 * detected while parsing the rules.
187 * @param status A UErrorCode to receive any errors.
188 * @return A UBreakIterator for the specified rules.
189 * @see ubrk_open
190 * @internal
191 */
192U_INTERNAL UBreakIterator* U_EXPORT2
193urbtok57_openRules(const UChar *rules,
194 int32_t rulesLength,
195 UParseError *parseErr,
196 UErrorCode *status);
197
198/**
199 * Open a new UBreakIterator for tokenizing text using specified breaking rules.
200 * @param rules A set of rules specifying the text breaking conventions. The binary rules
201 * must be at least 32-bit aligned. Note: This version makes a copy of the
202 * rules, so after calling this function the caller can close or release
203 * the rules that were passed to this function. The copy created by this
204 * call will be freed when ubrk_close() is called on the UBreakIterator*.
205 * @param status A UErrorCode to receive any errors.
206 * @return A UBreakIterator for the specified rules.
207 * @see ubrk_open
208 * @internal
209 */
210U_INTERNAL UBreakIterator* U_EXPORT2
211urbtok57_openBinaryRules(const uint8_t *rules,
212 UErrorCode *status);
213
214/**
215 * Open a new UBreakIterator for tokenizing text using specified breaking rules.
216 * @param rules A set of rules specifying the text breaking conventions. The binary rules
217 * must be at least 32-bit aligned. Note: This version does NOT make a copy
218 * of the rules, so after calling this function the caller must not close or
219 * release the rules passed to this function until after they are finished
220 * with this UBreakIterator* (and any others created using the same rules)
221 * and have called ubrk_close() to close the UBreakIterator* (and any others
222 * using the same rules).
223 * @param status A UErrorCode to receive any errors.
224 * @return A UBreakIterator for the specified rules.
225 * @see ubrk_open
226 * @internal
227 */
228U_INTERNAL UBreakIterator* U_EXPORT2
229urbtok57_openBinaryRulesNoCopy(const uint8_t *rules,
230 UErrorCode *status);
231
232/**
233 * Get the (native-endian) binary break rules for this tokenizer.
234 * @param bi The tokenizer to use.
235 * @param buffer The output buffer for the rules. You can pass 0 to get the required size.
236 * @param buffSize The size of the output buffer.
237 * @param status A UErrorCode to receive any errors.
238 * @return The actual size of the binary rules, whether they fit the buffer or not.
239 * @internal
240 */
241U_INTERNAL uint32_t U_EXPORT2
242urbtok57_getBinaryRules(UBreakIterator *bi,
243 uint8_t *buffer,
244 uint32_t buffSize,
245 UErrorCode *status);
246
247/**
248 * Tokenize text using a rule-based tokenizer.
249 * This is primarily intended for speedy batch tokenization using very simple rules.
250 * It does not currently implement support for all of the features of ICU break rules
251 * (adding that would reduce performance). If you need support for all of the ICU rule
252 * features, please use the standard Apple urbtok_tokenize, or a loop with standard
253 * ICU interfaces ubrk_next and ubrk_getRuleStatusVec.
254 *
255 * @param bi The tokenizer to use.
256 * @param maxTokens The maximum number of tokens to return.
257 * @param outTokens An array of RuleBasedTokenRange to fill in with the tokens.
258 * @param outTokenFlags An (optional) array of unsigned long to fill in with token flags.
259 * @return The number of tokens returned, 0 if done.
260 * @internal
261 */
262U_INTERNAL int32_t U_EXPORT2
263urbtok57_tokenize(UBreakIterator *bi,
264 int32_t maxTokens,
265 RuleBasedTokenRange *outTokens,
266 unsigned long *outTokenFlags);
267
73c04bcf
A
268#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
269
270#endif