git.saurik.com Git - apple/icu.git/blob - icuSources/common/unicode/urbtok.h

1 /*

2 ******************************************************************************

4 ******************************************************************************

5 */

7 #ifndef URBTOK_H

8 #define URBTOK_H

10 #include "unicode/utypes.h"

12 #if !UCONFIG_NO_BREAK_ITERATION

14 #include "unicode/ubrk.h"

15 #include "unicode/parseerr.h"

17 /**

18 * The interfaces here are meant to extend the functionality of the standard

19 * ubrk_* interfaces in ubrk.h to allow for faster batch tokenization. This

20 * was primarily intended for Spotlight and related processes. There are two

21 * versions of these:

22 *

23 * The versions prefixed urbtok_ extend the standard ICU RuleBasedBreakIterator

24 * class. These are intended to fully support all of the current rule syntax used

25 * by that class, and urbtok_tokenize should give results equivalent to a loop using a

26 * combination of the standard functions ubrk_next to get the next break (determining

27 * the length of the previous token) and ubrk_getRuleStatusVec to get a flag value

28 * formed as the bitwise OR of all of the values in the returned vector, skipping all

29 * tokens whose flag value is -1. urbtok_tokenize is faster than such a loop since it

30 * assumes only one pass over the text in the forward direction, and thus skips caching

31 * of break positions and makes other simplifying assumptions. However, it may not be

32 * fast enough for Spotlight.

33 *

34 * Thus we also include the versions prefixed by urbtok57_, which use a legacy ICU 57

35 * version of RuleBasedBreakIterator and an Apple subclass RuleBasedTokenizer. These

36 * versions do not support any RuleBasedBreakIterator rule syntax enhancements from

37 * later than ICU 57.

38 *

39 * The two different sets of functions should not be mixed; urbtok57_getBinaryRules

40 * should only be used with a UBreakIterator created using urbtok57_openRules;

41 * urbtok57_tokenize should only be used with a UBreakIterator created using

42 * urbtok57_openRules or urbtok57_openBinaryRules[NoCopy], etc. Similarly, the

43 * urbtok_ functions should only be used with other urbtok_ functions.

44 */

46 /**

47 * Struct for returning token results: the tokenize functions declared in this

 * header (urbtok_tokenize, urbtok57_tokenize) fill in an array of these, one

 * entry per token.

 * location: presumably the start offset of the token within the text being

 * tokenized -- TODO confirm against the implementation.

 * length: presumably the length of the token, in the same units as location

 * (NOTE(review): likely UTF-16 code units; this header does not say).

48 */

49 typedef struct RuleBasedTokenRange {

50 signed long location;

51 signed long length;

52 } RuleBasedTokenRange;

54 /**

55 * Open a new UBreakIterator for locating text boundaries for a specified locale.

56 * A UBreakIterator may be used for detecting character, line, word,

57 * and sentence breaks in text.

 * The urbtok_ and urbtok57_ function families must not be mixed: an iterator

 * opened here should only be passed to other urbtok_ functions.

58 * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,

59 * UBRK_LINE, UBRK_SENTENCE

60 * @param locale The locale specifying the text-breaking conventions. Note that

61 * locale keys such as "lb" and "ss" may be used to modify text break behavior,

62 * see general discussion of BreakIterator C API.

63 * @param status A UErrorCode to receive any errors.

64 * @return A UBreakIterator for the specified type and locale. Presumably it is

 * released with ubrk_close(), as for the other open functions in this header

 * -- TODO confirm.

65 * @see ubrk_open

66 * @internal

67 */

68 U_INTERNAL UBreakIterator* U_EXPORT2

69 urbtok_open(UBreakIteratorType type,

70 const char *locale,

71 UErrorCode *status);

73 /**

74 * Open a new UBreakIterator for tokenizing text using specified breaking rules.

75 * The rule syntax is that of the standard ICU RuleBasedBreakIterator class; the

 * urbtok_ functions are intended to fully support all of the current rule syntax

 * used by that class. For the legacy ICU 57 rule syntax see urbtok57_openRules.

 * An iterator opened here should only be used with other urbtok_ functions,

 * never with the urbtok57_ functions.

76 * @param rules A set of rules specifying the text breaking conventions.

77 * @param rulesLength The number of characters in rules, or -1 if null-terminated.

78 * @param parseErr Receives position and context information for any syntax errors

79 * detected while parsing the rules.

80 * @param status A UErrorCode to receive any errors.

81 * @return A UBreakIterator for the specified rules.

82 * @see ubrk_open

83 * @internal

84 */

85 U_INTERNAL UBreakIterator* U_EXPORT2

 urbtok_openRules(const UChar     *rules,

87 int32_t rulesLength,

88 UParseError *parseErr,

89 UErrorCode *status);

91 /**

92 * Open a new UBreakIterator for tokenizing text using specified binary (precompiled)

 * breaking rules.

 * No length is passed for the rules, so the size is presumably taken from the

 * binary data itself -- TODO confirm against the implementation.

93 * @param rules A set of rules specifying the text breaking conventions. The binary rules

94 * must be at least 32-bit aligned. Note: This version makes a copy of the

95 * rules, so after calling this function the caller can close or release

96 * the rules that were passed to this function. The copy created by this

97 * call will be freed when ubrk_close() is called on the UBreakIterator*.

 * NOTE(review): presumably these are rules in the form returned by

 * urbtok_getBinaryRules (native-endian) -- confirm.

98 * @param status A UErrorCode to receive any errors.

99 * @return A UBreakIterator for the specified rules; it should only be used with

 * other urbtok_ functions, not with the urbtok57_ functions.

100 * @see ubrk_open

101 * @internal

102 */

103 U_INTERNAL UBreakIterator* U_EXPORT2

 urbtok_openBinaryRules(const uint8_t *rules,

105 UErrorCode *status);

106

107 /**

108 * Open a new UBreakIterator for tokenizing text using specified binary (precompiled)

 * breaking rules, without copying them. Use urbtok_openBinaryRules instead if the

 * rules buffer cannot be kept alive for the lifetime of the iterator.

 * No length is passed for the rules, so the size is presumably taken from the

 * binary data itself -- TODO confirm against the implementation.

109 * @param rules A set of rules specifying the text breaking conventions. The binary rules

110 * must be at least 32-bit aligned. Note: This version does NOT make a copy

111 * of the rules, so after calling this function the caller must not close or

112 * release the rules passed to this function until after they are finished

113 * with this UBreakIterator* (and any others created using the same rules)

114 * and have called ubrk_close() to close the UBreakIterator* (and any others

115 * using the same rules).

116 * @param status A UErrorCode to receive any errors.

117 * @return A UBreakIterator for the specified rules; it should only be used with

 * other urbtok_ functions, not with the urbtok57_ functions.

118 * @see ubrk_open

119 * @internal

120 */

121 U_INTERNAL UBreakIterator* U_EXPORT2

 urbtok_openBinaryRulesNoCopy(const uint8_t *rules,

123 UErrorCode *status);

124

125 /**

126 * Get the (native-endian) binary break rules for this tokenizer.

 * To size the buffer, call once with a null buffer and use the returned size,

 * then call again with a buffer of at least that size.

127 * @param bi The tokenizer to use; it should have been created with a urbtok_

 * open function, not a urbtok57_ one.

128 * @param buffer The output buffer for the rules. You can pass 0 (a null pointer) to get

 * the required size.

129 * @param buffSize The size of the output buffer (presumably in bytes, since buffer

 * is a uint8_t array -- TODO confirm).

130 * @param status A UErrorCode to receive any errors.

131 * @return The actual size of the binary rules, whether they fit the buffer or not.

132 * @internal

133 */

134 U_INTERNAL uint32_t U_EXPORT2

135 urbtok_getBinaryRules(UBreakIterator *bi,

136 uint8_t *buffer,

137 uint32_t buffSize,

138 UErrorCode *status);

139

140 /**

141 * Tokenize text using a rule-based tokenizer.

142 * This is primarily intended for speedy batch tokenization using very simple rules.

143 * It does not currently implement support for all of the features of ICU break rules

144 * (adding that would reduce performance). If you need support for all of the ICU rule

145 * features, please use the standard ubrk_* interfaces; instead of urbtok_tokenize,

146 * use a loop with ubrk_next and ubrk_getRuleStatus.

 * NOTE(review): the overview comment at the top of this header states that the

 * urbtok_ functions are intended to fully support the current rule syntax, which

 * conflicts with the limitation described above; this paragraph may be stale --

 * confirm against the implementation.

147 *

148 * @param bi The tokenizer to use; it should have been created with a urbtok_

 * open function, not a urbtok57_ one.

149 * @param maxTokens The maximum number of tokens to return.

150 * @param outTokens An array of RuleBasedTokenRange to fill in with the tokens

 * (presumably with room for at least maxTokens entries -- TODO confirm).

151 * @param outTokenFlags An (optional) array of unsigned long to fill in with token flags.

 * Note the element type is unsigned long as declared below, not uint32_t,

 * so the element size depends on the platform.

152 * @return The number of tokens returned, 0 if done.

153 * @internal

154 */

155 U_INTERNAL int32_t U_EXPORT2

156 urbtok_tokenize(UBreakIterator *bi,

157 int32_t maxTokens,

158 RuleBasedTokenRange *outTokens,

159 unsigned long *outTokenFlags);

160

161 /**

162 * Swap the endianness of a set of binary break rules.

 * No size is passed for either buffer, so the size of the rules is presumably

 * read from the rules data itself -- TODO confirm against the implementation.

163 * @param rules A set of rules which need swapping (input; not modified through

 * this pointer, which is const).

164 * @param buffer The output buffer for the swapped rules, which must be the same

165 * size as the input rules buffer.

166 * @param inIsBigEndian UBool indicating whether the input is big-endian

167 * @param outIsBigEndian UBool indicating whether the output should be big-endian

168 * @param status A UErrorCode to receive any errors.

169 * @internal

170 */

171 U_INTERNAL void U_EXPORT2

 urbtok_swapBinaryRules(const uint8_t *rules,

173 uint8_t *buffer,

174 UBool inIsBigEndian,

175 UBool outIsBigEndian,

176 UErrorCode *status);

177

178

179

180 /**

181 * Open a new UBreakIterator for tokenizing text using specified breaking rules.

182 * The rule syntax is that of the legacy ICU 57 version of RuleBasedBreakIterator;

 * rule syntax enhancements from later than ICU 57 are not supported. For the

 * current rule syntax see urbtok_openRules.

 * An iterator opened here should only be used with other urbtok57_ functions,

 * never with the urbtok_ functions.

183 * @param rules A set of rules specifying the text breaking conventions.

184 * @param rulesLength The number of characters in rules, or -1 if null-terminated.

185 * @param parseErr Receives position and context information for any syntax errors

186 * detected while parsing the rules.

187 * @param status A UErrorCode to receive any errors.

188 * @return A UBreakIterator for the specified rules.

189 * @see ubrk_open

190 * @internal

191 */

192 U_INTERNAL UBreakIterator* U_EXPORT2

 urbtok57_openRules(const UChar     *rules,

194 int32_t rulesLength,

195 UParseError *parseErr,

196 UErrorCode *status);

197

198 /**

199 * Open a new UBreakIterator for tokenizing text using specified binary (precompiled)

 * breaking rules, using the legacy ICU 57 based tokenizer.

 * No length is passed for the rules, so the size is presumably taken from the

 * binary data itself -- TODO confirm against the implementation.

200 * @param rules A set of rules specifying the text breaking conventions. The binary rules

201 * must be at least 32-bit aligned. Note: This version makes a copy of the

202 * rules, so after calling this function the caller can close or release

203 * the rules that were passed to this function. The copy created by this

204 * call will be freed when ubrk_close() is called on the UBreakIterator*.

 * NOTE(review): presumably these are rules in the form returned by

 * urbtok57_getBinaryRules (native-endian) -- confirm.

205 * @param status A UErrorCode to receive any errors.

206 * @return A UBreakIterator for the specified rules; it should only be used with

 * other urbtok57_ functions, not with the urbtok_ functions.

207 * @see ubrk_open

208 * @internal

209 */

210 U_INTERNAL UBreakIterator* U_EXPORT2

 urbtok57_openBinaryRules(const uint8_t *rules,

212 UErrorCode *status);

213

214 /**

215 * Open a new UBreakIterator for tokenizing text using specified binary (precompiled)

 * breaking rules, using the legacy ICU 57 based tokenizer and without copying the

 * rules. Use urbtok57_openBinaryRules instead if the rules buffer cannot be kept

 * alive for the lifetime of the iterator.

 * No length is passed for the rules, so the size is presumably taken from the

 * binary data itself -- TODO confirm against the implementation.

216 * @param rules A set of rules specifying the text breaking conventions. The binary rules

217 * must be at least 32-bit aligned. Note: This version does NOT make a copy

218 * of the rules, so after calling this function the caller must not close or

219 * release the rules passed to this function until after they are finished

220 * with this UBreakIterator* (and any others created using the same rules)

221 * and have called ubrk_close() to close the UBreakIterator* (and any others

222 * using the same rules).

223 * @param status A UErrorCode to receive any errors.

224 * @return A UBreakIterator for the specified rules; it should only be used with

 * other urbtok57_ functions, not with the urbtok_ functions.

225 * @see ubrk_open

226 * @internal

227 */

228 U_INTERNAL UBreakIterator* U_EXPORT2

 urbtok57_openBinaryRulesNoCopy(const uint8_t *rules,

230 UErrorCode *status);

231

232 /**

233 * Get the (native-endian) binary break rules for this tokenizer.

 * To size the buffer, call once with a null buffer and use the returned size,

 * then call again with a buffer of at least that size.

234 * @param bi The tokenizer to use. Per the overview in this header, this function

 * should only be used with a UBreakIterator created using urbtok57_openRules.

235 * @param buffer The output buffer for the rules. You can pass 0 (a null pointer) to get

 * the required size.

236 * @param buffSize The size of the output buffer (presumably in bytes, since buffer

 * is a uint8_t array -- TODO confirm).

237 * @param status A UErrorCode to receive any errors.

238 * @return The actual size of the binary rules, whether they fit the buffer or not.

239 * @internal

240 */

241 U_INTERNAL uint32_t U_EXPORT2

242 urbtok57_getBinaryRules(UBreakIterator *bi,

243 uint8_t *buffer,

244 uint32_t buffSize,

245 UErrorCode *status);

246

247 /**

248 * Tokenize text using a rule-based tokenizer (legacy ICU 57 based version).

249 * This is primarily intended for speedy batch tokenization using very simple rules.

250 * It does not currently implement support for all of the features of ICU break rules

251 * (adding that would reduce performance). If you need support for all of the ICU rule

252 * features, please use the standard Apple urbtok_tokenize, or a loop with standard

253 * ICU interfaces ubrk_next and ubrk_getRuleStatusVec.

254 *

255 * @param bi The tokenizer to use; it should have been created with a urbtok57_

 * open function, not a urbtok_ one.

256 * @param maxTokens The maximum number of tokens to return.

257 * @param outTokens An array of RuleBasedTokenRange to fill in with the tokens

 * (presumably with room for at least maxTokens entries -- TODO confirm).

258 * @param outTokenFlags An (optional) array of unsigned long to fill in with token flags.

 * Note the element type is unsigned long as declared below, not uint32_t,

 * so the element size depends on the platform.

259 * @return The number of tokens returned, 0 if done.

260 * @internal

261 */

262 U_INTERNAL int32_t U_EXPORT2

263 urbtok57_tokenize(UBreakIterator *bi,

264 int32_t maxTokens,

265 RuleBasedTokenRange *outTokens,

266 unsigned long *outTokenFlags);

267

268 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

269

270 #endif