]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/unicode/urbtok.h
ICU-59180.0.1.tar.gz
[apple/icu.git] / icuSources / common / unicode / urbtok.h
1 /*
2 ******************************************************************************
3 * Copyright (C) 2006-2008, 2017 Apple Inc. All Rights Reserved.
4 ******************************************************************************
5 */
6
7 #ifndef URBTOK_H
8 #define URBTOK_H
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_BREAK_ITERATION
13
14 #include "unicode/utext.h"
15 #include "unicode/ubrk.h"
16 #include "unicode/parseerr.h"
17
18 /**
19 * The interfaces here are meant to extend the functionality of the standard
20 * ubrk_* interfaces in ubrk.h to allow for faster batch tokenization. This
21 * was primarily intended for Spotlight and related processes. Note that
22 * urbtok_tokenize here does not fully support all features of ICU break rules.
23 */
24
25 typedef struct RuleBasedTokenRange { /* element type of the outTokens array filled in by urbtok_tokenize */
26 signed long location; /* presumably the start offset of the token in the text -- TODO confirm units */
27 signed long length; /* presumably the token length, in the same units as location -- TODO confirm */
28 } RuleBasedTokenRange;
29
30 /**
31 * Open a new UBreakIterator for tokenizing text using specified breaking rules.
32 * The rule syntax is not documented here yet (TBD); presumably ICU break-rule syntax -- confirm.
33 * @param rules A set of rules specifying the text breaking conventions.
34 * @param rulesLength The number of UChars in rules, or -1 if null-terminated.
35 * @param parseErr Receives position and context information for any syntax errors
36 * detected while parsing the rules.
37 * @param status A UErrorCode to receive any errors.
38 * @return A UBreakIterator for the specified rules.
39 * @see ubrk_open
40 * @internal
41 */
42 U_INTERNAL UBreakIterator* U_EXPORT2
43 urbtok_openRules(const UChar *rules,
44 int32_t rulesLength,
45 UParseError *parseErr,
46 UErrorCode *status);
47
48 /**
49 * Open a new UBreakIterator for tokenizing text using binary breaking rules (copied by this call).
50 * @param rules A set of rules specifying the text breaking conventions. The binary rules
51 * must be at least 32-bit aligned. Note: This version makes a copy of the
52 * rules, so after calling this function the caller can close or release
53 * the rules that were passed to this function. The copy created by this
54 * call will be freed when ubrk_close() is called on the UBreakIterator*.
55 * @param status A UErrorCode to receive any errors.
56 * @return A UBreakIterator for the specified rules.
57 * @see ubrk_open
58 * @internal
59 */
60 U_INTERNAL UBreakIterator* U_EXPORT2
61 urbtok_openBinaryRules(const uint8_t *rules,
62 UErrorCode *status);
63
64 /**
65 * Open a new UBreakIterator for tokenizing text using binary breaking rules (NOT copied by this call).
66 * @param rules A set of rules specifying the text breaking conventions. The binary rules
67 * must be at least 32-bit aligned. Note: This version does NOT make a copy
68 * of the rules, so after calling this function the caller must not close or
69 * release the rules passed to this function until after they are finished
70 * with this UBreakIterator* (and any others created using the same rules)
71 * and have called ubrk_close() to close the UBreakIterator* (and any others
72 * using the same rules).
73 * @param status A UErrorCode to receive any errors.
74 * @return A UBreakIterator for the specified rules.
75 * @see ubrk_open
76 * @internal
77 */
78 U_INTERNAL UBreakIterator* U_EXPORT2
79 urbtok_openBinaryRulesNoCopy(const uint8_t *rules,
80 UErrorCode *status);
81
82 /**
83 * Get the (native-endian) binary break rules for this tokenizer.
84 * @param bi The tokenizer to use.
85 * @param buffer The output buffer for the rules. You can pass 0 (NULL) to get the required size.
86 * @param buffSize The size of the output buffer, in bytes.
87 * @param status A UErrorCode to receive any errors.
88 * @return The actual size of the binary rules, whether they fit the buffer or not.
89 * @internal
90 */
91 U_INTERNAL uint32_t U_EXPORT2
92 urbtok_getBinaryRules(UBreakIterator *bi,
93 uint8_t *buffer,
94 uint32_t buffSize,
95 UErrorCode *status);
96
97 /**
98 * Tokenize text using a rule-based tokenizer.
99 * This is primarily intended for speedy batch tokenization using very simple rules.
100 * It does not currently implement support for all of the features of ICU break rules
101 * (adding that would reduce performance). If you need support for all of the ICU rule
102 * features, please use the standard ubrk_* interfaces; instead of urbtok_tokenize,
103 * use a loop with ubrk_next and ubrk_getRuleStatus.
104 *
105 * @param bi The tokenizer to use.
106 * @param maxTokens The maximum number of tokens to return.
107 * @param outTokens An array of RuleBasedTokenRange to fill in with the tokens.
108 * @param outTokenFlags An (optional) array of unsigned long to fill in with token flags.
109 * @return The number of tokens returned, 0 if done.
110 * @internal
111 */
112 U_INTERNAL int32_t U_EXPORT2
113 urbtok_tokenize(UBreakIterator *bi,
114 int32_t maxTokens,
115 RuleBasedTokenRange *outTokens,
116 unsigned long *outTokenFlags);
117
118 /**
119 * Swap the endianness of a set of binary break rules.
120 * @param rules The input buffer holding the binary rules which need swapping.
121 * @param buffer The output buffer for the swapped rules, which must be the same
122 * size as the input rules buffer.
123 * @param inIsBigEndian UBool indicating whether the input is big-endian
124 * @param outIsBigEndian UBool indicating whether the output should be big-endian
125 * @param status A UErrorCode to receive any errors.
126 * @internal
127 */
128 U_INTERNAL void U_EXPORT2
129 urbtok_swapBinaryRules(const uint8_t *rules,
130 uint8_t *buffer,
131 UBool inIsBigEndian,
132 UBool outIsBigEndian,
133 UErrorCode *status);
134
135
136 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
137
138 #endif