]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /* |
2 | ****************************************************************************** | |
46f4442e | 3 | * Copyright (C) 2006-2008 Apple Inc. All Rights Reserved. |
73c04bcf A |
4 | ****************************************************************************** |
5 | */ | |
6 | ||
7 | #ifndef URBTOK_H | |
8 | #define URBTOK_H | |
9 | ||
10 | #include "unicode/utypes.h" | |
11 | ||
12 | #if !UCONFIG_NO_BREAK_ITERATION | |
13 | ||
14 | #include "unicode/utext.h" | |
15 | #include "unicode/ubrk.h" | |
16 | #include "unicode/parseerr.h" | |
17 | ||
18 | ||
19 | typedef struct RuleBasedTokenRange { | |
20 | signed long location; | |
21 | signed long length; | |
22 | } RuleBasedTokenRange; | |
23 | ||
24 | /** | |
25 | * Open a new UBreakIterator for tokenizing text using specified breaking rules. | |
26 | * The rule syntax is not yet documented here (TBD). | |
27 | * @param rules A set of rules specifying the text breaking conventions. | |
28 | * @param rulesLength The number of characters in rules, or -1 if null-terminated. | |
29 | * @param parseErr Receives position and context information for any syntax errors | |
30 | * detected while parsing the rules. | |
31 | * @param status A UErrorCode to receive any errors. | |
32 | * @return A UBreakIterator for the specified rules. | |
33 | * @see ubrk_open | |
34 | * @internal | |
35 | */ | |
36 | U_INTERNAL UBreakIterator* U_EXPORT2 | |
37 | urbtok_openRules(const UChar *rules, | |
38 | int32_t rulesLength, | |
39 | UParseError *parseErr, | |
40 | UErrorCode *status); | |
41 | ||
42 | /** | |
43 | * Open a new UBreakIterator for tokenizing text using specified breaking rules. | |
44 | * @param rules A set of rules specifying the text breaking conventions. The binary rules | |
46f4442e A |
45 | * must be at least 32-bit aligned. Note: This version makes a copy of the |
46 | * rules, so after calling this function the caller can close or release | |
47 | * the rules that were passed to this function. The copy created by this | |
48 | * call will be freed when ubrk_close() is called on the UBreakIterator*. | |
73c04bcf A |
49 | * @param status A UErrorCode to receive any errors. |
50 | * @return A UBreakIterator for the specified rules. | |
51 | * @see ubrk_open | |
52 | * @internal | |
53 | */ | |
54 | U_INTERNAL UBreakIterator* U_EXPORT2 | |
55 | urbtok_openBinaryRules(const uint8_t *rules, | |
46f4442e A |
56 | UErrorCode *status); |
57 | ||
58 | /** | |
59 | * Open a new UBreakIterator for tokenizing text using specified breaking rules. | |
60 | * @param rules A set of rules specifying the text breaking conventions. The binary rules | |
61 | * must be at least 32-bit aligned. Note: This version does NOT make a copy | |
62 | * of the rules, so after calling this function the caller must not close or | |
63 | * release the rules passed to this function until after they are finished | |
64 | * with this UBreakIterator* (and any others created using the same rules) | |
65 | * and have called ubrk_close() to close the UBreakIterator* (and any others | |
66 | * using the same rules). | |
67 | * @param status A UErrorCode to receive any errors. | |
68 | * @return A UBreakIterator for the specified rules. | |
69 | * @see ubrk_open | |
70 | * @internal | |
71 | */ | |
72 | U_INTERNAL UBreakIterator* U_EXPORT2 | |
73 | urbtok_openBinaryRulesNoCopy(const uint8_t *rules, | |
73c04bcf A |
74 | UErrorCode *status); |
75 | ||
76 | /** | |
77 | * Get the (native-endian) binary break rules for this tokenizer. | |
78 | * @param bi The tokenizer to use. | |
79 | * @param buffer The output buffer for the rules. You can pass NULL to get the required size. | |
80 | * @param buffSize The size of the output buffer. | |
81 | * @param status A UErrorCode to receive any errors. | |
82 | * @return The actual size of the binary rules, whether they fit the buffer or not. | |
83 | * @internal | |
84 | */ | |
85 | U_INTERNAL uint32_t U_EXPORT2 | |
86 | urbtok_getBinaryRules(UBreakIterator *bi, | |
87 | uint8_t *buffer, | |
88 | uint32_t buffSize, | |
89 | UErrorCode *status); | |
90 | ||
91 | /** | |
92 | * Tokenize text using a rule-based tokenizer. | |
93 | * @param bi The tokenizer to use. | |
94 | * @param maxTokens The maximum number of tokens to return. | |
95 | * @param outTokens An array of RuleBasedTokenRange to fill in with the tokens. | |
96 | * @param outTokenFlags An (optional) array of unsigned long to fill in with token flags. | |
97 | * @return The number of tokens returned, 0 if done. | |
98 | * @internal | |
99 | */ | |
100 | U_INTERNAL int32_t U_EXPORT2 | |
101 | urbtok_tokenize(UBreakIterator *bi, | |
102 | int32_t maxTokens, | |
103 | RuleBasedTokenRange *outTokens, | |
104 | unsigned long *outTokenFlags); | |
105 | ||
106 | /** | |
107 | * Swap the endianness of a set of binary break rules. | |
108 | * @param rules A set of rules which need swapping. | |
109 | * @param buffer The output buffer for the swapped rules, which must be the same | |
110 | * size as the input rules buffer. | |
111 | * @param inIsBigEndian UBool indicating whether the input is big-endian. | |
112 | * @param outIsBigEndian UBool indicating whether the output should be big-endian. | |
113 | * @param status A UErrorCode to receive any errors. | |
114 | * @internal | |
115 | */ | |
116 | U_INTERNAL void U_EXPORT2 | |
117 | urbtok_swapBinaryRules(const uint8_t *rules, | |
118 | uint8_t *buffer, | |
119 | UBool inIsBigEndian, | |
120 | UBool outIsBigEndian, | |
121 | UErrorCode *status); | |
122 | ||
123 | ||
124 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | |
125 | ||
126 | #endif |