2 ******************************************************************************
3 * Copyright (C) 2006, Apple Computer, Inc.
5 ******************************************************************************
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_BREAK_ITERATION
15 #include "unicode/utext.h"
16 #include "unicode/ubrk.h"
17 #include "unicode/parseerr.h"
/*
 * Token range record: urbtok_tokenize() takes a pointer to an array of these
 * (its outTokens parameter) and fills it in with the tokens it finds.
 * NOTE(review): no member declarations are visible between the braces in this
 * view of the header -- confirm the field layout against the complete file.
 */
20 typedef struct RuleBasedTokenRange
{
23 } RuleBasedTokenRange
;
26 * Open a new UBreakIterator for tokenizing text using specified breaking rules.
27 * The rule syntax is ... (TBD)
28 * @param rules A set of rules specifying the text breaking conventions.
29 * @param rulesLength The number of characters in rules, or -1 if null-terminated.
30 * @param parseErr Receives position and context information for any syntax errors
31 * detected while parsing the rules.
32 * @param status A UErrorCode to receive any errors.
33 * @return A UBreakIterator for the specified rules.
37 U_INTERNAL UBreakIterator
* U_EXPORT2
38 urbtok_openRules(const UChar
*rules
,
40 UParseError
*parseErr
,
44 * Open a new UBreakIterator for tokenizing text using specified breaking rules.
45 * @param rules A set of rules specifying the text breaking conventions. The binary rules
46 * must be at least 32-bit aligned.
47 * @param status A UErrorCode to receive any errors.
48 * @return A UBreakIterator for the specified rules.
52 U_INTERNAL UBreakIterator
* U_EXPORT2
53 urbtok_openBinaryRules(const uint8_t *rules
,
57 * Get the (native-endian) binary break rules for this tokenizer.
58 * @param bi The tokenizer to use.
59 * @param buffer The output buffer for the rules. You can pass 0 to get the required size.
60 * @param buffSize The size of the output buffer.
61 * @param status A UErrorCode to receive any errors.
62 * @return The actual size of the binary rules, whether they fit the buffer or not.
65 U_INTERNAL
uint32_t U_EXPORT2
66 urbtok_getBinaryRules(UBreakIterator
*bi
,
72 * Tokenize text using a rule-based tokenizer.
73 * @param bi The tokenizer to use.
74 * @param maxTokens The maximum number of tokens to return.
75 * @param outTokens An array of RuleBasedTokenRange to fill in with the tokens.
76 * @param outTokenFlags An (optional) array of unsigned long to fill in with token flags.
77 * @return The number of tokens returned, 0 if done.
/*
 * Declaration of urbtok_tokenize. It returns an int32_t and, as declared here,
 * takes the tokenizer (bi), an array of RuleBasedTokenRange to fill in
 * (outTokens) and an array of unsigned long to fill in (outTokenFlags).
 * NOTE(review): the maxTokens parameter described in the text above does not
 * appear in the parameter list as seen here -- confirm against the complete
 * header before relying on this signature.
 */
80 U_INTERNAL
int32_t U_EXPORT2
81 urbtok_tokenize(UBreakIterator
*bi
,
83 RuleBasedTokenRange
*outTokens
,
84 unsigned long *outTokenFlags
);
87 * Swap the endianness of a set of binary break rules.
88 * @param rules A set of rules which need swapping.
89 * @param buffer The output buffer for the swapped rules, which must be the same
90 * size as the input rules buffer.
91 * @param inIsBigEndian UBool indicating whether the input is big-endian
92 * @param outIsBigEndian UBool indicating whether the output should be big-endian
93 * @param status A UErrorCode to receive any errors.
96 U_INTERNAL
void U_EXPORT2
97 urbtok_swapBinaryRules(const uint8_t *rules
,
100 UBool outIsBigEndian
,
104 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */