]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/unicode/urbtok.h
ICU-8.11.tar.gz
[apple/icu.git] / icuSources / common / unicode / urbtok.h
1 /*
2 ******************************************************************************
3 * Copyright (C) 2006, Apple Computer, Inc.
4 * All Rights Reserved.
5 ******************************************************************************
6 */
7
8 #ifndef URBTOK_H
9 #define URBTOK_H
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_BREAK_ITERATION
14
15 #include "unicode/utext.h"
16 #include "unicode/ubrk.h"
17 #include "unicode/parseerr.h"
18
19
20 typedef struct RuleBasedTokenRange { /* Extent of one token, as filled in by urbtok_tokenize(). */
21 signed long location; /* Where the token starts; presumably an offset into the text being tokenized -- TODO confirm units. */
22 signed long length; /* How long the token is; presumably in the same units as location -- TODO confirm. */
23 } RuleBasedTokenRange;
24
25 /**
26 * Open a new UBreakIterator for tokenizing text, built by parsing source-form breaking rules.
27 * The rule syntax is not documented here yet (TBD).
28 * @param rules The breaking rules, as UChar text, specifying the text breaking conventions.
29 * @param rulesLength The number of UChars in rules, or -1 if rules is null-terminated.
30 * @param parseErr Receives position and context information for any syntax errors
31 * detected while parsing the rules.
32 * @param status A UErrorCode to receive any errors.
33 * @return A UBreakIterator for the specified rules. NOTE(review): presumably released with ubrk_close() -- confirm.
34 * @see ubrk_open
35 * @internal
36 */
37 U_INTERNAL UBreakIterator* U_EXPORT2
38 urbtok_openRules(const UChar *rules,
39 int32_t rulesLength,
40 UParseError *parseErr,
41 UErrorCode *status);
42
43 /**
44 * Open a new UBreakIterator for tokenizing text using breaking rules supplied in binary form.
45 * @param rules The binary rules specifying the text breaking conventions. The binary rules
46 * must be at least 32-bit aligned. No length is passed in. NOTE(review): presumably this is the format produced by urbtok_getBinaryRules(); whether the buffer must outlive the iterator is not stated -- confirm.
47 * @param status A UErrorCode to receive any errors.
48 * @return A UBreakIterator for the specified rules.
49 * @see ubrk_open
50 * @internal
51 */
52 U_INTERNAL UBreakIterator* U_EXPORT2
53 urbtok_openBinaryRules(const uint8_t *rules,
54 UErrorCode *status);
55
56 /**
57 * Get the (native-endian) binary break rules for this tokenizer.
58 * @param bi The tokenizer whose rules are wanted.
59 * @param buffer The output buffer for the rules. Pass 0 (a null pointer) to just get the required size.
60 * @param buffSize The size of the output buffer, in bytes.
61 * @param status A UErrorCode to receive any errors.
62 * @return The actual size of the binary rules in bytes, whether or not they fit the buffer.
63 * @internal
64 */
65 U_INTERNAL uint32_t U_EXPORT2
66 urbtok_getBinaryRules(UBreakIterator *bi,
67 uint8_t *buffer,
68 uint32_t buffSize,
69 UErrorCode *status);
70
71 /**
72 * Tokenize text using a rule-based tokenizer. Note that this function takes no UErrorCode.
73 * @param bi The tokenizer to use.
74 * @param maxTokens The maximum number of tokens to return.
75 * @param outTokens An array of RuleBasedTokenRange to fill in with the tokens; presumably it must have room for maxTokens entries -- TODO confirm.
76 * @param outTokenFlags An (optional) array of unsigned long to fill in with token flags.
77 * @return The number of tokens returned, 0 if done.
78 * @internal
79 */
80 U_INTERNAL int32_t U_EXPORT2
81 urbtok_tokenize(UBreakIterator *bi,
82 int32_t maxTokens,
83 RuleBasedTokenRange *outTokens,
84 unsigned long *outTokenFlags);
85
86 /**
87 * Swap the endianness of a set of binary break rules. No size parameter is passed for either buffer.
88 * @param rules The binary rules which need swapping.
89 * @param buffer The output buffer for the swapped rules, which must be the same
90 * size as the input rules buffer.
91 * @param inIsBigEndian UBool indicating whether the input is big-endian.
92 * @param outIsBigEndian UBool indicating whether the output should be big-endian.
93 * @param status A UErrorCode to receive any errors.
94 * @internal
95 */
96 U_INTERNAL void U_EXPORT2
97 urbtok_swapBinaryRules(const uint8_t *rules,
98 uint8_t *buffer,
99 UBool inIsBigEndian,
100 UBool outIsBigEndian,
101 UErrorCode *status);
102
103
104 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
105
106 #endif