]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/unicode/urbtok.h
ICU-531.30.tar.gz
[apple/icu.git] / icuSources / common / unicode / urbtok.h
CommitLineData
73c04bcf
A
1/*
2******************************************************************************
46f4442e 3* Copyright (C) 2006-2008 Apple Inc. All Rights Reserved.
73c04bcf
A
4******************************************************************************
5*/
6
7#ifndef URBTOK_H
8#define URBTOK_H
9
10#include "unicode/utypes.h"
11
12#if !UCONFIG_NO_BREAK_ITERATION
13
14#include "unicode/utext.h"
15#include "unicode/ubrk.h"
16#include "unicode/parseerr.h"
17
18
19typedef struct RuleBasedTokenRange { /* Range of one token; urbtok_tokenize() fills in an array of these. */
20 signed long location; /* NOTE(review): presumably where the token starts in the text; units are not shown here -- confirm. */
21 signed long length; /* NOTE(review): presumably the token's extent from location, in the same units -- confirm. */
22} RuleBasedTokenRange;
23
24/**
25 * Open a new UBreakIterator for tokenizing text using specified breaking rules.
26 * The rule syntax is not yet documented here (TBD).
27 * @param rules A set of rules, as UChar text, specifying the text breaking conventions.
28 * @param rulesLength The number of characters in rules, or -1 if null-terminated.
29 * @param parseErr Receives position and context information for any syntax errors
30 * detected while parsing the rules.
31 * @param status A UErrorCode to receive any errors.
32 * @return A UBreakIterator for the specified rules. NOTE(review): presumably closed with ubrk_close(), as documented for urbtok_openBinaryRules() -- confirm.
33 * @see ubrk_open
34 * @internal
35 */
36U_INTERNAL UBreakIterator* U_EXPORT2
37urbtok_openRules(const UChar *rules,
38 int32_t rulesLength,
39 UParseError *parseErr,
40 UErrorCode *status);
41
42/**
43 * Open a new UBreakIterator for tokenizing text using specified binary breaking rules.
44 * @param rules A set of binary rules specifying the text breaking conventions. The binary rules
46f4442e
A
45 * must be at least 32-bit aligned. Note: This version makes a copy of the
46 * rules, so after calling this function the caller can close or release
47 * the rules that were passed to this function. The copy created by this
48 * call will be freed when ubrk_close() is called on the UBreakIterator*. See urbtok_openBinaryRulesNoCopy() for the variant that does not copy.
73c04bcf
A
49 * @param status A UErrorCode to receive any errors.
50 * @return A UBreakIterator for the specified rules.
51 * @see ubrk_open
52 * @internal
53 */
54U_INTERNAL UBreakIterator* U_EXPORT2
55urbtok_openBinaryRules(const uint8_t *rules,
46f4442e
A
56 UErrorCode *status);
57
58/**
59 * Open a new UBreakIterator for tokenizing text using specified binary breaking rules, without copying them.
60 * @param rules A set of binary rules specifying the text breaking conventions. The binary rules
61 * must be at least 32-bit aligned. Note: This version does NOT make a copy
62 * of the rules, so after calling this function the caller must not close or
63 * release the rules passed to this function until after they are finished
64 * with this UBreakIterator* (and any others created using the same rules)
65 * and have called ubrk_close() to close the UBreakIterator* (and any others
66 * using the same rules). See urbtok_openBinaryRules() for the variant that copies the rules.
67 * @param status A UErrorCode to receive any errors.
68 * @return A UBreakIterator for the specified rules.
69 * @see ubrk_open
70 * @internal
71 */
72U_INTERNAL UBreakIterator* U_EXPORT2
73urbtok_openBinaryRulesNoCopy(const uint8_t *rules,
73c04bcf
A
74 UErrorCode *status);
75
76/**
77 * Get the (native-endian) binary break rules for this tokenizer.
78 * @param bi The tokenizer to use.
79 * @param buffer The output buffer for the rules. You can pass 0 (a null pointer) to get the required size.
80 * @param buffSize The size of the output buffer (NOTE(review): presumably in bytes, since buffer is uint8_t* -- confirm).
81 * @param status A UErrorCode to receive any errors.
82 * @return The actual size of the binary rules, whether they fit the buffer or not.
83 * @internal
84 */
85U_INTERNAL uint32_t U_EXPORT2
86urbtok_getBinaryRules(UBreakIterator *bi,
87 uint8_t *buffer,
88 uint32_t buffSize,
89 UErrorCode *status);
90
91/**
92 * Tokenize text using a rule-based tokenizer.
93 * @param bi The tokenizer to use.
94 * @param maxTokens The maximum number of tokens to return.
95 * @param outTokens An array of RuleBasedTokenRange to fill in with the tokens (NOTE(review): presumably must hold at least maxTokens entries -- confirm).
96 * @param outTokenFlags An (optional) array of unsigned long to fill in with token flags (NOTE(review): "optional" presumably means a null pointer is accepted -- confirm).
97 * @return The number of tokens returned, 0 if done.
98 * @internal
99 */
100U_INTERNAL int32_t U_EXPORT2
101urbtok_tokenize(UBreakIterator *bi,
102 int32_t maxTokens,
103 RuleBasedTokenRange *outTokens,
104 unsigned long *outTokenFlags);
105
106/**
107 * Swap the endianness of a set of binary break rules.
108 * @param rules A set of binary rules which need swapping.
109 * @param buffer The output buffer for the swapped rules, which must be the same
110 * size as the input rules buffer (no size parameter is passed).
111 * @param inIsBigEndian UBool indicating whether the input is big-endian.
112 * @param outIsBigEndian UBool indicating whether the output should be big-endian.
113 * @param status A UErrorCode to receive any errors.
114 * @internal
115 */
116U_INTERNAL void U_EXPORT2
117urbtok_swapBinaryRules(const uint8_t *rules,
118 uint8_t *buffer,
119 UBool inIsBigEndian,
120 UBool outIsBigEndian,
121 UErrorCode *status);
122
123
124#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
125
126#endif