git.saurik.com Git - apple/icu.git/blame - icuSources/common/unicode/urbtok.h

Commit	Line	Data
73c04bcf A	1	/*
73c04bcf A	2	******************************************************************************
0f5d89e8	3	* Copyright (C) 2006-2008, 2017-2018 Apple Inc. All Rights Reserved.
73c04bcf A	4	******************************************************************************
	5	*/
	6
	7	#ifndef URBTOK_H
	8	#define URBTOK_H
	9
	10	#include "unicode/utypes.h"
	11
	12	#if !UCONFIG_NO_BREAK_ITERATION
	13
73c04bcf A	14	#include "unicode/ubrk.h"
	15	#include "unicode/parseerr.h"
	16
f3c0d7a5 A	17	/**
	18	* The interfaces here are meant to extend the functionality of the standard
	19	* ubrk_* interfaces in ubrk.h to allow for faster batch tokenization. This
0f5d89e8 A	20	* was primarily intended for Spotlight and related processes. There are two
	21	* versions of these:
	22	*
	23	* The versions prefixed urbtok_ extend the standard ICU RuleBasedBreakIterator
	24	* class. These are intended to fully support all of the current rule syntax used
	25	* by that class, and urbtok_tokenize should give results equivalent to a loop using a
	26	* combination of the standard functions ubrk_next to get the next break (determining
	27	* the length of the previous token) and ubrk_getRuleStatusVec to get a flag value
	28	* formed as the bitwise OR of all of the values in the returned vector, skipping all
	29	* tokens whose flag value is -1. urbtok_tokenize is faster than such a loop since it
	30	* assumes only one pass over the text in the forward direction, and thus skips caching
	31	* of break positions and makes other simplifying assumptions. However, it may not be
	32	* fast enough for Spotlight.
	33	*
	34	* Thus we also include the versions prefixed by urbtok57_, which use a legacy ICU 57
	35	* version of RuleBasedBreakIterator and an Apple subclass RuleBasedTokenizer. These
	36	* versions do not support any RuleBasedBreakIterator rule syntax enhancements from
	37	* later than ICU 57.
	38	*
	39	* The two different sets of functions should not be mixed; urbtok57_getBinaryRules
	40	* should only be used with a UBreakIterator created using urbtok57_openRules;
	41	* urbtok57_tokenize should only be used with a UBreakIterator created using
	42	* urbtok57_openRules or urbtok57_openBinaryRules[NoCopy], etc. Similarly, the
	43	* urbtok_ functions should only be used with other urbtok_ functions.
	44	*/
	45
	46	/**
	47	* Struct for returning token results: the location and length of one token.
f3c0d7a5	48	*/
73c04bcf A	49	typedef struct RuleBasedTokenRange {
	50	signed long location;
	51	signed long length;
	52	} RuleBasedTokenRange;
	53
0f5d89e8 A	54	/**
	55	* Open a new UBreakIterator for locating text boundaries for a specified locale.
	56	* A UBreakIterator may be used for detecting character, line, word,
	57	* and sentence breaks in text.
	58	* @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
	59	* UBRK_LINE, UBRK_SENTENCE
	60	* @param locale The locale specifying the text-breaking conventions. Note that
	61	* locale keys such as "lb" and "ss" may be used to modify text break behavior,
	62	* see general discussion of BreakIterator C API.
	63	* @param status A UErrorCode to receive any errors.
	64	* @return A UBreakIterator for the specified type and locale.
	65	* @see ubrk_open
	66	* @internal
	67	*/
	68	U_INTERNAL UBreakIterator* U_EXPORT2
	69	urbtok_open(UBreakIteratorType type,
	70	const char *locale,
	71	UErrorCode *status);
	72
73c04bcf A	73	/**
	74	* Open a new UBreakIterator for tokenizing text using specified breaking rules.
	75	* The rule syntax is ... (TBD)
	76	* @param rules A set of rules specifying the text breaking conventions.
	77	* @param rulesLength The number of characters in rules, or -1 if null-terminated.
	78	* @param parseErr Receives position and context information for any syntax errors
	79	* detected while parsing the rules.
	80	* @param status A UErrorCode to receive any errors.
	81	* @return A UBreakIterator for the specified rules.
	82	* @see ubrk_open
	83	* @internal
	84	*/
	85	U_INTERNAL UBreakIterator* U_EXPORT2
	86	urbtok_openRules(const UChar *rules,
	87	int32_t rulesLength,
	88	UParseError *parseErr,
	89	UErrorCode *status);
	90
	91	/**
	92	* Open a new UBreakIterator for tokenizing text using specified breaking rules.
	93	* @param rules A set of rules specifying the text breaking conventions. The binary rules
46f4442e A	94	* must be at least 32-bit aligned. Note: This version makes a copy of the
	95	* rules, so after calling this function the caller can close or release
	96	* the rules that were passed to this function. The copy created by this
	97	* call will be freed when ubrk_close() is called on the UBreakIterator*.
73c04bcf A	98	* @param status A UErrorCode to receive any errors.
	99	* @return A UBreakIterator for the specified rules.
	100	* @see ubrk_open
	101	* @internal
	102	*/
	103	U_INTERNAL UBreakIterator* U_EXPORT2
	104	urbtok_openBinaryRules(const uint8_t *rules,
46f4442e A	105	UErrorCode *status);
	106
	107	/**
	108	* Open a new UBreakIterator for tokenizing text using specified breaking rules.
	109	* @param rules A set of rules specifying the text breaking conventions. The binary rules
	110	* must be at least 32-bit aligned. Note: This version does NOT make a copy
	111	* of the rules, so after calling this function the caller must not close or
	112	* release the rules passed to this function until after they are finished
	113	* with this UBreakIterator* (and any others created using the same rules)
	114	* and have called ubrk_close() to close the UBreakIterator* (and any others
	115	* using the same rules).
	116	* @param status A UErrorCode to receive any errors.
	117	* @return A UBreakIterator for the specified rules.
	118	* @see ubrk_open
	119	* @internal
	120	*/
	121	U_INTERNAL UBreakIterator* U_EXPORT2
	122	urbtok_openBinaryRulesNoCopy(const uint8_t *rules,
73c04bcf A	123	UErrorCode *status);
	124
	125	/**
	126	* Get the (native-endian) binary break rules for this tokenizer.
	127	* @param bi The tokenizer to use.
	128	* @param buffer The output buffer for the rules. You can pass 0 to get the required size.
	129	* @param buffSize The size of the output buffer.
	130	* @param status A UErrorCode to receive any errors.
	131	* @return The actual size of the binary rules, whether they fit the buffer or not.
	132	* @internal
	133	*/
	134	U_INTERNAL uint32_t U_EXPORT2
	135	urbtok_getBinaryRules(UBreakIterator *bi,
	136	uint8_t *buffer,
	137	uint32_t buffSize,
	138	UErrorCode *status);
	139
	140	/**
	141	* Tokenize text using a rule-based tokenizer.
f3c0d7a5 A	142	* This is primarily intended for speedy batch tokenization using very simple rules.
	143	* It does not currently implement support for all of the features of ICU break rules
	144	* (adding that would reduce performance). If you need support for all of the ICU rule
	145	* features, please use the standard ubrk_* interfaces; instead of urbtok_tokenize,
	146	* use a loop with ubrk_next and ubrk_getRuleStatus.
	147	* NOTE(review): the overview in this header says urbtok_ supports all rule syntax; the text above may be stale - confirm.
73c04bcf A	148	* @param bi The tokenizer to use.
	149	* @param maxTokens The maximum number of tokens to return.
	150	* @param outTokens An array of RuleBasedTokenRange to fill in with the tokens.
	151	* @param outTokenFlags An (optional) array of unsigned long to fill in with token flags.
	152	* @return The number of tokens returned, 0 if done.
	153	* @internal
	154	*/
	155	U_INTERNAL int32_t U_EXPORT2
	156	urbtok_tokenize(UBreakIterator *bi,
	157	int32_t maxTokens,
	158	RuleBasedTokenRange *outTokens,
	159	unsigned long *outTokenFlags);
	160
	161	/**
	162	* Swap the endianness of a set of binary break rules.
	163	* @param rules A set of rules which need swapping.
	164	* @param buffer The output buffer for the swapped rules, which must be the same
	165	* size as the input rules buffer.
	166	* @param inIsBigEndian UBool indicating whether the input is big-endian
	167	* @param outIsBigEndian UBool indicating whether the output should be big-endian
	168	* @param status A UErrorCode to receive any errors.
	169	* @internal
	170	*/
	171	U_INTERNAL void U_EXPORT2
	172	urbtok_swapBinaryRules(const uint8_t *rules,
	173	uint8_t *buffer,
	174	UBool inIsBigEndian,
	175	UBool outIsBigEndian,
	176	UErrorCode *status);
	177
	178
0f5d89e8 A	179
	180	/**
	181	* Open a new UBreakIterator for tokenizing text using specified breaking rules.
	182	* The rule syntax is ... (TBD)
	183	* @param rules A set of rules specifying the text breaking conventions.
	184	* @param rulesLength The number of characters in rules, or -1 if null-terminated.
	185	* @param parseErr Receives position and context information for any syntax errors
	186	* detected while parsing the rules.
	187	* @param status A UErrorCode to receive any errors.
	188	* @return A UBreakIterator for the specified rules.
	189	* @see ubrk_open
	190	* @internal
	191	*/
	192	U_INTERNAL UBreakIterator* U_EXPORT2
	193	urbtok57_openRules(const UChar *rules,
	194	int32_t rulesLength,
	195	UParseError *parseErr,
	196	UErrorCode *status);
	197
	198	/**
	199	* Open a new UBreakIterator for tokenizing text using specified breaking rules.
	200	* @param rules A set of rules specifying the text breaking conventions. The binary rules
	201	* must be at least 32-bit aligned. Note: This version makes a copy of the
	202	* rules, so after calling this function the caller can close or release
	203	* the rules that were passed to this function. The copy created by this
	204	* call will be freed when ubrk_close() is called on the UBreakIterator*.
	205	* @param status A UErrorCode to receive any errors.
	206	* @return A UBreakIterator for the specified rules.
	207	* @see ubrk_open
	208	* @internal
	209	*/
	210	U_INTERNAL UBreakIterator* U_EXPORT2
	211	urbtok57_openBinaryRules(const uint8_t *rules,
	212	UErrorCode *status);
	213
	214	/**
	215	* Open a new UBreakIterator for tokenizing text using specified breaking rules.
	216	* @param rules A set of rules specifying the text breaking conventions. The binary rules
	217	* must be at least 32-bit aligned. Note: This version does NOT make a copy
	218	* of the rules, so after calling this function the caller must not close or
	219	* release the rules passed to this function until after they are finished
	220	* with this UBreakIterator* (and any others created using the same rules)
	221	* and have called ubrk_close() to close the UBreakIterator* (and any others
	222	* using the same rules).
	223	* @param status A UErrorCode to receive any errors.
	224	* @return A UBreakIterator for the specified rules.
	225	* @see ubrk_open
	226	* @internal
	227	*/
	228	U_INTERNAL UBreakIterator* U_EXPORT2
	229	urbtok57_openBinaryRulesNoCopy(const uint8_t *rules,
	230	UErrorCode *status);
	231
	232	/**
	233	* Get the (native-endian) binary break rules for this tokenizer.
	234	* @param bi The tokenizer to use.
	235	* @param buffer The output buffer for the rules. You can pass 0 to get the required size.
	236	* @param buffSize The size of the output buffer.
	237	* @param status A UErrorCode to receive any errors.
	238	* @return The actual size of the binary rules, whether they fit the buffer or not.
	239	* @internal
	240	*/
	241	U_INTERNAL uint32_t U_EXPORT2
	242	urbtok57_getBinaryRules(UBreakIterator *bi,
243	uint8_t *buffer,
244	uint32_t buffSize,
245	UErrorCode *status);
246
247	/**
248	* Tokenize text using a rule-based tokenizer.
249	* This is primarily intended for speedy batch tokenization using very simple rules.
250	* It does not currently implement support for all of the features of ICU break rules
251	* (adding that would reduce performance). If you need support for all of the ICU rule
252	* features, please use the standard Apple urbtok_tokenize, or a loop with standard
253	* ICU interfaces ubrk_next and ubrk_getRuleStatusVec.
254	*
255	* @param bi The tokenizer to use.
256	* @param maxTokens The maximum number of tokens to return.
257	* @param outTokens An array of RuleBasedTokenRange to fill in with the tokens.
258	* @param outTokenFlags An (optional) array of unsigned long to fill in with token flags.
259	* @return The number of tokens returned, 0 if done.
260	* @internal
261	*/
262	U_INTERNAL int32_t U_EXPORT2
263	urbtok57_tokenize(UBreakIterator *bi,
264	int32_t maxTokens,
265	RuleBasedTokenRange *outTokens,
266	unsigned long *outTokenFlags);
267
73c04bcf A	268	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
	269
	270	#endif