[apple/icu.git] / icuSources / tools / toolutil / uparse.h

/*
*******************************************************************************
*
*   Copyright (C) 2000-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  uparse.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2000apr18
*   created by: Markus W. Scherer
*
*   This file provides a parser for files that are delimited by one single
*   character like ';' or TAB. Example: the Unicode Character Properties files
*   like UnicodeData.txt are semicolon-delimited.
*/

#ifndef __UPARSE_H__
#define __UPARSE_H__

#include "unicode/utypes.h"

/**
 * Is c an invariant-character whitespace?
 *
 * Evaluates to non-zero if c compares equal to space ' ', TAB '\t',
 * CR '\r' or LF '\n', and to zero otherwise.
 *
 * Note: this is a function-like macro in which c appears four times, so the
 * argument is evaluated between one and four times (the || chain
 * short-circuits). Do not pass an expression with side effects such as *p++.
 *
 * @param c invariant character
 */
#define U_IS_INV_WHITESPACE(c) ((c)==' ' || (c)=='\t' || (c)=='\r' || (c)=='\n')

U_CDECL_BEGIN

/**
 * Skip space ' ' and TAB '\t' characters.
 *
 * NOTE(review): U_IS_INV_WHITESPACE() in this header also treats '\r' and
 * '\n' as whitespace. Confirm in the implementation whether this function
 * skips those as well, and adjust this description if it does.
 *
 * @param s Pointer to characters; presumably a NUL-terminated string so that
 *          the scan has a stopping point -- TODO confirm.
 * @return Pointer to first character at or after s that is not a space or TAB.
 */
U_CAPI const char * U_EXPORT2
u_skipWhitespace(const char *s);

/**
 * Trim whitespace (including line endings) from the end of the string.
 *
 * NOTE(review): s is not const, so the string is presumably shortened in
 * place (e.g. by writing a NUL at the new end) -- confirm in the
 * implementation before relying on it.
 *
 * @param s Pointer to the string.
 * @return Pointer to the new end of the string.
 */
U_CAPI char * U_EXPORT2
u_rtrim(char *s);

/**
 * Function type for u_parseDelimitedFile().
 * A function of this type is called by the parser for each line, after the
 * line has been segmented into fields.
 *
 * @param context    The same context pointer that was passed to
 *                   u_parseDelimitedFile().
 * @param fields     Array of field start and limit pointers for the current
 *                   line: fields[i][0] points to the beginning of field i,
 *                   fields[i][1] points behind it (to the delimiter or the
 *                   line end). The function may modify the field contents,
 *                   including the limit characters.
 * @param fieldCount Number of entries in fields -- presumably the value that
 *                   was passed to u_parseDelimitedFile(); TODO confirm.
 * @param pErrorCode ICU error code. If the function sets an error here, the
 *                   parser returns with *pErrorCode set to an error code.
 */
typedef void U_CALLCONV
UParseLineFn(void *context,
              char *fields[][2],
              int32_t fieldCount,
              UErrorCode *pErrorCode);

/**
 * Parser for files that are similar to UnicodeData.txt:
 * This function opens the file and reads it line by line. It skips empty lines
 * and comment lines that start with a '#'.
 * All other lines are separated into fields with one delimiter character
 * (semicolon for Unicode Properties files) between two fields. The last field in
 * a line does not need to be terminated with a delimiter.
 *
 * For each line, after segmenting it, a line function is called.
 * It gets passed the array of field start and limit pointers that is
 * passed into this parser and filled by it for each line.
 * For each field i of the line, the start pointer in fields[i][0]
 * points to the beginning of the field, while the limit pointer in fields[i][1]
 * points behind the field, i.e., to the delimiter or the line end.
 *
 * The context parameter of the line function is
 * the same as the one for the parse function.
 *
 * The line function may modify the contents of the fields including the
 * limit characters.
 *
 * If the file cannot be opened, or there is a parsing error or the line function
 * sets *pErrorCode, then the parser returns with *pErrorCode set to an error code.
 *
 * @param filename   Name of the file to open and parse.
 * @param delimiter  The single character that separates two fields,
 *                   e.g. ';' or TAB.
 * @param fields     Caller-provided array of start/limit pointer pairs; it is
 *                   filled by the parser for each line and handed to lineFn.
 * @param fieldCount Presumably the number of pointer pairs available in
 *                   fields, i.e. the number of fields expected per line --
 *                   TODO confirm how lines with fewer/more fields are handled.
 * @param lineFn     Line function that is called for each segmented line.
 * @param context    Opaque pointer passed through unchanged to lineFn.
 * @param pErrorCode ICU error code; set on failure as described above.
 */
U_CAPI void U_EXPORT2
u_parseDelimitedFile(const char *filename, char delimiter,
                     char *fields[][2], int32_t fieldCount,
                     UParseLineFn *lineFn, void *context,
                     UErrorCode *pErrorCode);

/**
 * Parse a string of code points like 0061 0308 0300.
 * s must end with either ';' or NUL.
 *
 * @param s            Input char * string containing the code points.
 * @param dest         Output buffer; presumably receives one uint32_t per
 *                     parsed code point -- TODO confirm.
 * @param destCapacity Capacity of dest, presumably in numbers of uint32_t
 *                     elements -- TODO confirm.
 * @param pErrorCode   ICU error code.
 * @return Number of code points.
 */
U_CAPI int32_t U_EXPORT2
u_parseCodePoints(const char *s,
                  uint32_t *dest, int32_t destCapacity,
                  UErrorCode *pErrorCode);

/**
 * Parse a list of code points like 0061 0308 0300
 * into a UChar * string.
 * s must end with either ';' or NUL.
 *
 * Optionally reports the first code point via *pFirst.
 *
 * @param s Input char * string.
 * @param dest Output string buffer.
 * @param destCapacity Capacity of dest in numbers of UChars.
 * @param pFirst If pFirst!=NULL then *pFirst will be set to the first
 *               code point in the string.
 * @param pErrorCode ICU error code.
 * @return The length of the string in numbers of UChars.
 */
U_CAPI int32_t U_EXPORT2
u_parseString(const char *s,
              UChar *dest, int32_t destCapacity,
              uint32_t *pFirst,
              UErrorCode *pErrorCode);

/**
 * Parse a code point range like
 * 0085 or
 * 4E00..9FA5.
 *
 * s must contain such a range and end with either ';' or NUL.
 *
 * @param s          Input char * string containing the range.
 * @param pStart     Output parameter; presumably receives the first code
 *                   point of the range -- TODO confirm.
 * @param pEnd       Output parameter; presumably receives the last code point
 *                   of the range (equal to the start for a single code point
 *                   like 0085) -- TODO confirm.
 * @param pErrorCode ICU error code.
 * @return Length of code point range, end-start+1
 */
U_CAPI int32_t U_EXPORT2
u_parseCodePointRange(const char *s,
                      uint32_t *pStart, uint32_t *pEnd,
                      UErrorCode *pErrorCode);

/**
 * Same as u_parseCodePointRange() but the range may be terminated by
 * any character. The position of the terminating character is returned via
 * the *terminator output parameter.
 *
 * @param s          Input char * string containing the range.
 * @param pStart     Output parameter, as for u_parseCodePointRange().
 * @param pEnd       Output parameter, as for u_parseCodePointRange().
 * @param terminator Output parameter; *terminator is set to the position of
 *                   the character that terminates the range.
 * @param pErrorCode ICU error code.
 * @return Length of code point range, as for u_parseCodePointRange().
 */
U_CAPI int32_t U_EXPORT2
u_parseCodePointRangeAnyTerminator(const char *s,
                                   uint32_t *pStart, uint32_t *pEnd,
                                   const char **terminator,
                                   UErrorCode *pErrorCode);

/**
 * NOTE(review): this function is not documented in this header and its exact
 * semantics cannot be determined from the declaration alone; confirm every
 * point below against the implementation before relying on it.
 *
 * @param source       Input char * string.
 * @param sLen         Length of source -- presumably in chars; whether a
 *                     negative value means "NUL-terminated" is unverified.
 * @param dest         Output buffer.
 * @param destCapacity Capacity of dest -- presumably in chars.
 * @param status       ICU error code.
 * @return Presumably the number of chars written to (or required in) dest --
 *         TODO confirm.
 */
U_CAPI int32_t U_EXPORT2
u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status);

U_CDECL_END

#endif
Commit	Line	Data
b75a7d8f A	1	/*
	2	*******************************************************************************
	3	*
729e4ab9	4	* Copyright (C) 2000-2010, International Business Machines
b75a7d8f A	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	* file name: uparse.h
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 2000apr18
	14	* created by: Markus W. Scherer
	15	*
	16	* This file provides a parser for files that are delimited by one single
	17	* character like ';' or TAB. Example: the Unicode Character Properties files
	18	* like UnicodeData.txt are semicolon-delimited.
	19	*/
	20
	21	#ifndef __UPARSE_H__
	22	#define __UPARSE_H__
	23
	24	#include "unicode/utypes.h"
	25
729e4ab9 A	26	/**
	27	* Is c an invariant-character whitespace?
	28	* @param c invariant character
	29	*/
	30	#define U_IS_INV_WHITESPACE(c) ((c)==' ' || (c)=='\t' || (c)=='\r' || (c)=='\n')
	31
b75a7d8f A	32	U_CDECL_BEGIN
	33
	34	/**
	35	* Skip space ' ' and TAB '\t' characters.
	36	*
	37	* @param s Pointer to characters.
	38	* @return Pointer to first character at or after s that is not a space or TAB.
	39	*/
	40	U_CAPI const char * U_EXPORT2
	41	u_skipWhitespace(const char *s);
	42
729e4ab9 A	43	/**
	44	* Trim whitespace (including line endings) from the end of the string.
	45	*
	46	* @param s Pointer to the string.
	47	* @return Pointer to the new end of the string.
	48	*/
	49	U_CAPI char * U_EXPORT2
	50	u_rtrim(char *s);
	51
b75a7d8f A	52	/** Function type for u_parseDelimitedFile(). */
	53	typedef void U_CALLCONV
	54	UParseLineFn(void *context,
	55	char *fields[][2],
	56	int32_t fieldCount,
	57	UErrorCode *pErrorCode);
	58
	59	/**
	60	* Parser for files that are similar to UnicodeData.txt:
	61	* This function opens the file and reads it line by line. It skips empty lines
	62	* and comment lines that start with a '#'.
	63	* All other lines are separated into fields with one delimiter character
	64	* (semicolon for Unicode Properties files) between two fields. The last field in
	65	* a line does not need to be terminated with a delimiter.
	66	*
	67	* For each line, after segmenting it, a line function is called.
	68	* It gets passed the array of field start and limit pointers that is
	69	* passed into this parser and filled by it for each line.
	70	* For each field i of the line, the start pointer in fields[i][0]
	71	* points to the beginning of the field, while the limit pointer in fields[i][1]
	72	* points behind the field, i.e., to the delimiter or the line end.
	73	*
	74	* The context parameter of the line function is
	75	* the same as the one for the parse function.
	76	*
	77	* The line function may modify the contents of the fields including the
	78	* limit characters.
	79	*
	80	* If the file cannot be opened, or there is a parsing error or a field function
	81	* sets *pErrorCode, then the parser returns with *pErrorCode set to an error code.
	82	*/
	83	U_CAPI void U_EXPORT2
	84	u_parseDelimitedFile(const char *filename, char delimiter,
	85	char *fields[][2], int32_t fieldCount,
	86	UParseLineFn *lineFn, void *context,
	87	UErrorCode *pErrorCode);
	88
	89	/**
	90	* Parse a string of code points like 0061 0308 0300.
	91	* s must end with either ';' or NUL.
	92	*
	93	* @return Number of code points.
	94	*/
	95	U_CAPI int32_t U_EXPORT2
	96	u_parseCodePoints(const char *s,
	97	uint32_t *dest, int32_t destCapacity,
	98	UErrorCode *pErrorCode);
	99
	100	/**
	101	* Parse a list of code points like 0061 0308 0300
	102	* into a UChar * string.
	103	* s must end with either ';' or NUL.
	104	*
	105	* Set the first code point in *pFirst.
	106	*
	107	* @param s Input char * string.
	108	* @param dest Output string buffer.
	109	* @param destCapacity Capacity of dest in numbers of UChars.
	110	* @param pFirst If pFirst!=NULL the *pFirst will be set to the first
	111	* code point in the string.
	112	* @param pErrorCode ICU error code.
	113	* @return The length of the string in numbers of UChars.
	114	*/
	115	U_CAPI int32_t U_EXPORT2
116	u_parseString(const char *s,
117	UChar *dest, int32_t destCapacity,
118	uint32_t *pFirst,
119	UErrorCode *pErrorCode);
120
121	/**
122	* Parse a code point range like
123	* 0085 or
124	* 4E00..9FA5.
125	*
126	* s must contain such a range and end with either ';' or NUL.
127	*
128	* @return Length of code point range, end-start+1
129	*/
130	U_CAPI int32_t U_EXPORT2
131	u_parseCodePointRange(const char *s,
132	uint32_t *pStart, uint32_t *pEnd,
133	UErrorCode *pErrorCode);
134
729e4ab9 A	135	/**
	136	* Same as u_parseCodePointRange() but the range may be terminated by
	137	* any character. The position of the terminating character is returned via
	138	* the *terminator output parameter.
	139	*/
	140	U_CAPI int32_t U_EXPORT2
	141	u_parseCodePointRangeAnyTerminator(const char *s,
	142	uint32_t *pStart, uint32_t *pEnd,
	143	const char **terminator,
	144	UErrorCode *pErrorCode);
b75a7d8f	145
b75a7d8f A	146	U_CAPI int32_t U_EXPORT2
	147	u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status);
	148
	149	U_CDECL_END
	150
	151	#endif