[apple/icu.git] / icuSources / i18n / rbt_pars.h

/*
**********************************************************************
* Copyright (C) 1999-2011, International Business Machines Corporation
* and others. All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/17/99    aliu        Creation.
**********************************************************************
*/
#ifndef RBT_PARS_H
#define RBT_PARS_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION
#ifdef __cplusplus

#include "unicode/uobject.h"
#include "unicode/parseerr.h"
#include "unicode/unorm.h"
#include "rbt.h"
#include "hash.h"
#include "uvector.h"

U_NAMESPACE_BEGIN

// Forward declarations of types that this header refers to only by pointer,
// by reference, or in a friend declaration.
class TransliterationRuleData;
class UnicodeFunctor;
class ParseData;
class RuleHalf;
class ParsePosition;
class StringMatcher;

/**
 * Parser for transliterator rule sets.  After parse() returns, the results
 * are read from the public data members dataVector, idBlockVector and
 * compoundFilter.  The private data members hold state used while parsing.
 * Copying and assignment are disallowed (declared private, no implementation).
 */
class TransliteratorParser : public UMemory {

 public:

    /**
     * PUBLIC data member.
     * A Vector of TransliterationRuleData objects, one for each discrete group
     * of rules in the rule set
     */
    UVector dataVector;

    /**
     * PUBLIC data member.
     * A Vector of UnicodeStrings containing all of the ID blocks in the rule set
     */
    UVector idBlockVector;

    /**
     * PUBLIC data member containing the parsed compound filter, if any.
     * See also orphanCompoundFilter().
     */
    UnicodeSet* compoundFilter;

 private:

    /**
     * The current data object for which we are parsing rules
     */
    TransliterationRuleData* curData;

    /**
     * Direction of the rules being parsed.
     * NOTE(review): presumably set from the 'direction' argument of
     * parse() -- confirm against the implementation.
     */
    UTransDirection direction;

    /**
     * Parse error information.
     */
    UParseError parseError;

    /**
     * Temporary symbol table used during parsing.
     */
    ParseData* parseData;

    /**
     * Temporary vector of matcher variables.  When parsing is complete, this
     * is copied into the array data.variables.  As with data.variables,
     * element 0 corresponds to character data.variablesBase.
     * NOTE(review): "data" presumably refers to the TransliterationRuleData
     * currently being built (curData) -- confirm.
     */
    UVector variablesVector;

    /**
     * Temporary table of variable names.  When parsing is complete, this is
     * copied into data.variableNames.
     * NOTE(review): "data" presumably refers to curData -- confirm.
     */
    Hashtable variableNames;    
    
    /**
     * String of standins for segments.  Used during the parsing of a single
     * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
     * to StringMatcher object segmentObjects.elementAt(0), etc.
     */
    UnicodeString segmentStandins;

    /**
     * Vector of StringMatcher objects for segments.  Used during the
     * parsing of a single rule.
     * segmentObjects.elementAt(0) is the StringMatcher for "$1" and
     * corresponds to the standin segmentStandins.charAt(0), etc.
     */
    UVector segmentObjects;

    /**
     * The next available stand-in for variables.  This starts at some point in
     * the private use area (discovered dynamically) and increments up toward
     * <code>variableLimit</code>.  At any point during parsing, available
     * variables are <code>variableNext..variableLimit-1</code>.
     */
    UChar variableNext;

    /**
     * The exclusive upper limit of the available stand-ins for variables.
     * This is discovered dynamically.  At any point during parsing, available
     * variables are <code>variableNext..variableLimit-1</code>.
     */
    UChar variableLimit;

    /**
     * When we encounter an undefined variable, we do not immediately signal
     * an error, in case we are defining this variable, e.g., "$a = [a-z];".
     * Instead, we save the name of the undefined variable, and substitute
     * in the placeholder char variableLimit - 1, and decrement
     * variableLimit.
     */
    UnicodeString undefinedVariableName;

    /**
     * The stand-in character for the 'dot' set, represented by '.' in
     * patterns.  This is allocated the first time it is needed, and
     * reused thereafter.  See getDotStandIn().
     */
    UChar dotStandIn;

public:

    /**
     * Constructor.
     * @param statusReturn  error code, passed by reference.
     *                      NOTE(review): presumably set to a failure code if
     *                      construction fails -- confirm.
     */
    TransliteratorParser(UErrorCode &statusReturn);

    /**
     * Destructor.
     */
    ~TransliteratorParser();

    /**
     * Parse the given string as a sequence of rules, in the given direction,
     * and cause this object to implement those rules.  Any previous rules
     * are discarded.  Typically this method is called exactly once after
     * construction.
     *
     * After this call returns, query the public data members for results.
     * The caller owns the 'data' and 'compoundFilter' data members after
     * this call returns.
     * NOTE(review): there is no member named 'data' in this class; the
     * sentence above presumably refers to the contents of dataVector --
     * confirm.  Earlier revisions of this comment described the rules both
     * as separated by newline characters ('\n') and as separated by ';';
     * confirm the actual separator against the implementation.
     * @param rules      the rules to parse.
     * @param direction  either FORWARD or REVERSE.
     * @param pe         Struct to receive information on position
     *                   of error if an error is encountered
     * @param ec         Output param set to success/failure code.
     */
    void parse(const UnicodeString& rules,
               UTransDirection direction,
               UParseError& pe,
               UErrorCode& ec);

    /**
     * Return the compound filter parsed by parse().  Caller owns result.
     * @return the compound filter parsed by parse().
     */ 
    UnicodeSet* orphanCompoundFilter();

private:

    /**
     * Parse the given rules in the given direction.
     * NOTE(review): the previous comment here ("Return a representation of
     * this transliterator as source rules", with 'rules' as an output
     * param) did not match this signature: 'rules' is a const input.
     * Presumably this is the worker called by parse() -- confirm.
     * @param rules      the rules to parse (input).
     * @param direction  either FORWARD or REVERSE.
     * @param status     error code, passed by reference.
     */
    void parseRules(const UnicodeString& rules,
                    UTransDirection direction,
                    UErrorCode& status);

    /**
     * MAIN PARSER.  Parse the next rule in the given rule string, starting
     * at pos.  Return the index after the last character parsed.  Do not
     * parse characters at or after limit.
     *
     * Important:  The character at pos must be a non-whitespace character
     * that is not the comment character.
     *
     * This method handles quoting, escaping, and whitespace removal.  It
     * parses the end-of-rule character.  It recognizes context and cursor
     * indicators.  Once it does a lexical breakdown of the rule at pos, it
     * creates a rule object and adds it to our rule list.
     * @param rule       the rule string to parse (input).
     * @param pos        the starting position.
     * @param limit      index past the last character of the rule.
     * @param status     error code, passed by reference.
     * @return           the index after the last character parsed.
     */
    int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);

    /**
     * Set the variable range to [start, end] (inclusive).
     * @param start    the start value of the range.
     * @param end      the end value of the range.
     * @param status   error code, passed by reference.
     */
    void setVariableRange(int32_t start, int32_t end, UErrorCode& status);

    /**
     * Check that the given character is NOT within the variable range.
     * If it is, return FALSE.  This is necessary to ensure that the
     * variable range does not overlap characters used in a rule.
     * @param ch     the given character.
     * @return       True, if the given character is NOT within the variable range.
     */
    UBool checkVariableRange(UChar32 ch) const;

    /**
     * Set the maximum backup to 'backup', in response to a pragma
     * statement.
     * @param backup    the new value to be set.
     */
    void pragmaMaximumBackup(int32_t backup);

    /**
     * Begin normalizing all rules using the given mode, in response
     * to a pragma statement.
     * @param mode    the given mode.
     */
    void pragmaNormalizeRules(UNormalizationMode mode);

    /**
     * Return true if the given rule looks like a pragma.
     * @param rule  the rule string to inspect.
     * @param pos offset to the first non-whitespace character
     * of the rule.
     * @param limit index past the last character of the rule.
     * @return true if the given rule looks like a pragma.
     */
    static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);

    /**
     * Parse a pragma.  This method assumes resemblesPragma() has
     * already returned true.
     * @param rule  the rule string containing the pragma.
     * @param pos offset to the first non-whitespace character
     * of the rule.
     * @param limit index past the last character of the rule.
     * @param status error code, passed by reference.
     * @return the position index after the final ';' of the pragma,
     * or -1 on failure.
     */
    int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);

    /**
     * Called by main parser upon syntax error.  Search the rule string
     * for the probable end of the rule.  Of course, if the error is that
     * the end of rule marker is missing, then the rule end will not be found.
     * In any case the rule start will be correctly reported.
     * @param parseErrorCode error code.
     * @param (unnamed UnicodeString) the second parameter has no name in
     *        this declaration; it was previously documented as "msg error
     *        description".  NOTE(review): presumably the rule text that is
     *        searched -- confirm against the implementation.
     * @param start position of first character of current rule.
     * @param status error code, passed by reference.
     * @return start position of first character of current rule.
     */
    int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
                        UErrorCode& status);

    /**
     * Parse a UnicodeSet out, store it, and return the stand-in character
     * used to represent it.
     *
     * @param rule    the rule for UnicodeSet.
     * @param pos     the position in pattern at which to start parsing.
     *                Passed by non-const reference; presumably advanced past
     *                the parsed set -- confirm.
     * @param status  error code, passed by reference.
     * @return        the stand-in character used to represent it.
     */
    UChar parseSet(const UnicodeString& rule,
                   ParsePosition& pos,
                   UErrorCode& status);

    /**
     * Generate and return a stand-in for a new UnicodeFunctor.  Store
     * the matcher (adopt it).
     * @param adopted the UnicodeFunctor to be adopted.
     * @param status  error code, passed by reference.
     * @return        a stand-in for a new UnicodeFunctor.
     */
    UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);

    /**
     * Return the standin for segment seg (1-based).
     * @param seg    the given segment.
     * @param status error code, passed by reference.
     * @return       the standIn character for the given segment.
     */
    UChar getSegmentStandin(int32_t seg, UErrorCode& status);

    /**
     * Set the object for segment seg (1-based).
     * @param seg      the given segment.
     * @param adopted  the StringMatcher to be adopted.
     * @param status   error code, passed by reference.
     */
    void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);

    /**
     * Return the stand-in for the dot set.  It is allocated the first
     * time and reused thereafter.
     * @param status  error code, passed by reference.
     * @return    the stand-in for the dot set.
     */
    UChar getDotStandIn(UErrorCode& status);

    /**
     * Append the value of the given variable name to the given
     * UnicodeString.
     * @param name    the name of the variable whose value is appended.
     * @param buf     the given UnicodeString to append to.
     * @param status  error code, passed by reference.
     */
    void appendVariableDef(const UnicodeString& name,
                           UnicodeString& buf,
                           UErrorCode& status);

    /**
     * Glue method to get around access restrictions in C++.
     * The declaration below is commented out and therefore not part of
     * the class.
     */
    /*static Transliterator* createBasicInstance(const UnicodeString& id,
                                               const UnicodeString* canonID);*/

    // RuleHalf is granted access to the private members of this class.
    friend class RuleHalf;

    // Disallowed methods; no impl.
    /**
     * Copy constructor.  Private and unimplemented to disallow copying.
     */
    TransliteratorParser(const TransliteratorParser&);
    
    /**
     * Assignment operator.  Private and unimplemented to disallow assignment.
     */
    TransliteratorParser& operator=(const TransliteratorParser&);
};

U_NAMESPACE_END

#endif /* #ifdef __cplusplus */

/**
 * Strip/convert the following from the transliterator rules:
 * - comments
 * - newlines
 * - white space at the beginning and end of a line
 * - \u notation (which is unescaped)
 *
 * The target buffer must be the same size as the source.
 * @param source     the transliterator rules to process (input).
 * @param sourceLen  length of source.  NOTE(review): presumably counted in
 *                   UChars -- confirm.
 * @param target     buffer that receives the processed rules.
 * @param status     pointer to the error code.
 * @return NOTE(review): presumably the length of the text written to
 *         target -- confirm against the implementation.
 * @internal
 */
U_CAPI int32_t
utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif
Commit	Line	Data
b75a7d8f	1	/*
73c04bcf	2	**********************************************************************
4388f060	3	* Copyright (C) 1999-2011, International Business Machines Corporation
73c04bcf	4	* and others. All Rights Reserved.
b75a7d8f A	5	**********************************************************************
	6	* Date Name Description
	7	* 11/17/99 aliu Creation.
	8	**********************************************************************
	9	*/
	10	#ifndef RBT_PARS_H
	11	#define RBT_PARS_H
	12
	13	#include "unicode/utypes.h"
	14
	15	#if !UCONFIG_NO_TRANSLITERATION
4388f060	16	#ifdef __cplusplus
b75a7d8f A	17
	18	#include "unicode/uobject.h"
	19	#include "unicode/parseerr.h"
	20	#include "unicode/unorm.h"
	21	#include "rbt.h"
73c04bcf A	22	#include "hash.h"
73c04bcf A	23	#include "uvector.h"
b75a7d8f A	24
	25	U_NAMESPACE_BEGIN
	26
	27	class TransliterationRuleData;
	28	class UnicodeFunctor;
	29	class ParseData;
	30	class RuleHalf;
	31	class ParsePosition;
b75a7d8f A	32	class StringMatcher;
	33
	34	class TransliteratorParser : public UMemory {
	35
	36	public:
	37
	38	/**
73c04bcf A	39	* A Vector of TransliterationRuleData objects, one for each discrete group
73c04bcf A	40	* of rules in the rule set
b75a7d8f	41	*/
73c04bcf	42	UVector dataVector;
b75a7d8f A	43
	44	/**
	45	* PUBLIC data member.
73c04bcf	46	* A Vector of UnicodeStrings containing all of the ID blocks in the rule set
b75a7d8f	47	*/
73c04bcf	48	UVector idBlockVector;
b75a7d8f A	49
	50	/**
	51	* PUBLIC data member containing the parsed compound filter, if any.
	52	*/
	53	UnicodeSet* compoundFilter;
	54
	55	private:
	56
b75a7d8f	57	/**
73c04bcf	58	* The current data object for which we are parsing rules
b75a7d8f	59	*/
73c04bcf A	60	TransliterationRuleData* curData;
	61
	62	UTransDirection direction;
b75a7d8f A	63
	64	/**
	65	* Parse error information.
	66	*/
	67	UParseError parseError;
	68
	69	/**
	70	* Temporary symbol table used during parsing.
	71	*/
	72	ParseData* parseData;
	73
	74	/**
	75	* Temporary vector of matcher variables. When parsing is complete, this
	76	* is copied into the array data.variables. As with data.variables,
	77	* element 0 corresponds to character data.variablesBase.
	78	*/
73c04bcf	79	UVector variablesVector;
b75a7d8f	80
73c04bcf A	81	/**
	82	* Temporary table of variable names. When parsing is complete, this is
	83	* copied into data.variableNames.
	84	*/
	85	Hashtable variableNames;
	86
b75a7d8f A	87	/**
	88	* String of standins for segments. Used during the parsing of a single
	89	* rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds
	90	* to StringMatcher object segmentObjects.elementAt(0), etc.
	91	*/
	92	UnicodeString segmentStandins;
	93
	94	/**
	95	* Vector of StringMatcher objects for segments. Used during the
	96	* parsing of a single rule.
	97	* segmentStandins.charAt(0) is the standin for "$1" and corresponds
	98	* to StringMatcher object segmentObjects.elementAt(0), etc.
	99	*/
73c04bcf	100	UVector segmentObjects;
b75a7d8f A	101
	102	/**
	103	* The next available stand-in for variables. This starts at some point in
	104	* the private use area (discovered dynamically) and increments up toward
	105	* <code>variableLimit</code>. At any point during parsing, available
	106	* variables are <code>variableNext..variableLimit-1</code>.
	107	*/
	108	UChar variableNext;
	109
	110	/**
	111	* The last available stand-in for variables. This is discovered
	112	* dynamically. At any point during parsing, available variables are
	113	* <code>variableNext..variableLimit-1</code>.
	114	*/
	115	UChar variableLimit;
	116
	117	/**
	118	* When we encounter an undefined variable, we do not immediately signal
	119	* an error, in case we are defining this variable, e.g., "$a = [a-z];".
	120	* Instead, we save the name of the undefined variable, and substitute
	121	* in the placeholder char variableLimit - 1, and decrement
	122	* variableLimit.
	123	*/
	124	UnicodeString undefinedVariableName;
	125
	126	/**
	127	* The stand-in character for the 'dot' set, represented by '.' in
	128	* patterns. This is allocated the first time it is needed, and
	129	* reused thereafter.
	130	*/
	131	UChar dotStandIn;
	132
	133	public:
	134
	135	/**
	136	* Constructor.
	137	*/
73c04bcf	138	TransliteratorParser(UErrorCode &statusReturn);
b75a7d8f A	139
	140	/**
	141	* Destructor.
	142	*/
	143	~TransliteratorParser();
	144
	145	/**
	146	* Parse the given string as a sequence of rules, separated by newline
	147	* characters ('\n'), and cause this object to implement those rules. Any
	148	* previous rules are discarded. Typically this method is called exactly
	149	* once after construction.
	150	*
	151	* Parse the given rules, in the given direction. After this call
	152	* returns, query the public data members for results. The caller
	153	* owns the 'data' and 'compoundFilter' data members after this
	154	* call returns.
	155	* @param rules rules, separated by ';'
	156	* @param direction either FORWARD or REVERSE.
	157	* @param pe Struct to recieve information on position
	158	* of error if an error is encountered
	159	* @param ec Output param set to success/failure code.
	160	*/
	161	void parse(const UnicodeString& rules,
	162	UTransDirection direction,
	163	UParseError& pe,
	164	UErrorCode& ec);
	165
	166	/**
	167	* Return the compound filter parsed by parse(). Caller owns result.
	168	* @return the compound filter parsed by parse().
	169	*/
	170	UnicodeSet* orphanCompoundFilter();
	171
b75a7d8f A	172	private:
	173
	174	/**
	175	* Return a representation of this transliterator as source rules.
	176	* @param rules Output param to receive the rules.
	177	* @param direction either FORWARD or REVERSE.
	178	*/
	179	void parseRules(const UnicodeString& rules,
73c04bcf A	180	UTransDirection direction,
73c04bcf A	181	UErrorCode& status);
b75a7d8f A	182
	183	/**
	184	* MAIN PARSER. Parse the next rule in the given rule string, starting
	185	* at pos. Return the index after the last character parsed. Do not
	186	* parse characters at or after limit.
	187	*
	188	* Important: The character at pos must be a non-whitespace character
	189	* that is not the comment character.
	190	*
	191	* This method handles quoting, escaping, and whitespace removal. It
	192	* parses the end-of-rule character. It recognizes context and cursor
	193	* indicators. Once it does a lexical breakdown of the rule at pos, it
	194	* creates a rule object and adds it to our rule list.
	195	* @param rules Output param to receive the rules.
	196	* @param pos the starting position.
	197	* @param limit pointer past the last character of the rule.
	198	* @return the index after the last character parsed.
	199	*/
73c04bcf	200	int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
b75a7d8f A	201
	202	/**
	203	* Set the variable range to [start, end] (inclusive).
	204	* @param start the start value of the range.
	205	* @param end the end value of the range.
	206	*/
73c04bcf	207	void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
b75a7d8f A	208
	209	/**
	210	* Assert that the given character is NOT within the variable range.
	211	* If it is, return FALSE. This is neccesary to ensure that the
	212	* variable range does not overlap characters used in a rule.
	213	* @param ch the given character.
	214	* @return True, if the given character is NOT within the variable range.
	215	*/
	216	UBool checkVariableRange(UChar32 ch) const;
	217
	218	/**
	219	* Set the maximum backup to 'backup', in response to a pragma
	220	* statement.
	221	* @param backup the new value to be set.
	222	*/
	223	void pragmaMaximumBackup(int32_t backup);
	224
	225	/**
	226	* Begin normalizing all rules using the given mode, in response
	227	* to a pragma statement.
	228	* @param mode the given mode.
	229	*/
	230	void pragmaNormalizeRules(UNormalizationMode mode);
	231
	232	/**
	233	* Return true if the given rule looks like a pragma.
	234	* @param pos offset to the first non-whitespace character
	235	* of the rule.
	236	* @param limit pointer past the last character of the rule.
	237	* @return true if the given rule looks like a pragma.
	238	*/
	239	static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
	240
	241	/**
	242	* Parse a pragma. This method assumes resemblesPragma() has
	243	* already returned true.
	244	* @param pos offset to the first non-whitespace character
	245	* of the rule.
	246	* @param limit pointer past the last character of the rule.
	247	* @return the position index after the final ';' of the pragma,
	248	* or -1 on failure.
	249	*/
73c04bcf	250	int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
b75a7d8f A	251
	252	/**
	253	* Called by main parser upon syntax error. Search the rule string
	254	* for the probable end of the rule. Of course, if the error is that
	255	* the end of rule marker is missing, then the rule end will not be found.
	256	* In any case the rule start will be correctly reported.
	257	* @param parseErrorCode error code.
	258	* @param msg error description.
	259	* @param start position of first character of current rule.
	260	* @return start position of first character of current rule.
	261	*/
73c04bcf A	262	int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
73c04bcf A	263	UErrorCode& status);
b75a7d8f A	264
	265	/**
	266	* Parse a UnicodeSet out, store it, and return the stand-in character
	267	* used to represent it.
	268	*
	269	* @param rule the rule for UnicodeSet.
	270	* @param pos the position in pattern at which to start parsing.
	271	* @return the stand-in character used to represent it.
	272	*/
	273	UChar parseSet(const UnicodeString& rule,
73c04bcf A	274	ParsePosition& pos,
73c04bcf A	275	UErrorCode& status);
b75a7d8f A	276
	277	/**
	278	* Generate and return a stand-in for a new UnicodeFunctor. Store
	279	* the matcher (adopt it).
	280	* @param adopted the UnicodeFunctor to be adopted.
	281	* @return a stand-in for a new UnicodeFunctor.
	282	*/
73c04bcf	283	UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
b75a7d8f A	284
	285	/**
	286	* Return the standin for segment seg (1-based).
	287	* @param seg the given segment.
	288	* @return the standIn character for the given segment.
	289	*/
73c04bcf	290	UChar getSegmentStandin(int32_t seg, UErrorCode& status);
b75a7d8f A	291
	292	/**
	293	* Set the object for segment seg (1-based).
	294	* @param seg the given segment.
	295	* @param adopted the StringMatcher to be adopted.
	296	*/
73c04bcf	297	void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
b75a7d8f A	298
	299	/**
	300	* Return the stand-in for the dot set. It is allocated the first
	301	* time and reused thereafter.
	302	* @return the stand-in for the dot set.
	303	*/
73c04bcf	304	UChar getDotStandIn(UErrorCode& status);
b75a7d8f A	305
	306	/**
	307	* Append the value of the given variable name to the given
	308	* UnicodeString.
	309	* @param name the variable name to be appended.
	310	* @param buf the given UnicodeString to append to.
	311	*/
	312	void appendVariableDef(const UnicodeString& name,
73c04bcf A	313	UnicodeString& buf,
73c04bcf A	314	UErrorCode& status);
b75a7d8f A	315
	316	/**
	317	* Glue method to get around access restrictions in C++.
	318	*/
46f4442e A	319	/static Transliterator createBasicInstance(const UnicodeString& id,
46f4442e A	320	const UnicodeString* canonID);*/
b75a7d8f A	321
	322	friend class RuleHalf;
	323
	324	// Disallowed methods; no impl.
	325	/**
	326	* Copy constructor
	327	*/
	328	TransliteratorParser(const TransliteratorParser&);
	329
	330	/**
	331	* Assignment operator
	332	*/
	333	TransliteratorParser& operator=(const TransliteratorParser&);
	334	};
	335
	336	U_NAMESPACE_END
	337
4388f060	338	#endif /* #ifdef __cplusplus */
73c04bcf A	339
	340	/**
	341	* Strip/convert the following from the transliterator rules:
	342	* comments
	343	* newlines
	344	* white space at the beginning and end of a line
	345	* unescape \u notation
	346	*
	347	* The target must be equal in size as the source.
	348	* @internal
	349	*/
	350	U_CAPI int32_t
	351	utrans_stripRules(const UChar source, int32_t sourceLen, UChar target, UErrorCode *status);
	352
b75a7d8f A	353	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
	354
	355	#endif