git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	// © 2016 and later: Unicode, Inc. and others.
	2	// License & terms of use: http://www.unicode.org/copyright.html
	3	/*
	4	**********************************************************************
	5	* Copyright (C) 1999-2011, International Business Machines Corporation
	6	* and others. All Rights Reserved.
	7	**********************************************************************
	8	* Date Name Description
	9	* 11/17/99 aliu Creation.
	10	**********************************************************************
	11	*/
	12	#ifndef RBT_PARS_H
	13	#define RBT_PARS_H
	14
	15	#include "unicode/utypes.h"
	16
	17	#if !UCONFIG_NO_TRANSLITERATION
	18	#ifdef __cplusplus
	19
	20	#include "unicode/uobject.h"
	21	#include "unicode/parseerr.h"
	22	#include "unicode/unorm.h"
	23	#include "rbt.h"
	24	#include "hash.h"
	25	#include "uvector.h"
	26
	27	U_NAMESPACE_BEGIN
	28
	29	class TransliterationRuleData;
	30	class UnicodeFunctor;
	31	class ParseData;
	32	class RuleHalf;
	33	class ParsePosition;
	34	class StringMatcher;
	35
	36	class TransliteratorParser : public UMemory {
	37
	38	public:
	39
	40	/**
	41	* A Vector of TransliterationRuleData objects, one for each discrete group
	42	* of rules in the rule set
	43	*/
	44	UVector dataVector;
	45
	46	/**
	47	* PUBLIC data member.
	48	* A Vector of UnicodeStrings containing all of the ID blocks in the rule set
	49	*/
	50	UVector idBlockVector;
	51
	52	/**
	53	* PUBLIC data member containing the parsed compound filter, if any.
	54	*/
	55	UnicodeSet* compoundFilter;
	56
	57	private:
	58
	59	/**
	60	* The current data object for which we are parsing rules
	61	*/
	62	TransliterationRuleData* curData;
	63
	64	UTransDirection direction;
	65
	66	/**
	67	* Parse error information.
	68	*/
	69	UParseError parseError;
	70
	71	/**
	72	* Temporary symbol table used during parsing.
	73	*/
	74	ParseData* parseData;
	75
	76	/**
	77	* Temporary vector of matcher variables. When parsing is complete, this
	78	* is copied into the array data.variables. As with data.variables,
	79	* element 0 corresponds to character data.variablesBase.
	80	*/
	81	UVector variablesVector;
	82
	83	/**
	84	* Temporary table of variable names. When parsing is complete, this is
	85	* copied into data.variableNames.
	86	*/
	87	Hashtable variableNames;
	88
	89	/**
	90	* String of standins for segments. Used during the parsing of a single
	91	* rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds
	92	* to StringMatcher object segmentObjects.elementAt(0), etc.
	93	*/
	94	UnicodeString segmentStandins;
	95
	96	/**
	97	* Vector of StringMatcher objects for segments. Used during the
	98	* parsing of a single rule.
	99	* segmentStandins.charAt(0) is the standin for "$1" and corresponds
	100	* to StringMatcher object segmentObjects.elementAt(0), etc.
	101	*/
	102	UVector segmentObjects;
	103
	104	/**
	105	* The next available stand-in for variables. This starts at some point in
	106	* the private use area (discovered dynamically) and increments up toward
	107	* <code>variableLimit</code>. At any point during parsing, available
	108	* variables are <code>variableNext..variableLimit-1</code>.
	109	*/
	110	UChar variableNext;
	111
	112	/**
	113	* The last available stand-in for variables. This is discovered
	114	* dynamically. At any point during parsing, available variables are
	115	* <code>variableNext..variableLimit-1</code>.
	116	*/
	117	UChar variableLimit;
	118
	119	/**
	120	* When we encounter an undefined variable, we do not immediately signal
	121	* an error, in case we are defining this variable, e.g., "$a = [a-z];".
	122	* Instead, we save the name of the undefined variable, and substitute
	123	* in the placeholder char variableLimit - 1, and decrement
	124	* variableLimit.
	125	*/
	126	UnicodeString undefinedVariableName;
	127
	128	/**
	129	* The stand-in character for the 'dot' set, represented by '.' in
	130	* patterns. This is allocated the first time it is needed, and
	131	* reused thereafter.
	132	*/
	133	UChar dotStandIn;
	134
	135	public:
	136
	137	/**
	138	* Constructor.
	139	*/
	140	TransliteratorParser(UErrorCode &statusReturn);
	141
	142	/**
	143	* Destructor.
	144	*/
	145	~TransliteratorParser();
	146
	147	/**
	148	* Parse the given string as a sequence of rules, separated by newline
	149	* characters ('\n'), and cause this object to implement those rules. Any
	150	* previous rules are discarded. Typically this method is called exactly
	151	* once after construction.
	152	*
	153	* Parse the given rules, in the given direction. After this call
	154	* returns, query the public data members for results. The caller
	155	* owns the 'data' and 'compoundFilter' data members after this
	156	* call returns.
	157	* @param rules rules, separated by ';'
	158	* @param direction either FORWARD or REVERSE.
	159	* @param pe Struct to recieve information on position
	160	* of error if an error is encountered
	161	* @param ec Output param set to success/failure code.
	162	*/
	163	void parse(const UnicodeString& rules,
	164	UTransDirection direction,
	165	UParseError& pe,
	166	UErrorCode& ec);
	167
	168	/**
	169	* Return the compound filter parsed by parse(). Caller owns result.
	170	* @return the compound filter parsed by parse().
	171	*/
	172	UnicodeSet* orphanCompoundFilter();
	173
	174	private:
	175
	176	/**
	177	* Return a representation of this transliterator as source rules.
	178	* @param rules Output param to receive the rules.
	179	* @param direction either FORWARD or REVERSE.
	180	*/
	181	void parseRules(const UnicodeString& rules,
	182	UTransDirection direction,
	183	UErrorCode& status);
	184
	185	/**
	186	* MAIN PARSER. Parse the next rule in the given rule string, starting
	187	* at pos. Return the index after the last character parsed. Do not
	188	* parse characters at or after limit.
	189	*
	190	* Important: The character at pos must be a non-whitespace character
	191	* that is not the comment character.
	192	*
	193	* This method handles quoting, escaping, and whitespace removal. It
	194	* parses the end-of-rule character. It recognizes context and cursor
	195	* indicators. Once it does a lexical breakdown of the rule at pos, it
	196	* creates a rule object and adds it to our rule list.
	197	* @param rules Output param to receive the rules.
	198	* @param pos the starting position.
	199	* @param limit pointer past the last character of the rule.
	200	* @return the index after the last character parsed.
	201	*/
	202	int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
	203
	204	/**
	205	* Set the variable range to [start, end] (inclusive).
	206	* @param start the start value of the range.
	207	* @param end the end value of the range.
	208	*/
	209	void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
	210
	211	/**
	212	* Assert that the given character is NOT within the variable range.
	213	* If it is, return FALSE. This is neccesary to ensure that the
	214	* variable range does not overlap characters used in a rule.
	215	* @param ch the given character.
	216	* @return True, if the given character is NOT within the variable range.
	217	*/
	218	UBool checkVariableRange(UChar32 ch) const;
	219
	220	/**
	221	* Set the maximum backup to 'backup', in response to a pragma
	222	* statement.
	223	* @param backup the new value to be set.
	224	*/
	225	void pragmaMaximumBackup(int32_t backup);
	226
	227	/**
	228	* Begin normalizing all rules using the given mode, in response
	229	* to a pragma statement.
	230	* @param mode the given mode.
	231	*/
	232	void pragmaNormalizeRules(UNormalizationMode mode);
	233
	234	/**
	235	* Return true if the given rule looks like a pragma.
	236	* @param pos offset to the first non-whitespace character
	237	* of the rule.
	238	* @param limit pointer past the last character of the rule.
	239	* @return true if the given rule looks like a pragma.
	240	*/
	241	static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
	242
	243	/**
	244	* Parse a pragma. This method assumes resemblesPragma() has
	245	* already returned true.
	246	* @param pos offset to the first non-whitespace character
	247	* of the rule.
	248	* @param limit pointer past the last character of the rule.
	249	* @return the position index after the final ';' of the pragma,
	250	* or -1 on failure.
	251	*/
	252	int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
	253
	254	/**
	255	* Called by main parser upon syntax error. Search the rule string
	256	* for the probable end of the rule. Of course, if the error is that
	257	* the end of rule marker is missing, then the rule end will not be found.
	258	* In any case the rule start will be correctly reported.
	259	* @param parseErrorCode error code.
	260	* @param msg error description.
	261	* @param start position of first character of current rule.
	262	* @return start position of first character of current rule.
	263	*/
	264	int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
	265	UErrorCode& status);
	266
	267	/**
	268	* Parse a UnicodeSet out, store it, and return the stand-in character
	269	* used to represent it.
	270	*
	271	* @param rule the rule for UnicodeSet.
	272	* @param pos the position in pattern at which to start parsing.
	273	* @return the stand-in character used to represent it.
	274	*/
	275	UChar parseSet(const UnicodeString& rule,
	276	ParsePosition& pos,
	277	UErrorCode& status);
	278
	279	/**
	280	* Generate and return a stand-in for a new UnicodeFunctor. Store
	281	* the matcher (adopt it).
	282	* @param adopted the UnicodeFunctor to be adopted.
	283	* @return a stand-in for a new UnicodeFunctor.
	284	*/
	285	UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
	286
	287	/**
	288	* Return the standin for segment seg (1-based).
	289	* @param seg the given segment.
	290	* @return the standIn character for the given segment.
	291	*/
	292	UChar getSegmentStandin(int32_t seg, UErrorCode& status);
	293
	294	/**
	295	* Set the object for segment seg (1-based).
	296	* @param seg the given segment.
	297	* @param adopted the StringMatcher to be adopted.
	298	*/
	299	void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
	300
	301	/**
	302	* Return the stand-in for the dot set. It is allocated the first
	303	* time and reused thereafter.
	304	* @return the stand-in for the dot set.
	305	*/
	306	UChar getDotStandIn(UErrorCode& status);
	307
	308	/**
	309	* Append the value of the given variable name to the given
	310	* UnicodeString.
	311	* @param name the variable name to be appended.
	312	* @param buf the given UnicodeString to append to.
	313	*/
	314	void appendVariableDef(const UnicodeString& name,
	315	UnicodeString& buf,
	316	UErrorCode& status);
	317
	318	/**
	319	* Glue method to get around access restrictions in C++.
	320	*/
	321	/static Transliterator createBasicInstance(const UnicodeString& id,
	322	const UnicodeString* canonID);*/
	323
	324	friend class RuleHalf;
	325
	326	// Disallowed methods; no impl.
	327	/**
	328	* Copy constructor
	329	*/
	330	TransliteratorParser(const TransliteratorParser&);
	331
	332	/**
	333	* Assignment operator
	334	*/
	335	TransliteratorParser& operator=(const TransliteratorParser&);
	336	};
	337
	338	U_NAMESPACE_END
	339
	340	#endif /* #ifdef __cplusplus */
	341
	342	/**
	343	* Strip/convert the following from the transliterator rules:
	344	* comments
	345	* newlines
	346	* white space at the beginning and end of a line
	347	* unescape \u notation
	348	*
	349	* The target must be equal in size as the source.
	350	* @internal
	351	*/
	352	U_CAPI int32_t
	353	utrans_stripRules(const UChar source, int32_t sourceLen, UChar target, UErrorCode *status);
	354
	355	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
	356
	357	#endif