+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/*
-* Copyright (C) {1999-2003}, International Business Machines Corporation and others. All Rights Reserved.
+**********************************************************************
+* Copyright (C) 1999-2011, International Business Machines Corporation
+* and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
#include "unicode/utypes.h"
#if !UCONFIG_NO_TRANSLITERATION
+#ifdef __cplusplus
#include "unicode/uobject.h"
#include "unicode/parseerr.h"
#include "unicode/unorm.h"
#include "rbt.h"
+#include "hash.h"
+#include "uvector.h"
U_NAMESPACE_BEGIN
class ParseData;
class RuleHalf;
class ParsePosition;
-class UVector;
class StringMatcher;
class TransliteratorParser : public UMemory {
public:
/**
- * PUBLIC data member containing the parsed data object, or null if
- * there were no rules.
- */
- TransliterationRuleData* data;
-
- /**
- * PUBLIC data member.
- * The block of ::IDs, both at the top and at the bottom.
- * Inserted into these may be additional rules at the
- * idSplitPoint.
+ * A Vector of TransliterationRuleData objects, one for each discrete group
+ * of rules in the rule set
*/
- UnicodeString idBlock;
+ UVector dataVector;
/**
* PUBLIC data member.
- * In a compound RBT, the index at which the RBT rules are
- * inserted into the ID block. Index 0 means before any IDs
- * in the block. Index idBlock.length() means after all IDs
- * in the block. Index is a string index.
+ * A Vector of UnicodeStrings containing all of the ID blocks in the rule set
*/
- int32_t idSplitPoint;
+ UVector idBlockVector;
/**
* PUBLIC data member containing the parsed compound filter, if any.
private:
- // The number of rules parsed. This tells us if there were
- // any actual transliterator rules, or if there were just ::ID
- // block IDs.
- int32_t ruleCount;
-
- UTransDirection direction;
-
/**
- * We use a single error code during parsing. Rather than pass it
- * through each API, we keep it here.
+ * The current data object for which we are parsing rules
*/
- UErrorCode status;
+ TransliterationRuleData* curData;
+
+ UTransDirection direction;
/**
* Parse error information.
* is copied into the array data.variables. As with data.variables,
* element 0 corresponds to character data.variablesBase.
*/
- UVector* variablesVector;
+ UVector variablesVector;
+ /**
+ * Temporary table of variable names. When parsing is complete, this is
+ * copied into data.variableNames.
+ */
+ Hashtable variableNames;
+
/**
* String of standins for segments. Used during the parsing of a single
* rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds
* segmentStandins.charAt(0) is the standin for "$1" and corresponds
* to StringMatcher object segmentObjects.elementAt(0), etc.
*/
- UVector* segmentObjects;
+ UVector segmentObjects;
/**
* The next available stand-in for variables. This starts at some point in
/**
* Constructor.
*/
- TransliteratorParser();
+ TransliteratorParser(UErrorCode &statusReturn);
/**
* Destructor.
*/
UnicodeSet* orphanCompoundFilter();
- /**
- * Return the data object parsed by parse(). Caller owns result.
- * @return the data object parsed by parse().
- */
- TransliterationRuleData* orphanData();
-
private:
/**
* @param direction either FORWARD or REVERSE.
*/
void parseRules(const UnicodeString& rules,
- UTransDirection direction);
+ UTransDirection direction,
+ UErrorCode& status);
/**
* MAIN PARSER. Parse the next rule in the given rule string, starting
* @param limit pointer past the last character of the rule.
* @return the index after the last character parsed.
*/
- int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit);
+ int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
/**
* Set the variable range to [start, end] (inclusive).
* @param start the start value of the range.
* @param end the end value of the range.
*/
- void setVariableRange(int32_t start, int32_t end);
+ void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
/**
* Assert that the given character is NOT within the variable range.
* @return the position index after the final ';' of the pragma,
* or -1 on failure.
*/
- int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit);
+ int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
/**
* Called by main parser upon syntax error. Search the rule string
* @param start position of first character of current rule.
* @return start position of first character of current rule.
*/
- int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start);
+ int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
+ UErrorCode& status);
/**
* Parse a UnicodeSet out, store it, and return the stand-in character
* @return the stand-in character used to represent it.
*/
UChar parseSet(const UnicodeString& rule,
- ParsePosition& pos);
+ ParsePosition& pos,
+ UErrorCode& status);
/**
* Generate and return a stand-in for a new UnicodeFunctor. Store
* @param adopted the UnicodeFunctor to be adopted.
* @return a stand-in for a new UnicodeFunctor.
*/
- UChar generateStandInFor(UnicodeFunctor* adopted);
+ UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
/**
* Return the standin for segment seg (1-based).
* @param seg the given segment.
* @return the standIn character for the given segment.
*/
- UChar getSegmentStandin(int32_t seg);
+ UChar getSegmentStandin(int32_t seg, UErrorCode& status);
/**
* Set the object for segment seg (1-based).
* @param seg the given segment.
* @param adopted the StringMatcher to be adopted.
*/
- void setSegmentObject(int32_t seg, StringMatcher* adopted);
+ void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
/**
* Return the stand-in for the dot set. It is allocated the first
* time and reused thereafter.
* @return the stand-in for the dot set.
*/
- UChar getDotStandIn();
+ UChar getDotStandIn(UErrorCode& status);
/**
* Append the value of the given variable name to the given
* @param buf the given UnicodeString to append to.
*/
void appendVariableDef(const UnicodeString& name,
- UnicodeString& buf);
+ UnicodeString& buf,
+ UErrorCode& status);
/**
* Glue method to get around access restrictions in C++.
*/
- static Transliterator* createBasicInstance(const UnicodeString& id,
- const UnicodeString* canonID);
+ /*static Transliterator* createBasicInstance(const UnicodeString& id,
+ const UnicodeString* canonID);*/
friend class RuleHalf;
U_NAMESPACE_END
+#endif /* #ifdef __cplusplus */
+
+/**
+ * Strip/convert the following from the transliterator rules:
+ * comments
+ * newlines
+ * white space at the beginning and end of a line
+ * unescape \u notation
+ *
+ * The target must be equal in size as the source.
+ * @internal
+ */
+U_CAPI int32_t
+utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);
+
#endif /* #if !UCONFIG_NO_TRANSLITERATION */
#endif