icuSources/i18n/rbt_pars.h

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 * Copyright (C) 1999-2011, International Business Machines Corporation
   6 * and others. All Rights Reserved.
   7 **********************************************************************
   8 *   Date        Name        Description
   9 *   11/17/99    aliu        Creation.
  10 **********************************************************************
  11 */
  12 #ifndef RBT_PARS_H
  13 #define RBT_PARS_H
  14
  15 #include "unicode/utypes.h"
  16
  17 #if !UCONFIG_NO_TRANSLITERATION
  18 #ifdef __cplusplus
  19
  20 #include "unicode/uobject.h"
  21 #include "unicode/parseerr.h"
  22 #include "unicode/unorm.h"
  23 #include "rbt.h"
  24 #include "hash.h"
  25 #include "uvector.h"
  26
  27 U_NAMESPACE_BEGIN
  28
  29 class TransliterationRuleData;
  30 class UnicodeFunctor;
  31 class ParseData;
  32 class RuleHalf;
  33 class ParsePosition;
  34 class StringMatcher;
  35
  36 class TransliteratorParser : public UMemory {
  37     // Parses transliterator rule strings via parse(); results are read from the public data members.
  38  public:
  39
  40     /**
  41      * PUBLIC data member.  A Vector of TransliterationRuleData objects, one
  42      * for each discrete group of rules in the rule set.
  43      */
  44     UVector dataVector;
  45
  46     /**
  47      * PUBLIC data member.
  48      * A Vector of UnicodeStrings containing all of the ID blocks in the rule set
  49      */
  50     UVector idBlockVector;
  51
  52     /**
  53      * PUBLIC data member containing the parsed compound filter, if any.
  54      */
  55     UnicodeSet* compoundFilter;
  56
  57  private:
  58
  59     /**
  60      * The current data object for which we are parsing rules
  61      */
  62     TransliterationRuleData* curData;
  63     // Parse direction (FORWARD or REVERSE); presumably recorded from the argument to parse() -- TODO confirm.
  64     UTransDirection direction;
  65
  66     /**
  67      * Parse error information.
  68      */
  69     UParseError parseError;
  70
  71     /**
  72      * Temporary symbol table used during parsing.
  73      */
  74     ParseData* parseData;
  75
  76     /**
  77      * Temporary vector of matcher variables.  When parsing is complete, this
  78      * is copied into the array data.variables.  As with data.variables,
  79      * element 0 corresponds to character data.variablesBase.
  80      */
  81     UVector variablesVector;
  82
  83     /**
  84      * Temporary table of variable names.  When parsing is complete, this is
  85      * copied into data.variableNames.
  86      */
  87     Hashtable variableNames;
  88
  89     /**
  90      * String of standins for segments.  Used during the parsing of a single
  91      * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
  92      * to StringMatcher object segmentObjects.elementAt(0), etc.
  93      */
  94     UnicodeString segmentStandins;
  95
  96     /**
  97      * Vector of StringMatcher objects for segments.  Used during the
  98      * parsing of a single rule.
  99      * segmentStandins.charAt(0) is the standin for "$1" and corresponds
 100      * to StringMatcher object segmentObjects.elementAt(0), etc.
 101      */
 102     UVector segmentObjects;
 103
 104     /**
 105      * The next available stand-in for variables.  This starts at some point in
 106      * the private use area (discovered dynamically) and increments up toward
 107      * <code>variableLimit</code>.  At any point during parsing, available
 108      * variables are <code>variableNext..variableLimit-1</code>.
 109      */
 110     UChar variableNext;
 111
 112     /**
 113      * The (exclusive) limit of the stand-ins for variables.  This is discovered
 114      * dynamically.  At any point during parsing, available variables are
 115      * <code>variableNext..variableLimit-1</code>.
 116      */
 117     UChar variableLimit;
 118
 119     /**
 120      * When we encounter an undefined variable, we do not immediately signal
 121      * an error, in case we are defining this variable, e.g., "$a = [a-z];".
 122      * Instead, we save the name of the undefined variable, and substitute
 123      * in the placeholder char variableLimit - 1, and decrement
 124      * variableLimit.
 125      */
 126     UnicodeString undefinedVariableName;
 127
 128     /**
 129      * The stand-in character for the 'dot' set, represented by '.' in
 130      * patterns.  This is allocated the first time it is needed, and
 131      * reused thereafter.
 132      */
 133     UChar dotStandIn;
 134
 135 public:
 136
 137     /**
 138      * Constructor.
 139      */
 140     TransliteratorParser(UErrorCode &statusReturn);
 141
 142     /**
 143      * Destructor.
 144      */
 145     ~TransliteratorParser();
 146
 147     /**
 148      * Parse the given string as a sequence of rules, separated by newline
 149      * characters ('\n'), and cause this object to implement those rules.  Any
 150      * previous rules are discarded.  Typically this method is called exactly
 151      * once after construction.
 152      *
 153      * Parse the given rules, in the given direction.  After this call
 154      * returns, query the public data members for results.  The caller
 155      * owns the 'data' and 'compoundFilter' data members after this
 156      * call returns.  NOTE(review): no member named 'data' is declared in this class; this presumably means the contents of 'dataVector' -- confirm.
 157      * @param rules      rules, separated by ';'
 158      * @param direction  either FORWARD or REVERSE.
 159      * @param pe         Struct to receive information on position
 160      *                   of error if an error is encountered
 161      * @param ec         Output param set to success/failure code.
 162      */
 163     void parse(const UnicodeString& rules,
 164                UTransDirection direction,
 165                UParseError& pe,
 166                UErrorCode& ec);
 167
 168     /**
 169      * Return the compound filter parsed by parse().  Caller owns result.
 170      * @return the compound filter parsed by parse().
 171      */
 172     UnicodeSet* orphanCompoundFilter();
 173
 174 private:
 175
 176     /**
 177      * Parse the given rules, in the given direction.
 178      * @param rules      the rules to parse (input; passed by const reference).
 179      * @param direction  either FORWARD or REVERSE.
 180      */
 181     void parseRules(const UnicodeString& rules,
 182                     UTransDirection direction,
 183                     UErrorCode& status);
 184
 185     /**
 186      * MAIN PARSER.  Parse the next rule in the given rule string, starting
 187      * at pos.  Return the index after the last character parsed.  Do not
 188      * parse characters at or after limit.
 189      *
 190      * Important:  The character at pos must be a non-whitespace character
 191      * that is not the comment character.
 192      *
 193      * This method handles quoting, escaping, and whitespace removal.  It
 194      * parses the end-of-rule character.  It recognizes context and cursor
 195      * indicators.  Once it does a lexical breakdown of the rule at pos, it
 196      * creates a rule object and adds it to our rule list.
 197      * @param rule       the rule string to parse (input; passed by const reference).
 198      * @param pos        the starting position.
 199      * @param limit      index past the last character of the rule.
 200      * @return           the index after the last character parsed.
 201      */
 202     int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
 203
 204     /**
 205      * Set the variable range to [start, end] (inclusive).
 206      * @param start    the start value of the range.
 207      * @param end      the end value of the range.
 208      */
 209     void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
 210
 211     /**
 212      * Assert that the given character is NOT within the variable range.
 213      * If it is, return FALSE.  This is necessary to ensure that the
 214      * variable range does not overlap characters used in a rule.
 215      * @param ch     the given character.
 216      * @return       True, if the given character is NOT within the variable range.
 217      */
 218     UBool checkVariableRange(UChar32 ch) const;
 219
 220     /**
 221      * Set the maximum backup to 'backup', in response to a pragma
 222      * statement.
 223      * @param backup    the new value to be set.
 224      */
 225     void pragmaMaximumBackup(int32_t backup);
 226
 227     /**
 228      * Begin normalizing all rules using the given mode, in response
 229      * to a pragma statement.
 230      * @param mode    the given mode.
 231      */
 232     void pragmaNormalizeRules(UNormalizationMode mode);
 233
 234     /**
 235      * Return true if the given rule looks like a pragma.
 236      * @param pos offset to the first non-whitespace character
 237      * of the rule.
 238      * @param limit index past the last character of the rule.
 239      * @return true if the given rule looks like a pragma.
 240      */
 241     static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
 242
 243     /**
 244      * Parse a pragma.  This method assumes resemblesPragma() has
 245      * already returned true.
 246      * @param pos offset to the first non-whitespace character
 247      * of the rule.
 248      * @param limit index past the last character of the rule.
 249      * @return the position index after the final ';' of the pragma,
 250      * or -1 on failure.
 251      */
 252     int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
 253
 254     /**
 255      * Called by main parser upon syntax error.  Search the rule string
 256      * for the probable end of the rule.  Of course, if the error is that
 257      * the end of rule marker is missing, then the rule end will not be found.
 258      * In any case the rule start will be correctly reported.
 259      * @param parseErrorCode error code.
 260      * @param msg error description.  NOTE(review): this parameter is unnamed in the declaration below -- confirm what callers pass.
 261      * @param start position of first character of current rule.
 262      * @return start position of first character of current rule.
 263      */
 264     int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
 265                         UErrorCode& status);
 266
 267     /**
 268      * Parse a UnicodeSet out, store it, and return the stand-in character
 269      * used to represent it.
 270      *
 271      * @param rule    the rule for UnicodeSet.
 272      * @param pos     the position in pattern at which to start parsing.
 273      * @return        the stand-in character used to represent it.
 274      */
 275     UChar parseSet(const UnicodeString& rule,
 276                    ParsePosition& pos,
 277                    UErrorCode& status);
 278
 279     /**
 280      * Generate and return a stand-in for a new UnicodeFunctor.  Store
 281      * the matcher (adopt it).
 282      * @param adopted the UnicodeFunctor to be adopted.
 283      * @return        a stand-in for a new UnicodeFunctor.
 284      */
 285     UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
 286
 287     /**
 288      * Return the standin for segment seg (1-based).
 289      * @param seg    the given segment.
 290      * @return       the standIn character for the given segment.
 291      */
 292     UChar getSegmentStandin(int32_t seg, UErrorCode& status);
 293
 294     /**
 295      * Set the object for segment seg (1-based).
 296      * @param seg      the given segment.
 297      * @param adopted  the StringMatcher to be adopted.
 298      */
 299     void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
 300
 301     /**
 302      * Return the stand-in for the dot set.  It is allocated the first
 303      * time and reused thereafter.
 304      * @return    the stand-in for the dot set.
 305      */
 306     UChar getDotStandIn(UErrorCode& status);
 307
 308     /**
 309      * Append the value of the given variable name to the given
 310      * UnicodeString.
 311      * @param name    the variable name to be appended.
 312      * @param buf     the given UnicodeString to append to.
 313      */
 314     void appendVariableDef(const UnicodeString& name,
 315                            UnicodeString& buf,
 316                            UErrorCode& status);
 317
 318     /**
 319      * Glue method to get around access restrictions in C++.
 320      */
 321     /*static Transliterator* createBasicInstance(const UnicodeString& id,
 322                                                const UnicodeString* canonID);*/
 323
 324     friend class RuleHalf;
 325
 326     // Disallowed methods; no impl.
 327     /**
 328      * Copy constructor
 329      */
 330     TransliteratorParser(const TransliteratorParser&);
 331
 332     /**
 333      * Assignment operator
 334      */
 335     TransliteratorParser& operator=(const TransliteratorParser&);
 336 };
 337
 338 U_NAMESPACE_END
 339
 340 #endif /* #ifdef __cplusplus */
 341
 342 /**
 343  * Strip/convert the following from the transliterator rules:
 344  * comments
 345  * newlines
 346  * white space at the beginning and end of a line
 347  * unescape \u notation
 348  *
 349  * The target buffer must be the same size as the source.
 350  * @internal
 351  */
 352 U_CAPI int32_t
 353 utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);
 354
 355 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 356
 357 #endif