+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
//
// regexcmp.h
//
-// Copyright (C) 2002-2003, International Business Machines Corporation and others.
+// Copyright (C) 2002-2016, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains declarations for the class RegexCompile
#include "unicode/utypes.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
-#include "unicode/uobject.h"
-#include "unicode/uniset.h"
#include "unicode/parseerr.h"
+#include "unicode/uniset.h"
+#include "unicode/uobject.h"
+#include "unicode/utext.h"
#include "uhash.h"
#include "uvector.h"
+#include "uvectr32.h"
// class RegexCompile Contains the regular expression compiler.
//
//--------------------------------------------------------------------------------
-static const int kStackSize = 100; // The size of the state stack for
- // pattern parsing. Corresponds roughly
- // to the depth of parentheses nesting
- // that is allowed in the rules.
-
-enum EParseAction {dummy01, dummy02}; // Placeholder enum for the specifier for
- // actions that are specified in the
- // rule parsing state table.
struct RegexTableEl;
class RegexPattern;
-class RegexCompile : public UMemory {
+class U_I18N_API RegexCompile : public UMemory {
public:
+ enum {
+ kStackSize = 100 // The size of the state stack for
+ }; // pattern parsing. Corresponds roughly
+ // to the depth of parentheses nesting
+ // that is allowed in the rules.
+
struct RegexPatternChar {   // A single pattern code point paired with a "quoted" flag.
UChar32 fChar;              // The code point itself.
UBool fQuoted;              // NOTE(review): presumably true when the char was escaped or inside \Q...\E - confirm in the scanner.
};
RegexCompile(RegexPattern *rp, UErrorCode &e);
-
- void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e);
+ void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e);
+ void compile(UText *pat, UParseError &pp, UErrorCode &e);
+
virtual ~RegexCompile();
// determines the code to be generated when the matching close ) is encountered.
enum EParenClass {
plain = -1, // No special handling
- capturing = -2,
+ capturing = -2,
atomic = -3,
lookAhead = -4,
negLookAhead = -5,
private:
- UBool doParseActions(EParseAction a);
+ UBool doParseActions(int32_t a);
void error(UErrorCode e); // error reporting convenience function.
UChar32 nextCharLL();
UChar32 peekCharLL();
- UnicodeSet *scanSet();
UnicodeSet *scanProp();
+ UnicodeSet *scanPosixProp();
void handleCloseParen();
int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern
// at the top of the just completed block
int32_t LoopOp);
UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier
void literalChar(UChar32 c); // Compile a literal char
- void fixLiterals(UBool split=FALSE); // Fix literal strings.
+ void fixLiterals(UBool split=FALSE); // Generate code for pending literal characters.
void insertOp(int32_t where); // Open up a slot for a new op in the
// generated code at the specified location.
- void emitONE_CHAR(UChar32 c); // EMit a ONE_CHAR op into the compiled code,
- // taking case mode into account.
+ void appendOp(int32_t op); // Append a new op to the compiled pattern.
+ void appendOp(int32_t type, int32_t val); // Build & append a new op to the compiled pattern.
+ int32_t buildOp(int32_t type, int32_t val); // Construct a new pcode instruction.
+ int32_t allocateData(int32_t size); // Allocate space in the matcher data area.
+ // Return index of the newly allocated data.
+ int32_t allocateStackData(int32_t size); // Allocate space in the match back-track stack frame.
+ // Return offset index in the frame.
int32_t minMatchLength(int32_t start,
int32_t end);
int32_t maxMatchLength(int32_t start,
int32_t end);
void matchStartType();
void stripNOPs();
- void OptDotStar();
+
+ void setEval(int32_t op);
+ void setPushOp(int32_t op);
+ UChar32 scanNamedChar();
+ UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated);
+
+public: // Public for testing only.
+ static void U_EXPORT2 findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars);
+private:
UErrorCode *fStatus;
//
// Data associated with low level character scanning
//
- int32_t fScanIndex; // Index of current character being processed
+ int64_t fScanIndex; // Index of current character being processed
// in the rule input string.
- int32_t fNextIndex; // Index of the next character, which
- // is the first character not yet scanned.
UBool fQuoteMode; // Scan is in a \Q...\E quoted region
UBool fInBackslashQuote; // Scan is between a '\' and the following char.
- UBool fEOLComments; // When scan is just after '(?', inhibit #... to
+ UBool fEOLComments; // When scan is just after '(?', inhibit #... to
// end of line comments, in favor of (?#...) comments.
- int fLineNum; // Line number in input file.
- int fCharNum; // Char position within the line.
+ int64_t fLineNum; // Line number in input file.
+ int64_t fCharNum; // Char position within the line.
UChar32 fLastChar; // Previous char, needed to count CR-LF
// as a single line, not two.
UChar32 fPeekChar; // Saved char, if we've scanned ahead.
// parsing. index by p[state][char-class]
uint16_t fStack[kStackSize]; // State stack, holds state pushes
- int fStackPtr; // and pops as specified in the state
+ int32_t fStackPtr; // and pops as specified in the state
// transition rules.
//
// Data associated with the generation of the pcode for the match engine
//
int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.)
+ // Always has high bit (31) set so that flag values
+ // on the paren stack are distinguished from relocatable
+ // pcode addresses.
int32_t fNewModeFlags; // New flags, while compiling (?i, holds state
// until last flag is scanned.
UBool fSetModeFlag; // true for (?ismx, false for (?-ismx
+ UnicodeString fLiteralChars; // Literal chars or strings from the pattern are accumulated here.
+ // Once completed, meaning that some non-literal pattern
+ // construct is encountered, the appropriate opcodes
+ // to match the literal will be generated, and this
+ // string will be cleared.
- int32_t fStringOpStart; // While a literal string is being scanned
- // holds the start index within RegexPattern.
- // fLiteralText where the string is being stored.
-
- int32_t fPatternLength; // Length of the input pattern string.
-
+ int64_t fPatternLength; // Length of the input pattern string.
+
UVector32 fParenStack; // parentheses stack. Each frame consists of
// the positions of compiled pattern operations
- // needing fixup, followed by negative value. The
+ // needing fixup, followed by negative value. The
// first entry in each frame is the position of the
// spot reserved for use when a quantifier
// needs to add a SAVE at the start of a (block)
int32_t fMatchOpenParen; // The position in the compiled pattern
// of the slot reserved for a state save
// at the start of the most recently processed
- // parenthesized block.
+ // parenthesized block. Updated when processing
+ // a close to the location for the corresponding open.
+
int32_t fMatchCloseParen; // The position in the pattern of the first
// location after the most recently processed
// parenthesized block.
// -1 for the upper interval value means none
// was specified (unlimited occurrences.)
- int32_t fNameStartPos; // Starting position of a \N{NAME} name in a
+ int64_t fNameStartPos; // Starting position of a \N{NAME} name in a
// pattern, valid while remainder of name is
// scanned.
+
+ UStack fSetStack; // Stack of UnicodeSets, used while evaluating
+ // (at compile time) set expressions within
+ // the pattern.
+ UStack fSetOpStack; // Stack of pending set operators (&&, --, union)
+
+ UChar32 fLastSetLiteral; // The last single code point added to a set.
+ // needed when "-y" is scanned, and we need
+ // to turn "x-y" into a range.
+
+ UnicodeString *fCaptureName; // Named Capture, the group name is built up
+ // in this string while being scanned.
};
+// Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions]
+// The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself.
+
+enum SetOperations {   // Each value is (precedence << 16) | operation code.
+ setStart = 0 << 16 | 1,   // precedence 0, operation code 1
+ setEnd = 1 << 16 | 2,   // precedence 1, operation code 2
+ setNegation = 2 << 16 | 3,   // precedence 2, operation code 3
+ setCaseClose = 2 << 16 | 9,   // precedence 2 (same as setNegation), operation code 9
+ setDifference2 = 3 << 16 | 4, // '--' set difference operator
+ setIntersection2 = 3 << 16 | 5, // '&&' set intersection operator
+ setUnion = 4 << 16 | 6, // implicit union of adjacent items
+ setDifference1 = 4 << 16 | 7, // '-', single dash difference op, for compatibility with old UnicodeSet.
+ setIntersection1 = 4 << 16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet.
+ };
+
U_NAMESPACE_END
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
#endif // RBBISCAN_H