//
// regexcmp.h
//
-// Copyright (C) 2002-2008, International Business Machines Corporation and others.
+// Copyright (C) 2002-2016, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains declarations for the class RegexCompile
#include "unicode/utypes.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
-#include "unicode/uobject.h"
-#include "unicode/uniset.h"
#include "unicode/parseerr.h"
+#include "unicode/uniset.h"
+#include "unicode/uobject.h"
+#include "unicode/utext.h"
#include "uhash.h"
#include "uvector.h"
+#include "uvectr32.h"
class RegexPattern;
-class RegexCompile : public UMemory {
+class U_I18N_API RegexCompile : public UMemory {
public:
enum {
RegexCompile(RegexPattern *rp, UErrorCode &e);
void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e);
-
+ void compile(UText *pat, UParseError &pp, UErrorCode &e);
+
virtual ~RegexCompile();
int32_t LoopOp);
UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier
void literalChar(UChar32 c); // Compile a literal char
- void fixLiterals(UBool split=FALSE); // Fix literal strings.
+ void fixLiterals(UBool split=FALSE); // Generate code for pending literal characters.
void insertOp(int32_t where); // Open up a slot for a new op in the
// generated code at the specified location.
- void emitONE_CHAR(UChar32 c); // EMit a ONE_CHAR op into the compiled code,
- // taking case mode into account.
+ void appendOp(int32_t op); // Append a new op to the compiled pattern.
+ void appendOp(int32_t type, int32_t val); // Build & append a new op to the compiled pattern.
+ int32_t buildOp(int32_t type, int32_t val); // Construct a new pcode instruction.
+ int32_t allocateData(int32_t size); // Allocate space in the matcher data area.
+ // Return index of the newly allocated data.
+ int32_t allocateStackData(int32_t size); // Allocate space in the match back-track stack frame.
+ // Return offset index in the frame.
int32_t minMatchLength(int32_t start,
int32_t end);
int32_t maxMatchLength(int32_t start,
UChar32 scanNamedChar();
UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated);
+public: // Public for testing only.
+ static void U_EXPORT2 findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars);
+private:
+
UErrorCode *fStatus;
RegexPattern *fRXPat;
//
// Data associated with low level character scanning
//
- int32_t fScanIndex; // Index of current character being processed
+ int64_t fScanIndex; // Index of current character being processed
// in the rule input string.
- int32_t fNextIndex; // Index of the next character, which
- // is the first character not yet scanned.
UBool fQuoteMode; // Scan is in a \Q...\E quoted region
UBool fInBackslashQuote; // Scan is between a '\' and the following char.
UBool fEOLComments; // When scan is just after '(?', inhibit #... to
// end of line comments, in favor of (?#...) comments.
- int32_t fLineNum; // Line number in input file.
- int32_t fCharNum; // Char position within the line.
+ int64_t fLineNum; // Line number in input file.
+ int64_t fCharNum; // Char position within the line.
UChar32 fLastChar; // Previous char, needed to count CR-LF
// as a single line, not two.
UChar32 fPeekChar; // Saved char, if we've scanned ahead.
// until last flag is scanned.
UBool fSetModeFlag; // true for (?ismx, false for (?-ismx
+ UnicodeString fLiteralChars; // Literal chars or strings from the pattern are accumulated here.
+ // Once completed, meaning that some non-literal pattern
+ // construct is encountered, the appropriate opcodes
+ // to match the literal will be generated, and this
+ // string will be cleared.
- int32_t fStringOpStart; // While a literal string is being scanned
- // holds the start index within RegexPattern.
- // fLiteralText where the string is being stored.
-
- int32_t fPatternLength; // Length of the input pattern string.
-
+ int64_t fPatternLength; // Length of the input pattern string.
+
UVector32 fParenStack; // parentheses stack. Each frame consists of
// the positions of compiled pattern operations
// needing fixup, followed by negative value. The
int32_t fMatchOpenParen; // The position in the compiled pattern
// of the slot reserved for a state save
// at the start of the most recently processed
- // parenthesized block.
+ // parenthesized block. Updated when processing
+ // a close to the location for the corresponding open.
+
int32_t fMatchCloseParen; // The position in the pattern of the first
// location after the most recently processed
// parenthesized block.
// -1 for the upper interval value means none
// was specified (unlimited occurrences.)
- int32_t fNameStartPos; // Starting position of a \N{NAME} name in a
+ int64_t fNameStartPos; // Starting position of a \N{NAME} name in a
// pattern, valid while remainder of name is
// scanned.
// needed when "-y" is scanned, and we need
// to turn "x-y" into a range.
+ UnicodeString *fCaptureName; // Named Capture, the group name is built up
+ // in this string while being scanned.
};
// Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions]