]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/i18n/regexcmp.h
ICU-57131.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / regexcmp.h
index 94c83b63088407e64c1aa8b23e645eec798fc2cf..f6bdeafd031ee851a8539813fe3e6b080ed9638c 100644 (file)
@@ -1,7 +1,7 @@
 //
 //  regexcmp.h
 //
-//  Copyright (C) 2002-2008, International Business Machines Corporation and others.
+//  Copyright (C) 2002-2016, International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
 //  This file contains declarations for the class RegexCompile
 #include "unicode/utypes.h"
 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
 
-#include "unicode/uobject.h"
-#include "unicode/uniset.h"
 #include "unicode/parseerr.h"
+#include "unicode/uniset.h"
+#include "unicode/uobject.h"
+#include "unicode/utext.h"
 #include "uhash.h"
 #include "uvector.h"
+#include "uvectr32.h"
 
 
 
@@ -37,7 +39,7 @@ struct  RegexTableEl;
 class   RegexPattern;
 
 
-class RegexCompile : public UMemory {
+class U_I18N_API RegexCompile : public UMemory {
 public:
 
     enum {
@@ -54,7 +56,8 @@ public:
     RegexCompile(RegexPattern *rp, UErrorCode &e);
 
     void       compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e);
-
+    void       compile(UText *pat, UParseError &pp, UErrorCode &e);
+    
 
     virtual    ~RegexCompile();
 
@@ -99,11 +102,16 @@ private:
                                int32_t LoopOp);
     UBool       compileInlineInterval();             // Generate inline code for a {min,max} quantifier
     void        literalChar(UChar32 c);              // Compile a literal char
-    void        fixLiterals(UBool split=FALSE);      // Fix literal strings.
+    void        fixLiterals(UBool split=FALSE);      // Generate code for pending literal characters.
     void        insertOp(int32_t where);             // Open up a slot for a new op in the
                                                      //   generated code at the specified location.
-    void        emitONE_CHAR(UChar32 c);             // EMit a ONE_CHAR op into the compiled code,
-                                                     //   taking case mode into account.
+    void        appendOp(int32_t op);                // Append a new op to the compiled pattern.
+    void        appendOp(int32_t type, int32_t val); // Build & append a new op to the compiled pattern.
+    int32_t     buildOp(int32_t type, int32_t val);  // Construct a new pcode instruction.
+    int32_t     allocateData(int32_t size);          // Allocate space in the matcher data area.
+                                                     //   Return index of the newly allocated data.
+    int32_t     allocateStackData(int32_t size);     // Allocate space in the match back-track stack frame.
+                                                     //   Return offset index in the frame.
     int32_t     minMatchLength(int32_t start,
                                int32_t end);
     int32_t     maxMatchLength(int32_t start,
@@ -116,6 +124,10 @@ private:
     UChar32     scanNamedChar();
     UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated);
 
+public:   // Public for testing only.
+    static void U_EXPORT2 findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars);
+private:
+
 
     UErrorCode                    *fStatus;
     RegexPattern                  *fRXPat;
@@ -124,16 +136,14 @@ private:
     //
     //  Data associated with low level character scanning
     //
-    int32_t                       fScanIndex;        // Index of current character being processed
+    int64_t                       fScanIndex;        // Index of current character being processed
                                                      //   in the rule input string.
-    int32_t                       fNextIndex;        // Index of the next character, which
-                                                     //   is the first character not yet scanned.
     UBool                         fQuoteMode;        // Scan is in a \Q...\E quoted region
     UBool                         fInBackslashQuote; // Scan is between a '\' and the following char.
     UBool                         fEOLComments;      // When scan is just after '(?',  inhibit #... to
                                                      //   end of line comments, in favor of (?#...) comments.
-    int32_t                       fLineNum;          // Line number in input file.
-    int32_t                       fCharNum;          // Char position within the line.
+    int64_t                       fLineNum;          // Line number in input file.
+    int64_t                       fCharNum;          // Char position within the line.
     UChar32                       fLastChar;         // Previous char, needed to count CR-LF
                                                      //   as a single line, not two.
     UChar32                       fPeekChar;         // Saved char, if we've scanned ahead.
@@ -163,13 +173,14 @@ private:
                                                      //   until last flag is scanned.
     UBool                         fSetModeFlag;      // true for (?ismx, false for (?-ismx
 
+    UnicodeString                 fLiteralChars;     // Literal chars or strings from the pattern are accumulated here.
+                                                     //   Once completed, meaning that some non-literal pattern
+                                                     //   construct is encountered, the appropriate opcodes
+                                                     //   to match the literal will be generated, and this
+                                                     //   string will be cleared.
 
-    int32_t                       fStringOpStart;    // While a literal string is being scanned
-                                                     //   holds the start index within RegexPattern.
-                                                     //   fLiteralText where the string is being stored.
-
-    int32_t                       fPatternLength;    // Length of the input pattern string.
-
+    int64_t                       fPatternLength;    // Length of the input pattern string.
+    
     UVector32                     fParenStack;       // parentheses stack.  Each frame consists of
                                                      //   the positions of compiled pattern operations
                                                      //   needing fixup, followed by negative value.  The
@@ -184,7 +195,9 @@ private:
     int32_t                       fMatchOpenParen;   // The position in the compiled pattern
                                                      //   of the slot reserved for a state save
                                                      //   at the start of the most recently processed
-                                                     //   parenthesized block.
+                                                     //   parenthesized block. Updated when processing
+                                                     //   a close to the location for the corresponding open.
+
     int32_t                       fMatchCloseParen;  // The position in the pattern of the first
                                                      //   location after the most recently processed
                                                      //   parenthesized block.
@@ -196,7 +209,7 @@ private:
                                                      //   -1 for the upper interval value means none
                                                      //   was specified (unlimited occurrences.)
 
-    int32_t                       fNameStartPos;     // Starting position of a \N{NAME} name in a
+    int64_t                       fNameStartPos;     // Starting position of a \N{NAME} name in a
                                                      //   pattern, valid while remainder of name is
                                                      //   scanned.
 
@@ -209,6 +222,8 @@ private:
                                                      //   needed when "-y" is scanned, and we need
                                                      //   to turn "x-y" into a range.
 
+    UnicodeString                *fCaptureName;      // Named Capture, the group name is built up
+                                                     //   in this string while being scanned.
 };
 
 // Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions]