ICU-64260.0.1.tar.gz

[apple/icu.git] / icuSources / i18n / regeximp.h
diff --git a/icuSources/i18n/regeximp.h b/icuSources/i18n/regeximp.h

index 5446e88f407d5abc6a90aede2c57aba9ae4c54e1..51db88216789b919b1bd7728d550676e2c46c760 100644 (file)
--- a/icuSources/i18n/regeximp.h
+++ b/icuSources/i18n/regeximp.h
@@ -1,6 +1,8 @@
-// 
-//   Copyright (C) 2002-2004 International Business Machines Corporation 
-//   and others. All rights reserved.  
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+//
+//   Copyright (C) 2002-2015 International Business Machines Corporation
+//   and others. All rights reserved.
  //
  //   file:  regeximp.h
  //
@@ -12,11 +14,21 @@
  #ifndef _REGEXIMP_H
  #define _REGEXIMP_H
  
+#include "unicode/utypes.h"
+#include "unicode/uobject.h"
+#include "unicode/uniset.h"
+#include "unicode/utext.h"
+
  #include "cmemory.h"
+#include "ucase.h"
  
  U_NAMESPACE_BEGIN
  
-#ifdef REGEX_DEBUG   /* For debugging, define REGEX_DEBUG in regex.h, not here in this file. */
+// For debugging, define REGEX_DEBUG
+// To define with configure,
+//   CPPFLAGS="-DREGEX_DEBUG" ./runConfigureICU --enable-debug --disable-release Linux 
+
+#ifdef REGEX_DEBUG
  //
  //  debugging options.  Enable one or more of the three #defines immediately following
  //
@@ -36,19 +48,6 @@ U_NAMESPACE_BEGIN
  #define REGEX_SCAN_DEBUG_PRINTF(a)
  #endif
  
-#ifdef REGEX_DUMP_DEBUG
-#define REGEX_DUMP_DEBUG_PRINTF(a) printf a
-#else
-#define REGEX_DUMP_DEBUG_PRINTF(a)
-#endif
-
-#ifdef REGEX_RUN_DEBUG
-#define REGEX_RUN_DEBUG_PRINTF(a) printf a
-#define REGEX_DUMP_DEBUG_PRINTF(a) printf a
-#else
-#define REGEX_RUN_DEBUG_PRINTF(a)
-#endif
-
  
  //
  //  Opcode types     In the compiled form of the regexp, these are the type, or opcodes,
@@ -57,7 +56,7 @@ U_NAMESPACE_BEGIN
  enum {
       URX_RESERVED_OP   = 0,    // For multi-operand ops, most non-first words.
       URX_RESERVED_OP_N = 255,  // For multi-operand ops, negative operand values.
-     URX_BACKTRACK     = 1,
+     URX_BACKTRACK     = 1,    // Force a backtrack, as if a match test had failed.
       URX_END           = 2,
       URX_ONECHAR       = 3,    // Value field is the 21 bit unicode char to match
       URX_STRING        = 4,    // Value field is index of string start
@@ -66,16 +65,16 @@ enum {
       URX_NOP           = 7,
       URX_START_CAPTURE = 8,    // Value field is capture group number.
       URX_END_CAPTURE   = 9,    // Value field is capture group number
-     URX_STATIC_SETREF = 10,   // Value field is index of set in array of sets.   
+     URX_STATIC_SETREF = 10,   // Value field is index of set in array of sets.
       URX_SETREF        = 11,   // Value field is index of set in array of sets.
-     URX_DOTANY        = 12, 
+     URX_DOTANY        = 12,
       URX_JMP           = 13,   // Value field is destination position in
                                                      //   the pattern.
       URX_FAIL          = 14,   // Stop match operation,  No match.
  
       URX_JMP_SAV       = 15,   // Operand:  JMP destination location
       URX_BACKSLASH_B   = 16,   // Value field:  0:  \b    1:  \B
-     URX_BACKSLASH_G   = 17, 
+     URX_BACKSLASH_G   = 17,
       URX_JMP_SAV_X     = 18,   // Conditional JMP_SAV,
                                 //    Used in (x)+, breaks loop on zero length match.
                                 //    Operand:  Jmp destination.
@@ -88,21 +87,22 @@ enum {
       URX_DOLLAR        = 24,  // Also for \Z
  
       URX_CTR_INIT      = 25,   // Counter Inits for {Interval} loops.
-     URX_CTR_INIT_NG   = 26,   //   3 kinds, normal, non-greedy, and possessive.
+     URX_CTR_INIT_NG   = 26,   //   2 kinds, normal and non-greedy.
                                 //   These are 4 word opcodes.  See description.
                                 //    First Operand:  Data loc of counter variable
-                               //    2nd   Operand:  Pat loc of the URX_CTR_LOOPx 
+                               //    2nd   Operand:  Pat loc of the URX_CTR_LOOPx
                                 //                    at the end of the loop.
                                 //    3rd   Operand:  Minimum count.
                                 //    4th   Operand:  Max count, -1 for unbounded.
  
-     URX_DOTANY_PL     = 27,   // .+, match rest of the line.  Fail already at end.
+     URX_DOTANY_UNIX   = 27,   // '.' operator in UNIX_LINES mode, only \n marks end of line.
  
       URX_CTR_LOOP      = 28,   // Loop Ops for {interval} loops.
       URX_CTR_LOOP_NG   = 29,   //   Also in three flavors.
                                 //   Operand is loc of corresponding CTR_INIT.
  
-     URX_DOTANY_ALL_PL = 30,   // .+, match rest of the Input.  Fail if already at end
+     URX_CARET_M_UNIX  = 30,   // '^' operator, test for start of line in multi-line
+                               //      plus UNIX_LINES mode.
  
       URX_RELOC_OPRND   = 31,   // Operand value in multi-operand ops that refers
                                 //   back into compiled pattern code, and thus must
@@ -118,7 +118,7 @@ enum {
                                 //   within the matcher stack frame.
       URX_JMPX          = 36,  // Conditional JMP.
                                 //   First Operand:  JMP target location.
-                               //   Second Operand:  Data location containing an 
+                               //   Second Operand:  Data location containing an
                                 //     input position.  If current input position ==
                                 //     saved input position, FAIL rather than taking
                                 //     the JMP
@@ -157,7 +157,7 @@ enum {
       URX_LBN_END       = 48,   // Negative LookBehind end
                                 //   Parameter is the data location.
                                 //   Check that the match ended at the right spot.
-     URX_STAT_SETREF_N = 49,   // Reference to a prebuilt set (e.g. \w), negated  
+     URX_STAT_SETREF_N = 49,   // Reference to a prebuilt set (e.g. \w), negated
                                 //   Operand is index of set in array of sets.
       URX_LOOP_SR_I     = 50,   // Init a [set]* loop.
                                 //   Operand is the sets index in array of user sets.
@@ -166,12 +166,21 @@ enum {
                                 //   Must always immediately follow  LOOP_x_I instruction.
       URX_LOOP_DOT_I    = 52,   // .*, initialization of the optimized loop.
                                 //   Operand value:
-                               //      0:  Normal (. doesn't match new-line) mode.
-                               //      1:  . matches new-line mode.
-     URX_BACKSLASH_BU  = 53    // \b or \B in UREGEX_UWORD mode, using Unicode style
+                               //      bit 0:
+                               //         0:  Normal (. doesn't match new-line) mode.
+                               //         1:  . matches new-line mode.
+                               //      bit 1:  controls what new-lines are recognized by this operation.
+                               //         0:  All Unicode New-lines
+                               //         1:  UNIX_LINES, \u000a only.
+     URX_BACKSLASH_BU  = 53,   // \b or \B in UREGEX_UWORD mode, using Unicode style
                                 //   word boundaries.
+     URX_DOLLAR_D      = 54,   // $ end of input test, in UNIX_LINES mode.
+     URX_DOLLAR_MD     = 55,   // $ end of input test, in MULTI_LINE and UNIX_LINES mode.
+     URX_BACKSLASH_H   = 56,   // Value field:  0:  \h    1:  \H
+     URX_BACKSLASH_R   = 57,   // Any line break sequence.
+     URX_BACKSLASH_V   = 58    // Value field:  0:  \v    1:  \V
  
-};           
+};
  
  // Keep this list of opcode names in sync with the above enum
  //   Used for debug printing only.
@@ -203,10 +212,10 @@ enum {
          "DOLLAR",              \
          "CTR_INIT",            \
          "CTR_INIT_NG",         \
-        "DOTANY_PL",           \
+        "DOTANY_UNIX",         \
          "CTR_LOOP",            \
          "CTR_LOOP_NG",         \
-        "DOTANY_ALL_PL",       \
+        "URX_CARET_M_UNIX",    \
          "RELOC_OPRND",         \
          "STO_SP",              \
          "LD_SP",               \
@@ -229,21 +238,25 @@ enum {
          "LOOP_SR_I",           \
          "LOOP_C",              \
          "LOOP_DOT_I",          \
-        "BACKSLASH_BU"
+        "BACKSLASH_BU",        \
+        "DOLLAR_D",            \
+        "DOLLAR_MD",           \
+        "URX_BACKSLASH_H",     \
+        "URX_BACKSLASH_R",     \
+        "URX_BACKSLASH_V" 
  
  
  //
  //  Convenience macros for assembling and disassembling a compiled operation.
  //
-#define URX_BUILD(type, val) (int32_t)((type << 24) | (val))
-#define URX_TYPE(x)          ((uint32_t)(x) >> 24) 
+#define URX_TYPE(x)          ((uint32_t)(x) >> 24)
  #define URX_VAL(x)           ((x) & 0xffffff)
  
-                
+
  //
  //  Access to Unicode Sets composite character properties
  //     The sets are accessed by the match engine for things like \w (word boundary)
-//     
+//
  enum {
       URX_ISWORD_SET  = 1,
       URX_ISALNUM_SET = 2,
@@ -270,12 +283,18 @@ enum {
  //  Match Engine State Stack Frame Layout.
  //
  struct REStackFrame {
-    int32_t            fInputIdx;        // Position of next character in the input string
-    int32_t            fPatIdx;          // Position of next Op in the compiled pattern
-    int32_t            fExtra[2];        // Extra state, for capture group start/ends
+    // Header
+    int64_t            fInputIdx;        // Position of next character in the input string
+    int64_t            fPatIdx;          // Position of next Op in the compiled pattern
+                                         // (int64_t for UVector64, values fit in an int32_t)
+    // Remainder
+    int64_t            fExtra[1];        // Extra state, for capture group start/ends
                                           //   atomic parentheses, repeat counts, etc.
                                           //   Locations assigned at pattern compile time.
+                                         //   Variable-length array.
  };
+// number of UVector elements in the header
+#define RESTACKFRAME_HDRCOUNT 2
  
  //
  //  Start-Of-Match type.  Used by find() to quickly scan to positions where a
@@ -297,7 +316,6 @@ enum StartOfMatch {
                                 (v)==START_LINE?    "START_LINE"    : \
                                 (v)==START_STRING?  "START_STRING"  : \
                                                     "ILLEGAL")
-    
  
  //
  //  8 bit set, to fast-path latin-1 set membership tests.
@@ -318,15 +336,15 @@ inline Regex8BitSet::Regex8BitSet() {
  inline UBool Regex8BitSet::contains(UChar32 c) {
      // No bounds checking!  This is deliberate.
      return ((d[c>>3] & 1 <<(c&7)) != 0);
-};
+}
  
  inline void  Regex8BitSet::add(UChar32 c) {
      d[c>>3] |= 1 << (c&7);
-};
+}
  
  inline void Regex8BitSet::init(const UnicodeSet *s) {
      if (s != NULL) {
-        for (int i=0; i<=255; i++) {
+        for (int32_t i=0; i<=255; i++) {
              if (s->contains(i)) {
                  this->add(i);
              }
@@ -339,6 +357,58 @@ inline void Regex8BitSet::operator = (const Regex8BitSet &s) {
  }
  
  
+//  Case folded UText Iterator helper class.
+//  Wraps a UText, provides a case-folded enumeration over its contents.
+//  Used in implementing case insensitive matching constructs.
+//  Implementation in rematch.cpp
+
+class CaseFoldingUTextIterator: public UMemory {
+      public:
+        CaseFoldingUTextIterator(UText &text);
+        ~CaseFoldingUTextIterator();
+
+        UChar32 next();           // Next case folded character
+
+        UBool   inExpansion();    // True if last char returned from next() and the
+                                  //  next to be returned both originated from a string
+                                  //  folding of the same code point from the original UText.
+      private:
+        UText             &fUText;
+        const  UChar      *fFoldChars;
+        int32_t            fFoldLength;
+        int32_t            fFoldIndex;
+
+};
+
+
+// Case folded UChar * string iterator.
+//  Wraps a UChar  *, provides a case-folded enumeration over its contents.
+//  Used in implementing case insensitive matching constructs.
+//  Implementation in rematch.cpp
+
+class CaseFoldingUCharIterator: public UMemory {
+      public:
+        CaseFoldingUCharIterator(const UChar *chars, int64_t start, int64_t limit);
+        ~CaseFoldingUCharIterator();
+
+        UChar32 next();           // Next case folded character
+
+        UBool   inExpansion();    // True if last char returned from next() and the
+                                  //  next to be returned both originated from a string
+                                  //  folding of the same code point from the original UChar string.
+
+        int64_t  getIndex();      // Return the current input buffer index.
+
+      private:
+        const  UChar      *fChars;
+        int64_t            fIndex;
+        int64_t            fLimit;
+        const  UChar      *fFoldChars;
+        int32_t            fFoldLength;
+        int32_t            fFoldIndex;
+
+};
+
  U_NAMESPACE_END
  #endif