]>
git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/regeximp.h
   2 //   Copyright (C) 2002-2012 International Business Machines Corporation 
   3 //   and others. All rights reserved. 
   7 //           ICU Regular Expressions, 
   8 //               Definitions of constant values used in the compiled form of 
   9 //               a regular expression pattern. 
  15 #include "unicode/utypes.h" 
  16 #include "unicode/uobject.h" 
  17 #include "unicode/uniset.h" 
  18 #include "unicode/utext.h" 
  25 // For debugging, define REGEX_DEBUG  
  26 // To define with configure, 
  27 //   ./runConfigureICU --enable-debug --disable-release Linux CPPFLAGS="-DREGEX_DEBUG" 
  31 //  debugging options.  Enable one or more of the three #defines immediately following 
  34 //#define REGEX_SCAN_DEBUG 
  35 #define REGEX_DUMP_DEBUG 
  36 #define REGEX_RUN_DEBUG 
  38 //  End of #defines intended to be directly set. 
  43 #ifdef REGEX_SCAN_DEBUG 
  44 #define REGEX_SCAN_DEBUG_PRINTF(a) printf a 
  46 #define REGEX_SCAN_DEBUG_PRINTF(a) 
  49 #ifdef REGEX_DUMP_DEBUG 
  50 #define REGEX_DUMP_DEBUG_PRINTF(a) printf a 
  52 #define REGEX_DUMP_DEBUG_PRINTF(a) 
  55 #ifdef REGEX_RUN_DEBUG 
  56 #define REGEX_RUN_DEBUG_PRINTF(a) printf a 
  57 #define REGEX_DUMP_DEBUG_PRINTF(a) printf a 
  59 #define REGEX_RUN_DEBUG_PRINTF(a) 
  64 //  Opcode types     In the compiled form of the regexp, these are the type, or opcodes, 
  68      URX_RESERVED_OP   
= 0,    // For multi-operand ops, most non-first words. 
  69      URX_RESERVED_OP_N 
= 255,  // For multi-operand ops, negative operand values. 
  70      URX_BACKTRACK     
= 1,    // Force a backtrack, as if a match test had failed. 
  72      URX_ONECHAR       
= 3,    // Value field is the 21 bit unicode char to match 
  73      URX_STRING        
= 4,    // Value field is index of string start 
  74      URX_STRING_LEN    
= 5,    // Value field is string length (code units) 
  75      URX_STATE_SAVE    
= 6,    // Value field is pattern position to push 
  77      URX_START_CAPTURE 
= 8,    // Value field is capture group number. 
  78      URX_END_CAPTURE   
= 9,    // Value field is capture group number 
  79      URX_STATIC_SETREF 
= 10,   // Value field is index of set in array of sets. 
  80      URX_SETREF        
= 11,   // Value field is index of set in array of sets. 
  82      URX_JMP           
= 13,   // Value field is destination position in 
  84      URX_FAIL          
= 14,   // Stop match operation,  No match. 
  86      URX_JMP_SAV       
= 15,   // Operand:  JMP destination location 
  87      URX_BACKSLASH_B   
= 16,   // Value field:  0:  \b    1:  \B 
  89      URX_JMP_SAV_X     
= 18,   // Conditional JMP_SAV, 
  90                                //    Used in (x)+, breaks loop on zero length match. 
  91                                //    Operand:  Jmp destination. 
  93      URX_BACKSLASH_Z   
= 20,   // \z   Unconditional end of line. 
  95      URX_DOTANY_ALL    
= 21,   // ., in the . matches any mode. 
  96      URX_BACKSLASH_D   
= 22,   // Value field:  0:  \d    1:  \D 
  97      URX_CARET         
= 23,   // Value field:  1:  multi-line mode. 
  98      URX_DOLLAR        
= 24,  // Also for \Z 
 100      URX_CTR_INIT      
= 25,   // Counter Inits for {Interval} loops. 
 101      URX_CTR_INIT_NG   
= 26,   //   2 kinds, normal and non-greedy. 
 102                                //   These are 4 word opcodes.  See description. 
 103                                //    First Operand:  Data loc of counter variable 
 104                                //    2nd   Operand:  Pat loc of the URX_CTR_LOOPx 
 105                                //                    at the end of the loop. 
 106                                //    3rd   Operand:  Minimum count. 
 107                                //    4th   Operand:  Max count, -1 for unbounded. 
 109      URX_DOTANY_UNIX   
= 27,   // '.' operator in UNIX_LINES mode, only \n marks end of line. 
 111      URX_CTR_LOOP      
= 28,   // Loop Ops for {interval} loops. 
 112      URX_CTR_LOOP_NG   
= 29,   //   Also in two flavors, normal and non-greedy. 
 113                                //   Operand is loc of corresponding CTR_INIT. 
 115      URX_CARET_M_UNIX  
= 30,   // '^' operator, test for start of line in multi-line 
 116                                //      plus UNIX_LINES mode. 
 118      URX_RELOC_OPRND   
= 31,   // Operand value in multi-operand ops that refers 
 119                                //   back into compiled pattern code, and thus must 
 120                                //   be relocated when inserting/deleting ops in code. 
 122      URX_STO_SP        
= 32,   // Store the stack ptr.  Operand is location within 
 123                                //   matcher data (not stack data) to store it. 
 124      URX_LD_SP         
= 33,   // Load the stack pointer.  Operand is location 
 126      URX_BACKREF       
= 34,   // Back Reference.  Parameter is the index of the 
 127                                //   capture group variables in the state stack frame. 
 128      URX_STO_INP_LOC   
= 35,   // Store the input location.  Operand is location 
 129                                //   within the matcher stack frame. 
 130      URX_JMPX          
= 36,  // Conditional JMP. 
 131                                //   First Operand:  JMP target location. 
 132                                //   Second Operand:  Data location containing an 
 133                                //     input position.  If current input position == 
 134                                //     saved input position, FAIL rather than taking 
 136      URX_LA_START      
= 37,   // Starting a LookAround expression. 
 137                                //   Save InputPos and SP in static data. 
 138                                //   Operand:  Static data offset for the save 
 139      URX_LA_END        
= 38,   // Ending a Lookaround expression. 
 140                                //   Restore InputPos and Stack to saved values. 
 141                                //   Operand:  Static data offset for saved data. 
 142      URX_ONECHAR_I     
= 39,   // Test for case-insensitive match of a literal character. 
 143                                //   Operand:  the literal char. 
 144      URX_STRING_I      
= 40,   // Case insensitive string compare. 
 145                                //   First Operand:  Index of start of string in string literals 
 146                                //   Second Operand (next word in compiled code): 
 147                                //     the length of the string. 
 148      URX_BACKREF_I     
= 41,   // Case insensitive back reference. 
 149                                //   Parameter is the index of the 
 150                                //   capture group variables in the state stack frame. 
 151      URX_DOLLAR_M      
= 42,   // $ in multi-line mode. 
 152      URX_CARET_M       
= 43,   // ^ in multi-line mode. 
 153      URX_LB_START      
= 44,   // LookBehind Start. 
 154                                //   Parameter is data location 
 155      URX_LB_CONT       
= 45,   // LookBehind Continue. 
 156                                //   Param 0:  the data location 
 157                                //   Param 1:  The minimum length of the look-behind match 
 158                                //   Param 2:  The max length of the look-behind match 
 159      URX_LB_END        
= 46,   // LookBehind End. 
 160                                //   Parameter is the data location. 
 161                                //     Check that match ended at the right spot, 
 162                                //     Restore original input string len. 
 163      URX_LBN_CONT      
= 47,   // Negative LookBehind Continue 
 164                                //   Param 0:  the data location 
 165                                //   Param 1:  The minimum length of the look-behind match 
 166                                //   Param 2:  The max     length of the look-behind match 
 167                                //   Param 3:  The pattern loc following the look-behind block. 
 168      URX_LBN_END       
= 48,   // Negative LookBehind end 
 169                                //   Parameter is the data location. 
 170                                //   Check that the match ended at the right spot. 
 171      URX_STAT_SETREF_N 
= 49,   // Reference to a prebuilt set (e.g. \w), negated 
 172                                //   Operand is index of set in array of sets. 
 173      URX_LOOP_SR_I     
= 50,   // Init a [set]* loop. 
 174                                //   Operand is the sets index in array of user sets. 
 175      URX_LOOP_C        
= 51,   // Continue a [set]* or OneChar* loop. 
 176                                //   Operand is a matcher static data location. 
 177                                //   Must always immediately follow  LOOP_x_I instruction. 
 178      URX_LOOP_DOT_I    
= 52,   // .*, initialization of the optimized loop. 
 181                                //         0:  Normal (. doesn't match new-line) mode. 
 182                                //         1:  . matches new-line mode. 
 183                                //      bit 1:  controls what new-lines are recognized by this operation. 
 184                                //         0:  All Unicode New-lines 
 185                                //         1:  UNIX_LINES, \u000a only. 
 186      URX_BACKSLASH_BU  
= 53,   // \b or \B in UREGEX_UWORD mode, using Unicode style 
 188      URX_DOLLAR_D      
= 54,   // $ end of input test, in UNIX_LINES mode. 
 189      URX_DOLLAR_MD     
= 55    // $ end of input test, in MULTI_LINE and UNIX_LINES mode. 
 193 // Keep this list of opcode names in sync with the above enum 
 194 //   Used for debug printing only. 
 195 #define URX_OPCODE_NAMES       \ 
 206         "URX_STATIC_SETREF",   \ 
 226         "URX_CARET_M_UNIX",    \ 
 255 //  Convenience macros for assembling and disassembling a compiled operation. 
 257 #define URX_BUILD(type, val) (int32_t)((type << 24) | (val)) 
 258 #define URX_TYPE(x)          ((uint32_t)(x) >> 24) 
 259 #define URX_VAL(x)           ((x) & 0xffffff) 
 263 //  Access to Unicode Sets composite character properties 
 264 //     The sets are accessed by the match engine for things like \w (word characters) 
 272      URX_GC_NORMAL
,          // Sets for finding grapheme cluster boundaries. 
 283      URX_NEG_SET     
= 0x800000          // Flag bit to reverse sense of set 
 289 //  Match Engine State Stack Frame Layout. 
 291 struct REStackFrame 
{ 
 293     int64_t            fInputIdx
;        // Position of next character in the input string 
 294     int64_t            fPatIdx
;          // Position of next Op in the compiled pattern 
 295                                          // (int64_t for UVector64, values fit in an int32_t) 
 297     int64_t            fExtra
[1];        // Extra state, for capture group start/ends 
 298                                          //   atomic parentheses, repeat counts, etc. 
 299                                          //   Locations assigned at pattern compile time. 
 300                                          //   Variable-length array. 
 302 // number of UVector elements in the header 
 303 #define RESTACKFRAME_HDRCOUNT 2 
 306 //  Start-Of-Match type.  Used by find() to quickly scan to positions where a 
 307 //                        match might start before firing up the full match engine. 
 310     START_NO_INFO
,             // No hint available. 
 311     START_CHAR
,                // Match starts with a literal code point. 
 312     START_SET
,                 // Match starts with something matching a set. 
 313     START_START
,               // Match starts at start of buffer only (^ or \A) 
 314     START_LINE
,                // Match starts with ^ in multi-line mode. 
 315     START_STRING               
// Match starts with a literal string. 
 318 #define START_OF_MATCH_STR(v) ((v)==START_NO_INFO? "START_NO_INFO" : \ 
 319                                (v)==START_CHAR?    "START_CHAR"    : \ 
 320                                (v)==START_SET?     "START_SET"     : \ 
 321                                (v)==START_START?   "START_START"   : \ 
 322                                (v)==START_LINE?    "START_LINE"    : \ 
 323                                (v)==START_STRING?  "START_STRING"  : \ 
 327 //  8 bit set, to fast-path latin-1 set membership tests. 
 329 struct Regex8BitSet 
: public UMemory 
{ 
 330     inline Regex8BitSet(); 
 331     inline void operator = (const Regex8BitSet 
&s
); 
 332     inline void init(const UnicodeSet 
*src
); 
 333     inline UBool 
contains(UChar32 c
); 
 334     inline void  add(UChar32 c
); 
 338 inline Regex8BitSet::Regex8BitSet() { 
 339     uprv_memset(d
, 0, sizeof(d
)); 
 342 inline UBool 
Regex8BitSet::contains(UChar32 c
) { 
 343     // No bounds checking!  This is deliberate. 
 344     return ((d
[c
>>3] & 1 <<(c
&7)) != 0); 
 347 inline void  Regex8BitSet::add(UChar32 c
) { 
 348     d
[c
>>3] |= 1 << (c
&7); 
 351 inline void Regex8BitSet::init(const UnicodeSet 
*s
) { 
 353         for (int32_t i
=0; i
<=255; i
++) { 
 354             if (s
->contains(i
)) { 
 361 inline void Regex8BitSet::operator = (const Regex8BitSet 
&s
) { 
 362    uprv_memcpy(d
, s
.d
, sizeof(d
)); 
 366 //  Case folded UText Iterator helper class. 
 367 //  Wraps a UText, provides a case-folded enumeration over its contents. 
 368 //  Used in implementing case insensitive matching constructs. 
 369 //  Implementation in rematch.cpp 
 371 class CaseFoldingUTextIterator
: public UMemory 
{ 
 373         CaseFoldingUTextIterator(UText 
&text
); 
 374         ~CaseFoldingUTextIterator(); 
 376         UChar32 
next();           // Next case folded character  
 378         UBool   
inExpansion();    // True if last char returned from next() and the  
 379                                   //  next to be returned both originated from a string 
 380                                   //  folding of the same code point from the original UText. 
 383         const  UCaseProps 
*fcsp
; 
 384         const  UChar      
*fFoldChars
; 
 391 // Case folded UChar * string iterator. 
 392 //  Wraps a UChar  *, provides a case-folded enumeration over its contents. 
 393 //  Used in implementing case insensitive matching constructs. 
 394 //  Implementation in rematch.cpp 
 396 class CaseFoldingUCharIterator
: public UMemory 
{ 
 398         CaseFoldingUCharIterator(const UChar 
*chars
, int64_t start
, int64_t limit
); 
 399         ~CaseFoldingUCharIterator(); 
 401         UChar32 
next();           // Next case folded character  
 403         UBool   
inExpansion();    // True if last char returned from next() and the  
 404                                   //  next to be returned both originated from a string 
 405                                   //  folding of the same code point from the original UChar string. 
 407         int64_t  getIndex();      // Return the current input buffer index. 
 413         const  UCaseProps 
*fcsp
; 
 414         const  UChar      
*fFoldChars
;