]>
git.saurik.com Git - wxWidgets.git/blob - src/regex/regc_lex.c
   3  * This file is #included by regcomp.c. 
   5  * Copyright (c) 1998, 1999 Henry Spencer.      All rights reserved. 
   7  * Development of this software was funded, in part, by Cray Research Inc., 
   8  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics 
   9  * Corporation, none of whom are responsible for the results.  The author 
  12  * Redistribution and use in source and binary forms -- with or without 
  13  * modification -- are permitted for any purpose, provided that 
  14  * redistributions in source form retain this entire copyright notice and 
  15  * indicate the origin and nature of any modifications. 
  17  * I'd appreciate being given credit for this package in the documentation 
  18  * of software which uses it, but that is not a requirement. 
  20  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, 
  21  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 
  22  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL 
  23  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
  24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
  25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
  26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
  27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
  28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
  29  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  35 /* scanning macros (know about v) */ 
  36 #define ATEOS()         (v->now >= v->stop) 
  37 #define HAVE(n)         (v->stop - v->now >= (n)) 
  38 #define NEXT1(c)        (!ATEOS() && *v->now == CHR(c)) 
  39 #define NEXT2(a,b)      (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b)) 
  40 #define NEXT3(a,b,c)    (HAVE(3) && *v->now == CHR(a) && \ 
  41                                                 *(v->now+1) == CHR(b) && \ 
  42                                                 *(v->now+2) == CHR(c)) 
  43 #define SET(c)          (v->nexttype = (c)) 
  44 #define SETV(c, n)      (v->nexttype = (c), v->nextvalue = (n)) 
  45 #define RET(c)          return (SET(c), 1) 
  46 #define RETV(c, n)      return (SETV(c, n), 1) 
  47 #define FAILW(e)        return (ERR(e), 0)      /* ERR does SET(EOS) */ 
  48 #define LASTTYPE(t) (v->lasttype == (t)) 
  50 /* lexical contexts */ 
  51 #define L_ERE   1                               /* mainline ERE/ARE */ 
  52 #define L_BRE   2                               /* mainline BRE */ 
  53 #define L_Q 3                                   /* REG_QUOTE */ 
  54 #define L_EBND  4                               /* ERE/ARE bound */ 
  55 #define L_BBND  5                               /* BRE bound */ 
  56 #define L_BRACK 6                               /* brackets */ 
  57 #define L_CEL   7                               /* collating element */ 
  58 #define L_ECL   8                               /* equivalence class */ 
  59 #define L_CCL   9                               /* character class */ 
  60 #define INTOCON(c)      (v->lexcon = (c)) 
  61 #define INCON(con)      (v->lexcon == (con)) 
  63 /* construct pointer past end of chr array */ 
  64 #define ENDOF(array)    ((array) + sizeof(array)/sizeof(chr)) 
  67  * lexstart - set up lexical stuff, scan leading options 
  70 lexstart(struct vars 
* v
) 
  72         prefixes(v
);                            /* may turn on new type bits etc. */ 
  75         if (v
->cflags 
& REG_QUOTE
) 
  77                 assert(!(v
->cflags 
& (REG_ADVANCED 
| REG_EXPANDED 
| REG_NEWLINE
))); 
  80         else if (v
->cflags 
& REG_EXTENDED
) 
  82                 assert(!(v
->cflags 
& REG_QUOTE
)); 
  87                 assert(!(v
->cflags 
& (REG_QUOTE 
| REG_ADVF
))); 
  91         v
->nexttype 
= EMPTY
;            /* remember we were at the start */ 
  92         next(v
);                                        /* set up the first token */ 
  96  * prefixes - implement various special prefixes 
  99 prefixes(struct vars 
* v
) 
 101         /* literal string doesn't get any of this stuff */ 
 102         if (v
->cflags 
& REG_QUOTE
) 
 105         /* initial "***" gets special things */ 
 106         if (HAVE(4) && NEXT3('*', '*', '*')) 
 107                 switch (*(v
->now 
+ 3)) 
 109                         case CHR('?'):          /* "***?" error, msg shows version */ 
 111                                 return;                 /* proceed no further */ 
 113                         case CHR('='):          /* "***=" shifts to literal string */ 
 115                                 v
->cflags 
|= REG_QUOTE
; 
 116                                 v
->cflags 
&= ~(REG_ADVANCED 
| REG_EXPANDED 
| REG_NEWLINE
); 
 118                                 return;                 /* and there can be no more prefixes */ 
 120                         case CHR(':'):          /* "***:" shifts to AREs */ 
 122                                 v
->cflags 
|= REG_ADVANCED
; 
 125                         default:                        /* otherwise *** is just an error */ 
 131         /* BREs and EREs don't get embedded options */ 
 132         if ((v
->cflags 
& REG_ADVANCED
) != REG_ADVANCED
) 
 135         /* embedded options (AREs only) */ 
 136         if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v
->now 
+ 2))) 
 140                 for (; !ATEOS() && iscalpha(*v
->now
); v
->now
++) 
 143                                 case CHR('b'):  /* BREs (but why???) */ 
 144                                         v
->cflags 
&= ~(REG_ADVANCED 
| REG_QUOTE
); 
 146                                 case CHR('c'):  /* case sensitive */ 
 147                                         v
->cflags 
&= ~REG_ICASE
; 
 149                                 case CHR('e'):  /* plain EREs */ 
 150                                         v
->cflags 
|= REG_EXTENDED
; 
 151                                         v
->cflags 
&= ~(REG_ADVF 
| REG_QUOTE
); 
 153                                 case CHR('i'):  /* case insensitive */ 
 154                                         v
->cflags 
|= REG_ICASE
; 
 156                                 case CHR('m'):  /* Perloid synonym for n */ 
 157                                 case CHR('n'):  /* \n affects ^ $ . [^ */ 
 158                                         v
->cflags 
|= REG_NEWLINE
; 
 160                                 case CHR('p'):  /* ~Perl, \n affects . [^ */ 
 161                                         v
->cflags 
|= REG_NLSTOP
; 
 162                                         v
->cflags 
&= ~REG_NLANCH
; 
 164                                 case CHR('q'):  /* literal string */ 
 165                                         v
->cflags 
|= REG_QUOTE
; 
 166                                         v
->cflags 
&= ~REG_ADVANCED
; 
 168                                 case CHR('s'):  /* single line, \n ordinary */ 
 169                                         v
->cflags 
&= ~REG_NEWLINE
; 
 171                                 case CHR('t'):  /* tight syntax */ 
 172                                         v
->cflags 
&= ~REG_EXPANDED
; 
 174                                 case CHR('w'):  /* weird, \n affects ^ $ only */ 
 175                                         v
->cflags 
&= ~REG_NLSTOP
; 
 176                                         v
->cflags 
|= REG_NLANCH
; 
 178                                 case CHR('x'):  /* expanded syntax */ 
 179                                         v
->cflags 
|= REG_EXPANDED
; 
 191                 if (v
->cflags 
& REG_QUOTE
) 
 192                         v
->cflags 
&= ~(REG_EXPANDED 
| REG_NEWLINE
); 
 197  * lexnest - "call a subroutine", interpolating string at the lexical level 
 199  * Note, this is not a very general facility.  There are a number of 
 200  * implicit assumptions about what sorts of strings can be subroutines. 
 203 lexnest(struct vars 
* v
, 
 204                 chr 
*beginp
,                    /* start of interpolation */ 
 205                 chr 
*endp
)                              /* one past end of interpolation */ 
 207         assert(v
->savenow 
== NULL
); /* only one level of nesting */ 
 209         v
->savestop 
= v
->stop
; 
 215  * string constants to interpolate as expansions of things like \d 
 217 static chr      backd
[] = {                     /* \d */ 
 218         CHR('['), CHR('['), CHR(':'), 
 219         CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), 
 220         CHR(':'), CHR(']'), CHR(']') 
 222 static chr      backD
[] = {                     /* \D */ 
 223         CHR('['), CHR('^'), CHR('['), CHR(':'), 
 224         CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), 
 225         CHR(':'), CHR(']'), CHR(']') 
 227 static chr      brbackd
[] = {           /* \d within brackets */ 
 229         CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), 
 232 static chr      backs
[] = {                     /* \s */ 
 233         CHR('['), CHR('['), CHR(':'), 
 234         CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), 
 235         CHR(':'), CHR(']'), CHR(']') 
 237 static chr      backS
[] = {                     /* \S */ 
 238         CHR('['), CHR('^'), CHR('['), CHR(':'), 
 239         CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), 
 240         CHR(':'), CHR(']'), CHR(']') 
 242 static chr      brbacks
[] = {           /* \s within brackets */ 
 244         CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), 
 247 static chr      backw
[] = {                     /* \w */ 
 248         CHR('['), CHR('['), CHR(':'), 
 249         CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), 
 250         CHR(':'), CHR(']'), CHR('_'), CHR(']') 
 252 static chr      backW
[] = {                     /* \W */ 
 253         CHR('['), CHR('^'), CHR('['), CHR(':'), 
 254         CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), 
 255         CHR(':'), CHR(']'), CHR('_'), CHR(']') 
 257 static chr      brbackw
[] = {           /* \w within brackets */ 
 259         CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), 
 260         CHR(':'), CHR(']'), CHR('_') 
 264  * lexword - interpolate a bracket expression for word characters 
 265  * Possibly ought to inquire whether there is a "word" character class. 
 268 lexword(struct vars 
* v
) 
 270         lexnest(v
, backw
, ENDOF(backw
)); 
 274  * next - get next token 
 276 static int                                              /* 1 normal, 0 failure */ 
 277 next(struct vars 
* v
) 
 281         /* errors yield an infinite sequence of failures */ 
 283                 return 0;                               /* the error has set nexttype to EOS */ 
 285         /* remember flavor of last token */ 
 286         v
->lasttype 
= v
->nexttype
; 
 289         if (v
->nexttype 
== EMPTY 
&& (v
->cflags 
& REG_BOSONLY
)) 
 291                 /* at start of a REG_BOSONLY RE */ 
 292                 RETV(SBEGIN
, 0);                /* same as \A */ 
 295         /* if we're nested and we've hit end, return to outer level */ 
 296         if (v
->savenow 
!= NULL 
&& ATEOS()) 
 299                 v
->stop 
= v
->savestop
; 
 300                 v
->savenow 
= v
->savestop 
= NULL
; 
 303         /* skip white space etc. if appropriate (not in literal or []) */ 
 304         if (v
->cflags 
& REG_EXPANDED
) 
 315         /* handle EOS, depending on context */ 
 339         /* okay, time to actually get a character */ 
 342         /* deal with the easy contexts, punt EREs to code below */ 
 345                 case L_BRE
:                             /* punt BREs to separate function */ 
 346                         return brenext(v
, c
); 
 348                 case L_ERE
:                             /* see below */ 
 350                 case L_Q
:                               /* literal strings are easy */ 
 353                 case L_BBND
:                    /* bounds are fairly simple */ 
 367                                         RETV(DIGIT
, (chr
) DIGITVAL(c
)); 
 372                                 case CHR('}'):  /* ERE bound ends with } */ 
 376                                                 if ((v
->cflags 
& REG_ADVF
) && NEXT1('?')) 
 387                                 case CHR('\\'): /* BRE bound ends with \} */ 
 388                                         if (INCON(L_BBND
) && NEXT1('}')) 
 403                 case L_BRACK
:                   /* brackets are not too hard */ 
 411                                                 INTOCON((v
->cflags 
& REG_EXTENDED
) ? 
 418                                         if (!(v
->cflags 
& REG_ADVF
)) 
 423                                         (DISCARD
) lexescape(v
); 
 425                                         {                       /* not all escapes okay here */ 
 430                                                         switch (v
->nextvalue
) 
 433                                                                         lexnest(v
, brbackd
, ENDOF(brbackd
)); 
 436                                                                         lexnest(v
, brbacks
, ENDOF(brbacks
)); 
 439                                                                         lexnest(v
, brbackw
, ENDOF(brbackw
)); 
 445                                                         /* lexnest done, back up and try again */ 
 446                                                         v
->nexttype 
= v
->lasttype
; 
 450                                         /* not one of the acceptable escapes */ 
 454                                         if (LASTTYPE('[') || NEXT1(']')) 
 466                                                         /* might or might not be locale-specific */ 
 492                 case L_CEL
:                             /* collating elements are easy */ 
 493                         if (c 
== CHR('.') && NEXT1(']')) 
 502                 case L_ECL
:                             /* ditto equivalence classes */ 
 503                         if (c 
== CHR('=') && NEXT1(']')) 
 512                 case L_CCL
:                             /* ditto character classes */ 
 513                         if (c 
== CHR(':') && NEXT1(']')) 
 527         /* that got rid of everything except EREs and AREs */ 
 528         assert(INCON(L_ERE
)); 
 530         /* deal with EREs and AREs, except for backslashes */ 
 537                         if ((v
->cflags 
& REG_ADVF
) && NEXT1('?')) 
 546                         if ((v
->cflags 
& REG_ADVF
) && NEXT1('?')) 
 555                         if ((v
->cflags 
& REG_ADVF
) && NEXT1('?')) 
 563                 case CHR('{'):                  /* bounds start or plain character */ 
 564                         if (v
->cflags 
& REG_EXPANDED
) 
 566                         if (ATEOS() || !iscdigit(*v
->now
)) 
 580                 case CHR('('):                  /* parenthesis, or advanced extension */ 
 581                         if ((v
->cflags 
& REG_ADVF
) && NEXT1('?')) 
 587                                         case CHR(':'):          /* non-capturing paren */ 
 590                                         case CHR('#'):          /* comment */ 
 591                                                 while (!ATEOS() && *v
->now 
!= CHR(')')) 
 595                                                 assert(v
->nexttype 
== v
->lasttype
); 
 598                                         case CHR('='):          /* positive lookahead */ 
 599                                                 NOTE(REG_ULOOKAHEAD
); 
 602                                         case CHR('!'):          /* negative lookahead */ 
 603                                                 NOTE(REG_ULOOKAHEAD
); 
 612                         if (v
->cflags 
& REG_NOSUB
) 
 613                                 RETV('(', 0);   /* all parens non-capturing */ 
 622                 case CHR('['):                  /* easy except for [[:<:]] and [[:>:]] */ 
 623                         if (HAVE(6) && *(v
->now 
+ 0) == CHR('[') && 
 624                                 *(v
->now 
+ 1) == CHR(':') && 
 625                                 (*(v
->now 
+ 2) == CHR('<') || 
 626                                  *(v
->now 
+ 2) == CHR('>')) && 
 627                                 *(v
->now 
+ 3) == CHR(':') && 
 628                                 *(v
->now 
+ 4) == CHR(']') && 
 629                                 *(v
->now 
+ 5) == CHR(']')) 
 634                                 RET((c 
== CHR('<')) ? '<' : '>'); 
 653                 case CHR('\\'): /* mostly punt backslashes to code below */ 
 657                 default:                                /* ordinary character */ 
 662         /* ERE/ARE backslash handling; backslash already eaten */ 
 664         if (!(v
->cflags 
& REG_ADVF
)) 
 665         {                                                       /* only AREs have non-trivial escapes */ 
 666                 if (iscalnum(*v
->now
)) 
 671                 RETV(PLAIN
, *v
->now
++); 
 673         (DISCARD
) lexescape(v
); 
 676         if (v
->nexttype 
== CCLASS
) 
 677         {                                                       /* fudge at lexical level */ 
 678                 switch (v
->nextvalue
) 
 681                                 lexnest(v
, backd
, ENDOF(backd
)); 
 684                                 lexnest(v
, backD
, ENDOF(backD
)); 
 687                                 lexnest(v
, backs
, ENDOF(backs
)); 
 690                                 lexnest(v
, backS
, ENDOF(backS
)); 
 693                                 lexnest(v
, backw
, ENDOF(backw
)); 
 696                                 lexnest(v
, backW
, ENDOF(backW
)); 
 703                 /* lexnest done, back up and try again */ 
 704                 v
->nexttype 
= v
->lasttype
; 
 707         /* otherwise, lexescape has already done the work */ 
 712  * lexescape - parse an ARE backslash escape (backslash already eaten) 
 713  * Note slightly nonstandard use of the CCLASS type code. 
 715 static int                                              /* not actually used, but convenient for 
 717 lexescape(struct vars 
* v
) 
 720         static chr      alert
[] = { 
 721                 CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') 
 724                 CHR('E'), CHR('S'), CHR('C') 
 728         assert(v
->cflags 
& REG_ADVF
); 
 739                         RETV(PLAIN
, chrnamed(v
, alert
, ENDOF(alert
), CHR('\007'))); 
 745                         RETV(PLAIN
, CHR('\b')); 
 748                         RETV(PLAIN
, CHR('\\')); 
 754                         RETV(PLAIN
, (chr
) (*v
->now
++ & 037)); 
 766                         RETV(PLAIN
, chrnamed(v
, esc
, ENDOF(esc
), CHR('\033'))); 
 769                         RETV(PLAIN
, CHR('\f')); 
 778                         RETV(PLAIN
, CHR('\n')); 
 781                         RETV(PLAIN
, CHR('\r')); 
 792                         RETV(PLAIN
, CHR('\t')); 
 795                         c 
= lexdigits(v
, 16, 4, 4); 
 801                         c 
= lexdigits(v
, 16, 8, 8); 
 807                         RETV(PLAIN
, CHR('\v')); 
 819                         c 
= lexdigits(v
, 16, 1, 255);           /* REs >255 long outside 
 846                         v
->now
--;                       /* put first digit back */ 
 847                         c 
= lexdigits(v
, 10, 1, 255);           /* REs >255 long outside 
 851                         /* ugly heuristic (first test is "exactly 1 digit?") */ 
 852                         if (v
->now 
- save 
== 0 || (int) c 
<= v
->nsubexp
) 
 855                                 RETV(BACKREF
, (chr
) c
); 
 857                         /* oops, doesn't look like it's a backref after all... */ 
 859                         /* and fall through into octal number */ 
 862                         v
->now
--;                       /* put first digit back */ 
 863                         c 
= lexdigits(v
, 8, 1, 3); 
 870                         FAILW(REG_EESCAPE
); /* unknown alphabetic escape */ 
 877  * lexdigits - slurp up digits and return chr value 
 879 static chr                                              
/* chr value; errors signalled via ERR */ 
 880 lexdigits(struct vars 
* v
, 
 885         uchr            n
;                              /* unsigned to avoid overflow misbehavior */ 
 889         const uchr      ub 
= (uchr
) base
; 
 892         for (len 
= 0; len 
< maxlen 
&& !ATEOS(); len
++) 
 934                                 v
->now
--;               /* oops, not a digit at all */ 
 940                 {                                               /* not a plausible digit */ 
 945                         break;                          /* NOTE BREAK OUT */ 
 946                 n 
= n 
* ub 
+ (uchr
) d
; 
 955  * brenext - get next BRE token 
 957  * This is much like EREs except for all the stupid backslashes and the 
 958  * context-dependency of some things. 
 960 static int                                              /* 1 normal, 0 failure */ 
 961 brenext(struct vars 
* v
, 
 969                         if (LASTTYPE(EMPTY
) || LASTTYPE('(') || LASTTYPE('^')) 
 974                         if (HAVE(6) && *(v
->now 
+ 0) == CHR('[') && 
 975                                 *(v
->now 
+ 1) == CHR(':') && 
 976                                 (*(v
->now 
+ 2) == CHR('<') || 
 977                                  *(v
->now 
+ 2) == CHR('>')) && 
 978                                 *(v
->now 
+ 3) == CHR(':') && 
 979                                 *(v
->now 
+ 4) == CHR(']') && 
 980                                 *(v
->now 
+ 5) == CHR(']')) 
 985                                 RET((c 
== CHR('<')) ? '<' : '>'); 
1009                         if (v
->cflags 
& REG_EXPANDED
) 
1013                         if (NEXT2('\\', ')')) 
1021                         break;                          /* see below */ 
1027         assert(c 
== CHR('\\')); 
1047                         NOTE(REG_UNONPOSIX
); 
1051                         NOTE(REG_UNONPOSIX
); 
1064                         RETV(BACKREF
, (chr
) DIGITVAL(c
)); 
1080  * skip - skip white space and comments in expanded form 
1083 skip(struct vars 
* v
) 
1085         chr                
*start 
= v
->now
; 
1087         assert(v
->cflags 
& REG_EXPANDED
); 
1091                 while (!ATEOS() && iscspace(*v
->now
)) 
1093                 if (ATEOS() || *v
->now 
!= CHR('#')) 
1094                         break;                          /* NOTE BREAK OUT */ 
1096                 while (!ATEOS() && *v
->now 
!= CHR('\n')) 
1098                 /* leave the newline to be picked up by the iscspace loop */ 
1101         if (v
->now 
!= start
) 
1102                 NOTE(REG_UNONPOSIX
); 
1106  * newline - return the chr for a newline 
1108  * This helps confine use of CHR to this source file. 
1117  * chrnamed - return the chr known by a given (chr string) name 
1119  * The code is a bit clumsy, but this routine gets only such specialized 
1120  * use that it hardly matters. 
1123 chrnamed(struct vars 
* v
, 
1124                  chr 
*startp
,                   /* start of name */ 
1125                  chr 
*endp
,                             /* just past end of name */ 
1126                  chr lastresort
)                /* what to return if name lookup fails */ 
1135         c 
= element(v
, startp
, endp
); 
1140                 return (chr
) lastresort
; 
1142         cv 
= range(v
, c
, c
, 0); 
1144                 return (chr
) lastresort
;