]>
git.saurik.com Git - wxWidgets.git/blob - src/regex/regcomp.c
  16  * parse structure, passed up and down to avoid global variables and 
  20         char *next
;             /* next character in RE */ 
  21         char *end
;              /* end of string (-> NUL normally) */ 
  22         int error
;              /* has an error been seen? */ 
  23         sop 
*strip
;             /* malloced strip */ 
  24         sopno ssize
;            /* malloced strip size (allocated) */ 
  25         sopno slen
;             /* malloced strip length (used) */ 
  26         int ncsalloc
;           /* number of csets allocated */ 
  28 #       define  NPAREN  10      /* we need to remember () 1-9 for back refs */ 
  29         sopno pbegin
[NPAREN
];   /* -> ( ([0] unused) */ 
  30         sopno pend
[NPAREN
];     /* -> ) ([0] unused) */ 
  35 static char nuls
[10];           /* place to point scanner in event of error */ 
  38  * macros for use with parse structure 
  39  * BEWARE:  these know that the parse structure is named `p' !!! 
  41 #define PEEK()  (*p->next) 
  42 #define PEEK2() (*(p->next+1)) 
  43 #define MORE()  (p->next < p->end) 
  44 #define MORE2() (p->next+1 < p->end) 
  45 #define SEE(c)  (MORE() && PEEK() == (c)) 
  46 #define SEETWO(a, b)    (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 
  47 #define EAT(c)  ((SEE(c)) ? (NEXT(), 1) : 0) 
  48 #define EATTWO(a, b)    ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 
  49 #define NEXT()  (p->next++) 
  50 #define NEXT2() (p->next += 2) 
  51 #define NEXTn(n)        (p->next += (n)) 
  52 #define GETNEXT()       (*p->next++) 
  53 #define SETERROR(e)     seterr(p, (e)) 
  54 #define REQUIRE(co, e)  ((void)((co) || SETERROR(e))) 
  55 #define MUSTSEE(c, e)   (REQUIRE(MORE() && PEEK() == (c), e)) 
  56 #define MUSTEAT(c, e)   (REQUIRE(MORE() && GETNEXT() == (c), e)) 
  57 #define MUSTNOTSEE(c, e)        (REQUIRE(!MORE() || PEEK() != (c), e)) 
  58 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 
  59 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 
  60 #define AHEAD(pos)              dofwd(p, pos, HERE()-(pos)) 
  61 #define ASTERN(sop, pos)        EMIT(sop, HERE()-pos) 
  62 #define HERE()          (p->slen) 
  63 #define THERE()         (p->slen - 1) 
  64 #define THERETHERE()    (p->slen - 2) 
  65 #define DROP(n) (p->slen -= (n)) 
  68 static int never 
= 0;           /* for use in asserts; shuts lint up */ 
  70 #define never   0               /* some <assert.h>s have bugs too */ 
  74  - regcomp - interface for parser and compilation 
  75  = extern int regcomp(regex_t *, const char *, int); 
  76  = #define      REG_BASIC       0000 
  77  = #define      REG_EXTENDED    0001 
  78  = #define      REG_ICASE       0002 
  79  = #define      REG_NOSUB       0004 
  80  = #define      REG_NEWLINE     0010 
  81  = #define      REG_NOSPEC      0020 
  82  = #define      REG_PEND        0040 
  83  = #define      REG_DUMP        0200 
  85 int                             /* 0 success, otherwise REG_something */ 
  86 regcomp(preg
, pattern
, cflags
) 
  92         register struct re_guts 
*g
; 
  93         register struct parse 
*p 
= &pa
; 
  97 #       define  GOODFLAGS(f)    (f) 
  99 #       define  GOODFLAGS(f)    ((f)&~REG_DUMP) 
 102         cflags 
= GOODFLAGS(cflags
); 
 103         if ((cflags
®_EXTENDED
) && (cflags
®_NOSPEC
)) 
 106         if (cflags
®_PEND
) { 
 107                 if (preg
->re_endp 
< pattern
) 
 109                 len 
= preg
->re_endp 
- pattern
; 
 111                 len 
= strlen((char *)pattern
); 
 113         /* do the mallocs early so failure handling is easy */ 
 114         g 
= (struct re_guts 
*)malloc(sizeof(struct re_guts
) + 
 115                                                         (NC
-1)*sizeof(cat_t
)); 
 118         p
->ssize 
= len
/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 
 119         p
->strip 
= (sop 
*)malloc(p
->ssize 
* sizeof(sop
)); 
 121         if (p
->strip 
== NULL
) { 
 128         p
->next 
= (char *)pattern
;      /* convenience; we do not modify it */ 
 129         p
->end 
= p
->next 
+ len
; 
 132         for (i 
= 0; i 
< NPAREN
; i
++) { 
 147         g
->ncategories 
= 1;     /* category 0 is "everything else" */ 
 148         g
->categories 
= &g
->catspace
[-(CHAR_MIN
)]; 
 149         (void) memset((char *)g
->catspace
, 0, NC
*sizeof(cat_t
)); 
 154         g
->firststate 
= THERE(); 
 155         if (cflags
®_EXTENDED
) 
 157         else if (cflags
®_NOSPEC
) 
 162         g
->laststate 
= THERE(); 
 164         /* tidy up loose ends and fill things in */ 
 168         g
->nplus 
= pluscount(p
, g
); 
 170         preg
->re_nsub 
= g
->nsub
; 
 172         preg
->re_magic 
= MAGIC1
; 
 174         /* not debugging, so can't rely on the assert() in regexec() */ 
 176                 SETERROR(REG_ASSERT
); 
 179         /* win or lose, we're done */ 
 180         if (p
->error 
!= 0)      /* lose */ 
 186  - p_ere - ERE parser top level, concatenation and alternation 
 187  == static void p_ere(register struct parse *p, int stop); 
 191 register struct parse 
*p
; 
 192 int stop
;                       /* character this ERE should end at */ 
 195         register sopno prevback
; 
 196         register sopno prevfwd
; 
 198         register int first 
= 1;         /* is this the first alternative? */ 
 201                 /* do a bunch of concatenated expressions */ 
 203                 while (MORE() && (c 
= PEEK()) != '|' && c 
!= stop
) 
 205                 REQUIRE(HERE() != conc
, REG_EMPTY
);     /* require nonempty */ 
 208                         break;          /* NOTE BREAK OUT */ 
 211                         INSERT(OCH_
, conc
);     /* offset is wrong */ 
 216                 ASTERN(OOR1
, prevback
); 
 218                 AHEAD(prevfwd
);                 /* fix previous offset */ 
 220                 EMIT(OOR2
, 0);                  /* offset is very wrong */ 
 223         if (!first
) {           /* tail-end fixups */ 
 225                 ASTERN(O_CH
, prevback
); 
 228         assert(!MORE() || SEE(stop
)); 
 232  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 
 233  == static void p_ere_exp(register struct parse *p); 
 237 register struct parse 
*p
; 
 243         register sopno subno
; 
 246         assert(MORE());         /* caller should have ensured this */ 
 252                 REQUIRE(MORE(), REG_EPAREN
); 
 256                         p
->pbegin
[subno
] = HERE(); 
 257                 EMIT(OLPAREN
, subno
); 
 260                 if (subno 
< NPAREN
) { 
 261                         p
->pend
[subno
] = HERE(); 
 262                         assert(p
->pend
[subno
] != 0); 
 264                 EMIT(ORPAREN
, subno
); 
 265                 MUSTEAT(')', REG_EPAREN
); 
 267 #ifndef POSIX_MISTAKE 
 268         case ')':               /* happens only if no current unmatched ( */ 
 270                  * You may ask, why the ifndef?  Because I didn't notice 
 271                  * this until slightly too late for 1003.2, and none of the 
 272                  * other 1003.2 regular-expression reviewers noticed it at 
 273                  * all.  So an unmatched ) is legal POSIX, at least until 
 274                  * we can get it fixed. 
 276                 SETERROR(REG_EPAREN
); 
 281                 p
->g
->iflags 
|= USEBOL
; 
 287                 p
->g
->iflags 
|= USEEOL
; 
 296                 SETERROR(REG_BADRPT
); 
 299                 if (p
->g
->cflags
®_NEWLINE
) 
 308                 REQUIRE(MORE(), REG_EESCAPE
); 
 312         case '{':               /* okay as ordinary except if digit follows */ 
 313                 REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT
); 
 323         /* we call { a repetition if followed by a digit */ 
 324         if (!( c 
== '*' || c 
== '+' || c 
== '?' || 
 325                                 (c 
== '{' && MORE2() && isdigit(PEEK2())) )) 
 326                 return;         /* no repetition, we're done */ 
 329         REQUIRE(!wascaret
, REG_BADRPT
); 
 331         case '*':       /* implemented as +? */ 
 332                 /* this case does not require the (y|) trick, noKLUDGE */ 
 335                 INSERT(OQUEST_
, pos
); 
 336                 ASTERN(O_QUEST
, pos
); 
 343                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 
 344                 INSERT(OCH_
, pos
);              /* offset slightly wrong */ 
 345                 ASTERN(OOR1
, pos
);              /* this one's right */ 
 346                 AHEAD(pos
);                     /* fix the OCH_ */ 
 347                 EMIT(OOR2
, 0);                  /* offset very wrong... */ 
 348                 AHEAD(THERE());                 /* ...so fix it */ 
 349                 ASTERN(O_CH
, THERETHERE()); 
 354                         if (isdigit(PEEK())) { 
 356                                 REQUIRE(count 
<= count2
, REG_BADBR
); 
 357                         } else          /* single number with comma */ 
 359                 } else          /* just a single number */ 
 361                 repeat(p
, pos
, count
, count2
); 
 362                 if (!EAT('}')) {        /* error heuristics */ 
 363                         while (MORE() && PEEK() != '}') 
 365                         REQUIRE(MORE(), REG_EBRACE
); 
 374         if (!( c 
== '*' || c 
== '+' || c 
== '?' || 
 375                                 (c 
== '{' && MORE2() && isdigit(PEEK2())) ) ) 
 377         SETERROR(REG_BADRPT
); 
 381  - p_str - string (no metacharacters) "parser" 
 382  == static void p_str(register struct parse *p); 
 386 register struct parse 
*p
; 
 388         REQUIRE(MORE(), REG_EMPTY
); 
 390                 ordinary(p
, GETNEXT()); 
 394  - p_bre - BRE parser top level, anchoring and concatenation 
 395  == static void p_bre(register struct parse *p, register int end1, \ 
 396  ==     register int end2); 
 397  * Giving end1 as OUT essentially eliminates the end1/end2 check. 
 399  * This implementation is a bit of a kludge, in that a trailing $ is first 
 400  * taken as an ordinary character and then revised to be an anchor.  The 
 401  * only undesirable side effect is that '$' gets included as a character 
 402  * category in such cases.  This is fairly harmless; not worth fixing. 
 403  * The amount of lookahead needed to avoid this kludge is excessive. 
 407 register struct parse 
*p
; 
 408 register int end1
;              /* first terminating character */ 
 409 register int end2
;              /* second terminating character */ 
 411         register sopno start 
= HERE(); 
 412         register int first 
= 1;                 /* first subexpression? */ 
 413         register int wasdollar 
= 0; 
 417                 p
->g
->iflags 
|= USEBOL
; 
 420         while (MORE() && !SEETWO(end1
, end2
)) { 
 421                 wasdollar 
= p_simp_re(p
, first
); 
 424         if (wasdollar
) {        /* oops, that was a trailing anchor */ 
 427                 p
->g
->iflags 
|= USEEOL
; 
 431         REQUIRE(HERE() != start
, REG_EMPTY
);    /* require nonempty */ 
 435  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 
 436  == static int p_simp_re(register struct parse *p, int starordinary); 
 438 static int                      /* was the simple RE an unbackslashed $? */ 
 439 p_simp_re(p
, starordinary
) 
 440 register struct parse 
*p
; 
 441 int starordinary
;               /* is a leading * an ordinary character? */ 
 448         register sopno subno
; 
 449 #       define  BACKSL  (1<<CHAR_BIT) 
 451         pos 
= HERE();           /* repetion op, if any, covers from here */ 
 453         assert(MORE());         /* caller should have ensured this */ 
 456                 REQUIRE(MORE(), REG_EESCAPE
); 
 457                 c 
= BACKSL 
| (unsigned char)GETNEXT(); 
 461                 if (p
->g
->cflags
®_NEWLINE
) 
 470                 SETERROR(REG_BADRPT
); 
 476                         p
->pbegin
[subno
] = HERE(); 
 477                 EMIT(OLPAREN
, subno
); 
 478                 /* the MORE here is an error heuristic */ 
 479                 if (MORE() && !SEETWO('\\', ')')) 
 481                 if (subno 
< NPAREN
) { 
 482                         p
->pend
[subno
] = HERE(); 
 483                         assert(p
->pend
[subno
] != 0); 
 485                 EMIT(ORPAREN
, subno
); 
 486                 REQUIRE(EATTWO('\\', ')'), REG_EPAREN
); 
 488         case BACKSL
|')':        /* should not get here -- must be user */ 
 490                 SETERROR(REG_EPAREN
); 
 501                 i 
= (c
&~BACKSL
) - '0'; 
 503                 if (p
->pend
[i
] != 0) { 
 504                         assert(i 
<= p
->g
->nsub
); 
 506                         assert(p
->pbegin
[i
] != 0); 
 507                         assert(OP(p
->strip
[p
->pbegin
[i
]]) == OLPAREN
); 
 508                         assert(OP(p
->strip
[p
->pend
[i
]]) == ORPAREN
); 
 509                         (void) dupl(p
, p
->pbegin
[i
]+1, p
->pend
[i
]); 
 512                         SETERROR(REG_ESUBREG
); 
 516                 REQUIRE(starordinary
, REG_BADRPT
); 
 519                 ordinary(p
, (char)c
);   /* takes off BACKSL, if any */ 
 523         if (EAT('*')) {         /* implemented as +? */ 
 524                 /* this case does not require the (y|) trick, noKLUDGE */ 
 527                 INSERT(OQUEST_
, pos
); 
 528                 ASTERN(O_QUEST
, pos
); 
 529         } else if (EATTWO('\\', '{')) { 
 532                         if (MORE() && isdigit(PEEK())) { 
 534                                 REQUIRE(count 
<= count2
, REG_BADBR
); 
 535                         } else          /* single number with comma */ 
 537                 } else          /* just a single number */ 
 539                 repeat(p
, pos
, count
, count2
); 
 540                 if (!EATTWO('\\', '}')) {       /* error heuristics */ 
 541                         while (MORE() && !SEETWO('\\', '}')) 
 543                         REQUIRE(MORE(), REG_EBRACE
); 
 546         } else if (c 
== (unsigned char)'$')     /* $ (but not \$) ends it */ 
 553  - p_count - parse a repetition count 
 554  == static int p_count(register struct parse *p); 
 556 static int                      /* the value */ 
 558 register struct parse 
*p
; 
 560         register int count 
= 0; 
 561         register int ndigits 
= 0; 
 563         while (MORE() && isdigit(PEEK()) && count 
<= DUPMAX
) { 
 564                 count 
= count
*10 + (GETNEXT() - '0'); 
 568         REQUIRE(ndigits 
> 0 && count 
<= DUPMAX
, REG_BADBR
); 
 573  - p_bracket - parse a bracketed character list 
 574  == static void p_bracket(register struct parse *p); 
 576  * Note a significant property of this code:  if the allocset() did SETERROR, 
 577  * no set operations are done. 
 581 register struct parse 
*p
; 
 583         register cset 
*cs 
= allocset(p
); 
 584         register int invert 
= 0; 
 586         /* Dept of Truly Sickening Special-Case Kludges */ 
 587         if (p
->next 
+ 5 < p
->end 
&& strncmp(p
->next
, "[:<:]]", 6) == 0) { 
 592         if (p
->next 
+ 5 < p
->end 
&& strncmp(p
->next
, "[:>:]]", 6) == 0) { 
 599                 invert
++;       /* make note to invert set at end */ 
 604         while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 
 608         MUSTEAT(']', REG_EBRACK
); 
 610         if (p
->error 
!= 0)      /* don't mess things up further */ 
 613         if (p
->g
->cflags
®_ICASE
) { 
 617                 for (i 
= p
->g
->csetsize 
- 1; i 
>= 0; i
--) 
 618                         if (CHIN(cs
, i
) && isalpha(i
)) { 
 623                 if (cs
->multis 
!= NULL
) 
 629                 for (i 
= p
->g
->csetsize 
- 1; i 
>= 0; i
--) 
 634                 if (p
->g
->cflags
®_NEWLINE
) 
 636                 if (cs
->multis 
!= NULL
) 
 640         assert(cs
->multis 
== NULL
);             /* xxx */ 
 642         if (nch(p
, cs
) == 1) {          /* optimize singleton sets */ 
 643                 ordinary(p
, firstch(p
, cs
)); 
 646                 EMIT(OANYOF
, freezeset(p
, cs
)); 
 650  - p_b_term - parse one term of a bracketed character list 
 651  == static void p_b_term(register struct parse *p, register cset *cs); 
 655 register struct parse 
*p
; 
 659         register char start
, finish
; 
 662         /* classify what we've got */ 
 663         switch ((MORE()) ? PEEK() : '\0') { 
 665                 c 
= (MORE2()) ? PEEK2() : '\0'; 
 668                 SETERROR(REG_ERANGE
); 
 669                 return;                 /* NOTE RETURN */ 
 677         case ':':               /* character class */ 
 679                 REQUIRE(MORE(), REG_EBRACK
); 
 681                 REQUIRE(c 
!= '-' && c 
!= ']', REG_ECTYPE
); 
 683                 REQUIRE(MORE(), REG_EBRACK
); 
 684                 REQUIRE(EATTWO(':', ']'), REG_ECTYPE
); 
 686         case '=':               /* equivalence class */ 
 688                 REQUIRE(MORE(), REG_EBRACK
); 
 690                 REQUIRE(c 
!= '-' && c 
!= ']', REG_ECOLLATE
); 
 692                 REQUIRE(MORE(), REG_EBRACK
); 
 693                 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE
); 
 695         default:                /* symbol, ordinary character, or range */ 
 696 /* xxx revision needed for multichar stuff */ 
 697                 start 
= p_b_symbol(p
); 
 698                 if (SEE('-') && MORE2() && PEEK2() != ']') { 
 704                                 finish 
= p_b_symbol(p
); 
 707 /* xxx what about signed chars here... */ 
 708                 REQUIRE(start 
<= finish
, REG_ERANGE
); 
 709                 for (i 
= start
; i 
<= finish
; i
++) 
 716  - p_b_cclass - parse a character-class name and deal with it 
 717  == static void p_b_cclass(register struct parse *p, register cset *cs); 
 721 register struct parse 
*p
; 
 724         register char *sp 
= p
->next
; 
 725         register struct cclass 
*cp
; 
 730         while (MORE() && isalpha(PEEK())) 
 733         for (cp 
= cclasses
; cp
->name 
!= NULL
; cp
++) 
 734                 if (strncmp(cp
->name
, sp
, len
) == 0 && cp
->name
[len
] == '\0') 
 736         if (cp
->name 
== NULL
) { 
 737                 /* oops, didn't find it */ 
 738                 SETERROR(REG_ECTYPE
); 
 743         while ((c 
= *u
++) != '\0') 
 745         for (u 
= cp
->multis
; *u 
!= '\0'; u 
+= strlen(u
) + 1) 
 750  - p_b_eclass - parse an equivalence-class name and deal with it 
 751  == static void p_b_eclass(register struct parse *p, register cset *cs); 
 753  * This implementation is incomplete. xxx 
 757 register struct parse 
*p
; 
 762         c 
= p_b_coll_elem(p
, '='); 
 767  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 
 768  == static char p_b_symbol(register struct parse *p); 
 770 static char                     /* value of symbol */ 
 772 register struct parse 
*p
; 
 776         REQUIRE(MORE(), REG_EBRACK
); 
 777         if (!EATTWO('[', '.')) 
 780         /* collating symbol */ 
 781         value 
= p_b_coll_elem(p
, '.'); 
 782         REQUIRE(EATTWO('.', ']'), REG_ECOLLATE
); 
 787  - p_b_coll_elem - parse a collating-element name and look it up 
 788  == static char p_b_coll_elem(register struct parse *p, int endc); 
 790 static char                     /* value of collating element */ 
 791 p_b_coll_elem(p
, endc
) 
 792 register struct parse 
*p
; 
 793 int endc
;                       /* name ended by endc,']' */ 
 795         register char *sp 
= p
->next
; 
 796         register struct cname 
*cp
; 
 799         while (MORE() && !SEETWO(endc
, ']')) 
 802                 SETERROR(REG_EBRACK
); 
 806         for (cp 
= cnames
; cp
->name 
!= NULL
; cp
++) 
 807                 if (strncmp(cp
->name
, sp
, len
) == 0 && cp
->name
[len
] == '\0') 
 808                         return(cp
->code
);       /* known name */ 
 810                 return(*sp
);    /* single character */ 
 811         SETERROR(REG_ECOLLATE
);                 /* neither */ 
 816  - othercase - return the case counterpart of an alphabetic 
 817  == static char othercase(int ch); 
 819 static char                     /* if no counterpart, return ch */ 
 826         else if (islower(ch
)) 
 828         else                    /* peculiar, but could happen */ 
 833  - bothcases - emit a dualcase version of a two-case character 
 834  == static void bothcases(register struct parse *p, int ch); 
 836  * Boy, is this implementation ever a kludge... 
 840 register struct parse 
*p
; 
 843         register char *oldnext 
= p
->next
; 
 844         register char *oldend 
= p
->end
; 
 847         assert(othercase(ch
) != ch
);    /* p_bracket() would recurse */ 
 854         assert(p
->next 
== bracket
+2); 
 860  - ordinary - emit an ordinary character 
 861  == static void ordinary(register struct parse *p, register int ch); 
 865 register struct parse 
*p
; 
 868         register cat_t 
*cap 
= p
->g
->categories
; 
 870         if ((p
->g
->cflags
®_ICASE
) && isalpha(ch
) && othercase(ch
) != ch
) 
 873                 EMIT(OCHAR
, (unsigned char)ch
); 
 875                         cap
[ch
] = p
->g
->ncategories
++; 
 880  - nonnewline - emit REG_NEWLINE version of OANY 
 881  == static void nonnewline(register struct parse *p); 
 883  * Boy, is this implementation ever a kludge... 
 887 register struct parse 
*p
; 
 889         register char *oldnext 
= p
->next
; 
 890         register char *oldend 
= p
->end
; 
 900         assert(p
->next 
== bracket
+3); 
 906  - repeat - generate code for a bounded repetition, recursively if needed 
 907  == static void repeat(register struct parse *p, sopno start, int from, int to); 
 910 repeat(p
, start
, from
, to
) 
 911 register struct parse 
*p
; 
 912 sopno start
;                    /* operand from here to end of strip */ 
 913 int from
;                       /* repeated from this number */ 
 914 int to
;                         /* to this number of times (maybe INFINITY) */ 
 916         register sopno finish 
= HERE(); 
 919 #       define  REP(f, t)       ((f)*8 + (t)) 
 920 #       define  MAP(n)  (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 
 923         if (p
->error 
!= 0)      /* head off possible runaway recursion */ 
 928         switch (REP(MAP(from
), MAP(to
))) { 
 929         case REP(0, 0):                 /* must be user doing this */ 
 930                 DROP(finish
-start
);     /* drop the operand */ 
 932         case REP(0, 1):                 /* as x{1,1}? */ 
 933         case REP(0, N
):                 /* as x{1,n}? */ 
 934         case REP(0, INF
):               /* as x{1,}? */ 
 935                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 
 936                 INSERT(OCH_
, start
);            /* offset is wrong... */ 
 937                 repeat(p
, start
+1, 1, to
); 
 939                 AHEAD(start
);                   /* ... fix it */ 
 942                 ASTERN(O_CH
, THERETHERE()); 
 944         case REP(1, 1):                 /* trivial case */ 
 947         case REP(1, N
):                 /* as x?x{1,n-1} */ 
 948                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 
 952                 EMIT(OOR2
, 0);                  /* offset very wrong... */ 
 953                 AHEAD(THERE());                 /* ...so fix it */ 
 954                 ASTERN(O_CH
, THERETHERE()); 
 955                 copy 
= dupl(p
, start
+1, finish
+1); 
 956                 assert(copy 
== finish
+4); 
 957                 repeat(p
, copy
, 1, to
-1); 
 959         case REP(1, INF
):               /* as x+ */ 
 960                 INSERT(OPLUS_
, start
); 
 961                 ASTERN(O_PLUS
, start
); 
 963         case REP(N
, N
):                 /* as xx{m-1,n-1} */ 
 964                 copy 
= dupl(p
, start
, finish
); 
 965                 repeat(p
, copy
, from
-1, to
-1); 
 967         case REP(N
, INF
):               /* as xx{n-1,INF} */ 
 968                 copy 
= dupl(p
, start
, finish
); 
 969                 repeat(p
, copy
, from
-1, to
); 
 971         default:                        /* "can't happen" */ 
 972                 SETERROR(REG_ASSERT
);   /* just in case */ 
 978  - seterr - set an error condition 
 979  == static int seterr(register struct parse *p, int e); 
 981 static int                      /* useless but makes type checking happy */ 
 983 register struct parse 
*p
; 
 986         if (p
->error 
== 0)      /* keep earliest error condition */ 
 988         p
->next 
= nuls
;         /* try to bring things to a halt */ 
 990         return(0);              /* make the return value well-defined */ 
 994  - allocset - allocate a set of characters for [] 
 995  == static cset *allocset(register struct parse *p); 
 999 register struct parse 
*p
; 
1001         register int no 
= p
->g
->ncsets
++; 
1003         register size_t nbytes
; 
1005         register size_t css 
= (size_t)p
->g
->csetsize
; 
1008         if (no 
>= p
->ncsalloc
) {        /* need another column of space */ 
1009                 p
->ncsalloc 
+= CHAR_BIT
; 
1011                 assert(nc 
% CHAR_BIT 
== 0); 
1012                 nbytes 
= nc 
/ CHAR_BIT 
* css
; 
1013                 if (p
->g
->sets 
== NULL
) 
1014                         p
->g
->sets 
= (cset 
*)malloc(nc 
* sizeof(cset
)); 
1016                         p
->g
->sets 
= (cset 
*)realloc((char *)p
->g
->sets
, 
1018                 if (p
->g
->setbits 
== NULL
) 
1019                         p
->g
->setbits 
= (uch 
*)malloc(nbytes
); 
1021                         p
->g
->setbits 
= (uch 
*)realloc((char *)p
->g
->setbits
, 
1023                         /* xxx this isn't right if setbits is now NULL */ 
1024                         for (i 
= 0; i 
< no
; i
++) 
1025                                 p
->g
->sets
[i
].ptr 
= p
->g
->setbits 
+ css
*(i
/CHAR_BIT
); 
1027                 if (p
->g
->sets 
!= NULL 
&& p
->g
->setbits 
!= NULL
) 
1028                         (void) memset((char *)p
->g
->setbits 
+ (nbytes 
- css
), 
1032                         SETERROR(REG_ESPACE
); 
1033                         /* caller's responsibility not to do set ops */ 
1037         assert(p
->g
->sets 
!= NULL
);     /* xxx */ 
1038         cs 
= &p
->g
->sets
[no
]; 
1039         cs
->ptr 
= p
->g
->setbits 
+ css
*((no
)/CHAR_BIT
); 
1040         cs
->mask 
= 1 << ((no
) % CHAR_BIT
); 
1049  - freeset - free a now-unused set 
1050  == static void freeset(register struct parse *p, register cset *cs); 
1054 register struct parse 
*p
; 
1058         register cset 
*top 
= &p
->g
->sets
[p
->g
->ncsets
]; 
1059         register size_t css 
= (size_t)p
->g
->csetsize
; 
1061         for (i 
= 0; i 
< css
; i
++) 
1063         if (cs 
== top
-1)        /* recover only the easy case */ 
1068  - freezeset - final processing on a set of characters 
1069  == static int freezeset(register struct parse *p, register cset *cs); 
1071  * The main task here is merging identical sets.  This is usually a waste 
1072  * of time (although the hash code minimizes the overhead), but can win 
1073  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash 
1074  * is done using addition rather than xor -- all ASCII [aA] sets xor to 
1077 static int                      /* set number */ 
1079 register struct parse 
*p
; 
1082         register uch h 
= cs
->hash
; 
1084         register cset 
*top 
= &p
->g
->sets
[p
->g
->ncsets
]; 
1086         register size_t css 
= (size_t)p
->g
->csetsize
; 
1088         /* look for an earlier one which is the same */ 
1089         for (cs2 
= &p
->g
->sets
[0]; cs2 
< top
; cs2
++) 
1090                 if (cs2
->hash 
== h 
&& cs2 
!= cs
) { 
1092                         for (i 
= 0; i 
< css
; i
++) 
1093                                 if (!!CHIN(cs2
, i
) != !!CHIN(cs
, i
)) 
1099         if (cs2 
< top
) {        /* found one */ 
1104         return((int)(cs 
- p
->g
->sets
)); 
1108  - firstch - return first character in a set (which must have at least one) 
1109  == static int firstch(register struct parse *p, register cset *cs); 
1111 static int                      /* character; there is no "none" value */ 
1113 register struct parse 
*p
; 
1117         register size_t css 
= (size_t)p
->g
->csetsize
; 
1119         for (i 
= 0; i 
< css
; i
++) 
1123         return(0);              /* arbitrary */ 
1127  - nch - number of characters in a set 
1128  == static int nch(register struct parse *p, register cset *cs); 
1132 register struct parse 
*p
; 
1136         register size_t css 
= (size_t)p
->g
->csetsize
; 
1139         for (i 
= 0; i 
< css
; i
++) 
1146  - mcadd - add a collating element to a cset 
1147  == static void mcadd(register struct parse *p, register cset *cs, \ 
1148  ==     register char *cp); 
1152 register struct parse 
*p
; 
1156         register size_t oldend 
= cs
->smultis
; 
1158         cs
->smultis 
+= strlen(cp
) + 1; 
1159         if (cs
->multis 
== NULL
) 
1160                 cs
->multis 
= malloc(cs
->smultis
); 
1162                 cs
->multis 
= realloc(cs
->multis
, cs
->smultis
); 
1163         if (cs
->multis 
== NULL
) { 
1164                 SETERROR(REG_ESPACE
); 
1168         (void) strcpy(cs
->multis 
+ oldend 
- 1, cp
); 
1169         cs
->multis
[cs
->smultis 
- 1] = '\0'; 
1172 /* these functions don't seem to be used (yet?), suppress warnings */ 
1175  - mcsub - subtract a collating element from a cset 
1176  == static void mcsub(register cset *cs, register char *cp); 
1183         register char *fp 
= mcfind(cs
, cp
); 
1184         register size_t len 
= strlen(fp
); 
1187         (void) memmove(fp
, fp 
+ len 
+ 1, 
1188                                 cs
->smultis 
- (fp 
+ len 
+ 1 - cs
->multis
)); 
1191         if (cs
->smultis 
== 0) { 
1197         cs
->multis 
= realloc(cs
->multis
, cs
->smultis
); 
1198         assert(cs
->multis 
!= NULL
); 
1202  - mcin - is a collating element in a cset? 
1203  == static int mcin(register cset *cs, register char *cp); 
1210         return(mcfind(cs
, cp
) != NULL
); 
1214  - mcfind - find a collating element in a cset 
1215  == static char *mcfind(register cset *cs, register char *cp); 
1224         if (cs
->multis 
== NULL
) 
1226         for (p 
= cs
->multis
; *p 
!= '\0'; p 
+= strlen(p
) + 1) 
1227                 if (strcmp(cp
, p
) == 0) 
1234  - mcinvert - invert the list of collating elements in a cset 
1235  == static void mcinvert(register struct parse *p, register cset *cs); 
1237  * This would have to know the set of possibilities.  Implementation 
1242 register struct parse 
*p
; 
1245         assert(cs
->multis 
== NULL
);     /* xxx */ 
1249  - mccase - add case counterparts of the list of collating elements in a cset 
1250  == static void mccase(register struct parse *p, register cset *cs); 
1252  * This would have to know the set of possibilities.  Implementation 
1257 register struct parse 
*p
; 
1260         assert(cs
->multis 
== NULL
);     /* xxx */ 
1264  - isinsets - is this character in any sets? 
1265  == static int isinsets(register struct re_guts *g, int c); 
1267 static int                      /* predicate */ 
1269 register struct re_guts 
*g
; 
1274         register int ncols 
= (g
->ncsets
+(CHAR_BIT
-1)) / CHAR_BIT
; 
1275         register unsigned uc 
= (unsigned char)c
; 
1277         for (i 
= 0, col 
= g
->setbits
; i 
< ncols
; i
++, col 
+= g
->csetsize
) 
1284  - samesets - are these two characters in exactly the same sets? 
1285  == static int samesets(register struct re_guts *g, int c1, int c2); 
1287 static int                      /* predicate */ 
1289 register struct re_guts 
*g
; 
1295         register int ncols 
= (g
->ncsets
+(CHAR_BIT
-1)) / CHAR_BIT
; 
1296         register unsigned uc1 
= (unsigned char)c1
; 
1297         register unsigned uc2 
= (unsigned char)c2
; 
1299         for (i 
= 0, col 
= g
->setbits
; i 
< ncols
; i
++, col 
+= g
->csetsize
) 
1300                 if (col
[uc1
] != col
[uc2
]) 
1306  - categorize - sort out character categories 
1307  == static void categorize(struct parse *p, register struct re_guts *g); 
1312 register struct re_guts 
*g
; 
1314         register cat_t 
*cats 
= g
->categories
; 
1319         /* avoid making error situations worse */ 
1323         for (c 
= CHAR_MIN
; c 
<= CHAR_MAX
; c
++) 
1324                 if (cats
[c
] == 0 && isinsets(g
, c
)) { 
1325                         cat 
= g
->ncategories
++; 
1327                         for (c2 
= c
+1; c2 
<= CHAR_MAX
; c2
++) 
1328                                 if (cats
[c2
] == 0 && samesets(g
, c
, c2
)) 
1334  - dupl - emit a duplicate of a bunch of sops 
1335  == static sopno dupl(register struct parse *p, sopno start, sopno finish); 
1337 static sopno                    
/* start of duplicate */ 
1338 dupl(p
, start
, finish
) 
1339 register struct parse 
*p
; 
1340 sopno start
;                    /* from here */ 
1341 sopno finish
;                   /* to this less one */ 
1343         register sopno ret 
= HERE(); 
1344         register sopno len 
= finish 
- start
; 
1346         assert(finish 
>= start
); 
1349         enlarge(p
, p
->ssize 
+ len
);     /* this many unexpected additions */ 
1350         assert(p
->ssize 
>= p
->slen 
+ len
); 
1351         (void) memcpy((char *)(p
->strip 
+ p
->slen
), 
1352                 (char *)(p
->strip 
+ start
), (size_t)len
*sizeof(sop
)); 
1358  - doemit - emit a strip operator 
1359  == static void doemit(register struct parse *p, sop op, size_t opnd); 
1361  * It might seem better to implement this as a macro with a function as 
1362  * hard-case backup, but it's just too big and messy unless there are 
1363  * some changes to the data structures.  Maybe later. 
1367 register struct parse 
*p
; 
1371         /* avoid making error situations worse */ 
1375         /* deal with oversize operands ("can't happen", more or less) */ 
1376         assert(opnd 
< 1<<OPSHIFT
); 
1378         /* deal with undersized strip */ 
1379         if (p
->slen 
>= p
->ssize
) 
1380                 enlarge(p
, (p
->ssize
+1) / 2 * 3);       /* +50% */ 
1381         assert(p
->slen 
< p
->ssize
); 
1383         /* finally, it's all reduced to the easy case */ 
1384         p
->strip
[p
->slen
++] = SOP(op
, opnd
); 
1388  - doinsert - insert a sop into the strip 
1389  == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 
1392 doinsert(p
, op
, opnd
, pos
) 
1393 register struct parse 
*p
; 
1402         /* avoid making error situations worse */ 
1407         EMIT(op
, opnd
);         /* do checks, ensure space */ 
1408         assert(HERE() == sn
+1); 
1411         /* adjust paren pointers */ 
1413         for (i 
= 1; i 
< NPAREN
; i
++) { 
1414                 if (p
->pbegin
[i
] >= pos
) { 
1417                 if (p
->pend
[i
] >= pos
) { 
1422         memmove((char *)&p
->strip
[pos
+1], (char *)&p
->strip
[pos
], 
1423                                                 (HERE()-pos
-1)*sizeof(sop
)); 
1428  - dofwd - complete a forward reference 
1429  == static void dofwd(register struct parse *p, sopno pos, sop value); 
1432 dofwd(p
, pos
, value
) 
1433 register struct parse 
*p
; 
1437         /* avoid making error situations worse */ 
1441         assert(value 
< 1<<OPSHIFT
); 
1442         p
->strip
[pos
] = OP(p
->strip
[pos
]) | value
; 
1446  - enlarge - enlarge the strip 
1447  == static void enlarge(register struct parse *p, sopno size); 
1451 register struct parse 
*p
; 
1452 register sopno size
; 
1456         if (p
->ssize 
>= size
) 
1459         sp 
= (sop 
*)realloc(p
->strip
, size
*sizeof(sop
)); 
1461                 SETERROR(REG_ESPACE
); 
1469  - stripsnug - compact the strip 
1470  == static void stripsnug(register struct parse *p, register struct re_guts *g); 
1474 register struct parse 
*p
; 
1475 register struct re_guts 
*g
; 
1477         g
->nstates 
= p
->slen
; 
1478         g
->strip 
= (sop 
*)realloc((char *)p
->strip
, p
->slen 
* sizeof(sop
)); 
1479         if (g
->strip 
== NULL
) { 
1480                 SETERROR(REG_ESPACE
); 
1481                 g
->strip 
= p
->strip
; 
1486  - findmust - fill in must and mlen with longest mandatory literal string 
1487  == static void findmust(register struct parse *p, register struct re_guts *g); 
1489  * This algorithm could do fancy things like analyzing the operands of | 
1490  * for common subsequences.  Someday.  This code is simple and finds most 
1491  * of the interesting cases. 
1493  * Note that must and mlen got initialized during setup. 
1498 register struct re_guts 
*g
; 
1502         register sop 
*newstart
; 
1503         register sopno newlen
; 
1508         /* avoid making error situations worse */ 
1512         /* find the longest OCHAR sequence in strip */ 
1514         scan 
= g
->strip 
+ 1; 
1518                 case OCHAR
:             /* sequence member */ 
1519                         if (newlen 
== 0)                /* new sequence */ 
1520                                 newstart 
= scan 
- 1; 
1523                 case OPLUS_
:            /* things that don't break one */ 
1527                 case OQUEST_
:           /* things that must be skipped */ 
1533                                 /* assert() interferes w debug printouts */ 
1534                                 if (OP(s
) != O_QUEST 
&& OP(s
) != O_CH 
&& 
1539                         } while (OP(s
) != O_QUEST 
&& OP(s
) != O_CH
); 
1541                 default:                /* things that break a sequence */ 
1542                         if (newlen 
> g
->mlen
) {         /* ends one */ 
1549         } while (OP(s
) != OEND
); 
1551         if (g
->mlen 
== 0)               /* there isn't one */ 
1554         /* turn it into a character string */ 
1555         g
->must 
= malloc((size_t)g
->mlen 
+ 1); 
1556         if (g
->must 
== NULL
) {          /* argh; just forget it */ 
1562         for (i 
= g
->mlen
; i 
> 0; i
--) { 
1563                 while (OP(s 
= *scan
++) != OCHAR
) 
1565                 assert(cp 
< g
->must 
+ g
->mlen
); 
1566                 *cp
++ = (char)OPND(s
); 
1568         assert(cp 
== g
->must 
+ g
->mlen
); 
1569         *cp
++ = '\0';           /* just on general principles */ 
1573  - pluscount - count + nesting 
1574  == static sopno pluscount(register struct parse *p, register struct re_guts *g); 
1576 static sopno                    
/* nesting depth */ 
1579 register struct re_guts 
*g
; 
1583         register sopno plusnest 
= 0; 
1584         register sopno maxnest 
= 0; 
1587                 return(0);      /* there may not be an OEND */ 
1589         scan 
= g
->strip 
+ 1; 
1597                         if (plusnest 
> maxnest
) 
1602         } while (OP(s
) != OEND
);