regex/regcomp-fbsd.c

   1 /*-
   2  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
   3  * Copyright (c) 1992, 1993, 1994
   4  *      The Regents of the University of California.  All rights reserved.
   5  *
   6  * This code is derived from software contributed to Berkeley by
   7  * Henry Spencer.
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  * 1. Redistributions of source code must retain the above copyright
  13  *    notice, this list of conditions and the following disclaimer.
  14  * 2. Redistributions in binary form must reproduce the above copyright
  15  *    notice, this list of conditions and the following disclaimer in the
  16  *    documentation and/or other materials provided with the distribution.
  17  * 4. Neither the name of the University nor the names of its contributors
  18  *    may be used to endorse or promote products derived from this software
  19  *    without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  31  * SUCH DAMAGE.
  32  *
  33  *      @(#)regcomp.c   8.5 (Berkeley) 3/20/94
  34  */
  35
  36 #if defined(LIBC_SCCS) && !defined(lint)
  37 static char sccsid[] = "@(#)regcomp.c   8.5 (Berkeley) 3/20/94";
  38 #endif /* LIBC_SCCS and not lint */
  39 #include <sys/cdefs.h>
  40 __FBSDID("$FreeBSD: src/lib/libc/regex/regcomp.c,v 1.36 2007/06/11 03:05:54 delphij Exp $");
  41
  42 #include "xlocale_private.h"
  43
  44 #include <sys/types.h>
  45 #include <stdio.h>
  46 #include <string.h>
  47 #include <ctype.h>
  48 #include <limits.h>
  49 #include <stdlib.h>
  50 #include <regex.h>
  51 #include <runetype.h>
  52 #include <wchar.h>
  53 #include <wctype.h>
  54
  55 #include "collate.h"
  56
  57 #include "utils.h"
  58 #include "regex2.h"
  59
  60 #include "cname.h"
  61
  62 /*
  63  * parse structure, passed up and down to avoid global variables and
  64  * other clumsinesses
  65  */
  66 struct parse {
  67         char *next;             /* next character in RE */
  68         char *end;              /* end of string (-> NUL normally) */
  69         int error;              /* has an error been seen? */
  70         sop *strip;             /* malloced strip */
  71         sopno ssize;            /* malloced strip size (allocated) */
  72         sopno slen;             /* malloced strip length (used) */
  73         int ncsalloc;           /* number of csets allocated */
  74 #if __DARWIN_UNIX03
  75         int zerorepeats;
  76 #endif /* __DARWIN_UNIX03 */
  77         struct re_guts *g;
  78 #       define  NPAREN  10      /* we need to remember () 1-9 for back refs */
  79         sopno pbegin[NPAREN];   /* -> ( ([0] unused) */
  80         sopno pend[NPAREN];     /* -> ) ([0] unused) */
  81 };
  82
  83 /* ========= begin header generated by ./mkh ========= */
  84 #ifdef __cplusplus
  85 extern "C" {
  86 #endif
  87
  88 /* === regcomp.c === */
  89 static void p_ere(struct parse *p, wint_t stop);
  90 static void p_ere_exp(struct parse *p);
  91 static void p_str(struct parse *p);
  92 static void p_bre(struct parse *p, wint_t end1, wint_t end2);
  93 static int p_simp_re(struct parse *p, int starordinary);
  94 static int p_count(struct parse *p);
  95 static void p_bracket(struct parse *p);
  96 static void p_b_term(struct parse *p, cset *cs);
  97 static void p_b_cclass(struct parse *p, cset *cs);
  98 static void p_b_eclass(struct parse *p, cset *cs);
  99 static wint_t p_b_symbol(struct parse *p);
 100 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
 101 static wint_t othercase(wint_t ch, locale_t loc);
 102 static void bothcases(struct parse *p, wint_t ch);
 103 static void ordinary(struct parse *p, wint_t ch);
 104 static void nonnewline(struct parse *p);
 105 static void repeat(struct parse *p, sopno start, int from, int to);
 106 static int seterr(struct parse *p, int e);
 107 static cset *allocset(struct parse *p);
 108 static void freeset(struct parse *p, cset *cs);
 109 static void CHadd(struct parse *p, cset *cs, wint_t ch);
 110 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
 111 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
 112 static wint_t singleton(cset *cs, locale_t loc);
 113 static sopno dupl(struct parse *p, sopno start, sopno finish);
 114 static void doemit(struct parse *p, sop op, size_t opnd);
 115 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
 116 static void dofwd(struct parse *p, sopno pos, sop value);
 117 static void enlarge(struct parse *p, sopno size);
 118 static void stripsnug(struct parse *p, struct re_guts *g);
 119 static void findmust(struct parse *p, struct re_guts *g);
 120 static int altoffset(sop *scan, int offset);
 121 static void computejumps(struct parse *p, struct re_guts *g);
 122 static void computematchjumps(struct parse *p, struct re_guts *g);
 123 static sopno pluscount(struct parse *p, struct re_guts *g);
 124 static wint_t wgetnext(struct parse *p);
 125
 126 #ifdef __cplusplus
 127 }
 128 #endif
 129 /* ========= end header generated by ./mkh ========= */
 130
 131 static char nuls[10];           /* place to point scanner in event of error */
 132
 133 /*
 134  * macros for use with parse structure
 135  * BEWARE:  these know that the parse structure is named `p' !!!
 136  */
  137 #define PEEK()  (*p->next)	/* current character; no bounds check */
  138 #define PEEK2() (*(p->next+1))	/* character after the current one; no bounds check */
  139 #define MORE()  (p->next < p->end)	/* at least one character remains */
  140 #define MORE2() (p->next+1 < p->end)	/* at least two characters remain */
  141 #define SEE(c)  (MORE() && PEEK() == (c))	/* next character exists and is c */
  142 #define SEETWO(a, b)    (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))	/* next two are a, b */
  143 #define EAT(c)  ((SEE(c)) ? (NEXT(), 1) : 0)	/* consume c if it is next; 1 if consumed */
  144 #define EATTWO(a, b)    ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)	/* consume a,b if next; 1 if consumed */
  145 #define NEXT()  (p->next++)	/* advance one character */
  146 #define NEXT2() (p->next += 2)	/* advance two characters */
  147 #define NEXTn(n)        (p->next += (n))	/* advance n characters */
  148 #define GETNEXT()       (*p->next++)	/* fetch current character, then advance */
  149 #define WGETNEXT()      wgetnext(p)	/* fetch via wgetnext() */
  150 #define SETERROR(e)     seterr(p, (e))	/* record error e via seterr() */
  151 #define REQUIRE(co, e)  ((co) || SETERROR(e))	/* record error e unless co holds */
  152 #define MUSTSEE(c, e)   (REQUIRE(MORE() && PEEK() == (c), e))	/* error unless c is next (not consumed) */
  153 #define MUSTEAT(c, e)   (REQUIRE(MORE() && GETNEXT() == (c), e))	/* consumes a character if any; error unless it was c */
  154 #define MUSTNOTSEE(c, e)        (REQUIRE(!MORE() || PEEK() != (c), e))	/* error if c is next */
  155 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))	/* hand op and operand to doemit() */
  156 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)	/* doinsert() at pos, operand HERE()-pos+1 */
  157 #define AHEAD(pos)              dofwd(p, pos, HERE()-(pos))	/* dofwd() on pos with the distance to HERE() */
  158 #define ASTERN(sop, pos)        EMIT(sop, HERE()-pos)	/* emit op whose operand is the distance back to pos */
  159 #define HERE()          (p->slen)	/* strip length in use, i.e. index of the next slot */
  160 #define THERE()         (p->slen - 1)	/* index of the last slot in use */
  161 #define THERETHERE()    (p->slen - 2)	/* index of the slot before that */
  162 #define DROP(n) (p->slen -= (n))	/* shrink the used strip length by n */
 163
 164 #ifndef NDEBUG
 165 static int never = 0;           /* for use in asserts; shuts lint up */
 166 #else
 167 #define never   0               /* some <assert.h>s have bugs too */
 168 #endif
 169
  170 /* Macro used by computejumps()/computematchjumps() */
 171 #define MIN(a,b)        ((a)<(b)?(a):(b))
 172
 173 /*
 174  - regcomp - interface for parser and compilation
 175  = extern int regcomp(regex_t *, const char *, int);
 176  = #define      REG_BASIC       0000
 177  = #define      REG_EXTENDED    0001
 178  = #define      REG_ICASE       0002
 179  = #define      REG_NOSUB       0004
 180  = #define      REG_NEWLINE     0010
 181  = #define      REG_NOSPEC      0020
 182  = #define      REG_PEND        0040
 183  = #define      REG_DUMP        0200
 184  */
 185 int                             /* 0 success, otherwise REG_something */
 186 regcomp(regex_t * __restrict preg,
 187         const char * __restrict pattern,
 188         int cflags)
 189 {
 190         struct parse pa;
 191         struct re_guts *g;
 192         struct parse *p = &pa;
 193         int i;
 194         size_t len;
 195 #ifdef REDEBUG
 196 #       define  GOODFLAGS(f)    (f)
 197 #else
 198 #       define  GOODFLAGS(f)    ((f)&~REG_DUMP)
 199 #endif
 200
 201         cflags = GOODFLAGS(cflags);
 202         if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
 203                 return(REG_INVARG);
 204
 205         if (cflags&REG_PEND) {
 206                 if (preg->re_endp < pattern)
 207                         return(REG_INVARG);
 208                 len = preg->re_endp - pattern;
 209         } else
 210                 len = strlen((char *)pattern);
 211
 212         /* do the mallocs early so failure handling is easy */
 213         g = (struct re_guts *)malloc(sizeof(struct re_guts));
 214         if (g == NULL)
 215                 return(REG_ESPACE);
 216         p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
 217         p->strip = (sop *)malloc(p->ssize * sizeof(sop));
 218         p->slen = 0;
 219         if (p->strip == NULL) {
 220                 free((char *)g);
 221                 return(REG_ESPACE);
 222         }
 223
 224         /* set things up */
 225         p->g = g;
 226         p->next = (char *)pattern;      /* convenience; we do not modify it */
 227         p->end = p->next + len;
 228         p->error = 0;
 229         p->ncsalloc = 0;
 230 #if __DARWIN_UNIX03
 231         p->zerorepeats = 0;
 232 #endif /* __DARWIN_UNIX03 */
 233         for (i = 0; i < NPAREN; i++) {
 234                 p->pbegin[i] = 0;
 235                 p->pend[i] = 0;
 236         }
 237         g->loc = __current_locale();
 238         g->sets = NULL;
 239         g->ncsets = 0;
 240         g->cflags = cflags;
 241         g->iflags = 0;
 242         g->nbol = 0;
 243         g->neol = 0;
 244         g->must = NULL;
 245         g->moffset = -1;
 246         g->charjump = NULL;
 247         g->matchjump = NULL;
 248         g->mlen = 0;
 249         g->nsub = 0;
 250         g->backrefs = 0;
 251
 252         /* do it */
 253         EMIT(OEND, 0);
 254         g->firststate = THERE();
 255         if (cflags&REG_EXTENDED)
 256                 p_ere(p, OUT);
 257         else if (cflags&REG_NOSPEC)
 258                 p_str(p);
 259         else
 260                 p_bre(p, OUT, OUT);
 261         EMIT(OEND, 0);
 262         g->laststate = THERE();
 263
 264         /* tidy up loose ends and fill things in */
 265         stripsnug(p, g);
 266         findmust(p, g);
 267         /* only use Boyer-Moore algorithm if the pattern is bigger
 268          * than three characters
 269          */
 270         if(g->mlen > 3) {
 271                 computejumps(p, g);
 272                 computematchjumps(p, g);
 273                 if(g->matchjump == NULL && g->charjump != NULL) {
 274                         free(g->charjump);
 275                         g->charjump = NULL;
 276                 }
 277         }
 278         g->nplus = pluscount(p, g);
 279         g->magic = MAGIC2;
 280         preg->re_nsub = g->nsub;
 281         preg->re_g = g;
 282         preg->re_magic = MAGIC1;
 283 #ifndef REDEBUG
 284         /* not debugging, so can't rely on the assert() in regexec() */
 285         if (g->iflags&BAD)
 286                 SETERROR(REG_ASSERT);
 287 #endif
 288
 289         /* win or lose, we're done */
 290         if (p->error != 0)      /* lose */
 291                 regfree(preg);
 292         return(p->error);
 293 }
 294
 295 /*
 296  - p_ere - ERE parser top level, concatenation and alternation
 297  == static void p_ere(struct parse *p, int stop);
 298  */
 299 static void
 300 p_ere(struct parse *p,
 301         int stop)               /* character this ERE should end at */
 302 {
 303         char c;
 304         sopno prevback;
 305         sopno prevfwd;
 306         sopno conc;
 307         int first = 1;          /* is this the first alternative? */
 308
 309         for (;;) {
 310                 /* do a bunch of concatenated expressions */
 311                 conc = HERE();
 312                 while (MORE() && (c = PEEK()) != '|' && c != stop)
 313                         p_ere_exp(p);
 314 #if __DARWIN_UNIX03
 315                 if (!p->zerorepeats) REQUIRE(HERE() != conc, REG_EMPTY);        /* require nonempty */
 316                 else p->zerorepeats--;
 317 #else
 318                 (void)REQUIRE(HERE() != conc, REG_EMPTY);       /* require nonempty */
 319 #endif
 320                 if (!EAT('|'))
 321                         break;          /* NOTE BREAK OUT */
 322
 323                 if (first) {
 324                         INSERT(OCH_, conc);     /* offset is wrong */
 325                         prevfwd = conc;
 326                         prevback = conc;
 327                         first = 0;
 328                 }
 329                 ASTERN(OOR1, prevback);
 330                 prevback = THERE();
 331                 AHEAD(prevfwd);                 /* fix previous offset */
 332                 prevfwd = HERE();
 333                 EMIT(OOR2, 0);                  /* offset is very wrong */
 334         }
 335
 336         if (!first) {           /* tail-end fixups */
 337                 AHEAD(prevfwd);
 338                 ASTERN(O_CH, prevback);
 339         }
 340
 341         assert(!MORE() || SEE(stop));
 342 }
 343
 344 /*
 345  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
 346  == static void p_ere_exp(struct parse *p);
 347  */
 348 static void
 349 p_ere_exp(struct parse *p)
 350 {
 351         char c;
 352         wint_t wc;
 353         sopno pos;
 354         int count;
 355         int count2;
 356         sopno subno;
 357         int wascaret = 0;
 358
 359         assert(MORE());         /* caller should have ensured this */
 360         c = GETNEXT();
 361
 362         pos = HERE();
 363         switch (c) {
 364         case '(':
 365                 (void)REQUIRE(MORE(), REG_EPAREN);
 366                 p->g->nsub++;
 367                 subno = p->g->nsub;
 368                 if (subno < NPAREN)
 369                         p->pbegin[subno] = HERE();
 370                 EMIT(OLPAREN, subno);
 371                 if (!SEE(')'))
 372                         p_ere(p, ')');
 373                 if (subno < NPAREN) {
 374                         p->pend[subno] = HERE();
 375                         assert(p->pend[subno] != 0);
 376                 }
 377                 EMIT(ORPAREN, subno);
 378                 (void)MUSTEAT(')', REG_EPAREN);
 379                 break;
 380 #ifndef POSIX_MISTAKE
 381         case ')':               /* happens only if no current unmatched ( */
 382                 /*
 383                  * You may ask, why the ifndef?  Because I didn't notice
 384                  * this until slightly too late for 1003.2, and none of the
 385                  * other 1003.2 regular-expression reviewers noticed it at
 386                  * all.  So an unmatched ) is legal POSIX, at least until
 387                  * we can get it fixed.
 388                  */
 389                 SETERROR(REG_EPAREN);
 390                 break;
 391 #endif
 392         case '^':
 393                 EMIT(OBOL, 0);
 394                 p->g->iflags |= USEBOL;
 395                 p->g->nbol++;
 396                 wascaret = 1;
 397                 break;
 398         case '$':
 399                 EMIT(OEOL, 0);
 400                 p->g->iflags |= USEEOL;
 401                 p->g->neol++;
 402                 break;
 403         case '|':
 404                 SETERROR(REG_EMPTY);
 405                 break;
 406         case '*':
 407         case '+':
 408         case '?':
 409                 SETERROR(REG_BADRPT);
 410                 break;
 411         case '.':
 412                 if (p->g->cflags&REG_NEWLINE)
 413                         nonnewline(p);
 414                 else
 415                         EMIT(OANY, 0);
 416                 break;
 417         case '[':
 418                 p_bracket(p);
 419                 break;
 420         case '\\':
 421                 (void)REQUIRE(MORE(), REG_EESCAPE);
 422                 wc = WGETNEXT();
 423                 ordinary(p, wc);
 424                 break;
 425         case '{':               /* okay as ordinary except if digit follows */
 426                 (void)REQUIRE(!MORE() || !isdigit_l((uch)PEEK(), p->g->loc), REG_BADRPT);
 427                 /* FALLTHROUGH */
 428         default:
 429                 p->next--;
 430                 wc = WGETNEXT();
 431                 ordinary(p, wc);
 432                 break;
 433         }
 434
 435         if (!MORE())
 436                 return;
 437         c = PEEK();
 438         /* we call { a repetition if followed by a digit */
 439         if (!( c == '*' || c == '+' || c == '?' ||
 440                                 (c == '{' && MORE2() && isdigit_l((uch)PEEK2(), p->g->loc)) ))
 441                 return;         /* no repetition, we're done */
 442         NEXT();
 443
 444         (void)REQUIRE(!wascaret, REG_BADRPT);
 445         switch (c) {
 446         case '*':       /* implemented as +? */
 447                 /* this case does not require the (y|) trick, noKLUDGE */
 448                 INSERT(OPLUS_, pos);
 449                 ASTERN(O_PLUS, pos);
 450                 INSERT(OQUEST_, pos);
 451                 ASTERN(O_QUEST, pos);
 452                 break;
 453         case '+':
 454                 INSERT(OPLUS_, pos);
 455                 ASTERN(O_PLUS, pos);
 456                 break;
 457         case '?':
 458                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
 459                 INSERT(OCH_, pos);              /* offset slightly wrong */
 460                 ASTERN(OOR1, pos);              /* this one's right */
 461                 AHEAD(pos);                     /* fix the OCH_ */
 462                 EMIT(OOR2, 0);                  /* offset very wrong... */
 463                 AHEAD(THERE());                 /* ...so fix it */
 464                 ASTERN(O_CH, THERETHERE());
 465                 break;
 466         case '{':
 467                 count = p_count(p);
 468                 if (EAT(',')) {
 469                         if (isdigit_l((uch)PEEK(), p->g->loc)) {
 470                                 count2 = p_count(p);
 471                                 (void)REQUIRE(count <= count2, REG_BADBR);
 472                         } else          /* single number with comma */
 473                                 count2 = INFINITY;
 474                 } else          /* just a single number */
 475                         count2 = count;
 476                 repeat(p, pos, count, count2);
 477                 if (!EAT('}')) {        /* error heuristics */
 478                         while (MORE() && PEEK() != '}')
 479                                 NEXT();
 480                         (void)REQUIRE(MORE(), REG_EBRACE);
 481                         SETERROR(REG_BADBR);
 482                 }
 483                 break;
 484         }
 485
 486         if (!MORE())
 487                 return;
 488         c = PEEK();
 489         if (!( c == '*' || c == '+' || c == '?' ||
 490                                 (c == '{' && MORE2() && isdigit_l((uch)PEEK2(), p->g->loc)) ) )
 491                 return;
 492         SETERROR(REG_BADRPT);
 493 }
 494
 495 /*
 496  - p_str - string (no metacharacters) "parser"
 497  == static void p_str(struct parse *p);
 498  */
 499 static void
 500 p_str(struct parse *p)
 501 {
 502 #if __DARWIN_UNIX03
 503         if (!p->zerorepeats) REQUIRE(MORE(), REG_EMPTY);
 504         else p->zerorepeats--;
 505 #else  /* !__DARWIN_UNIX03 */
 506         (void)REQUIRE(MORE(), REG_EMPTY);
 507 #endif /* __DARWIN_UNIX03 */
 508         while (MORE())
 509                 ordinary(p, WGETNEXT());
 510 }
 511
 512 /*
 513  - p_bre - BRE parser top level, anchoring and concatenation
 514  == static void p_bre(struct parse *p, int end1, \
 515  ==     int end2);
 516  * Giving end1 as OUT essentially eliminates the end1/end2 check.
 517  *
 518  * This implementation is a bit of a kludge, in that a trailing $ is first
 519  * taken as an ordinary character and then revised to be an anchor.
 520  * The amount of lookahead needed to avoid this kludge is excessive.
 521  */
 522 static void
 523 p_bre(struct parse *p,
 524         int end1,               /* first terminating character */
 525         int end2)               /* second terminating character */
 526 {
 527         sopno start = HERE();
 528         int first = 1;                  /* first subexpression? */
 529         int wasdollar = 0;
 530
 531         if (EAT('^')) {
 532                 EMIT(OBOL, 0);
 533                 p->g->iflags |= USEBOL;
 534                 p->g->nbol++;
 535         }
 536         while (MORE() && !SEETWO(end1, end2)) {
 537                 wasdollar = p_simp_re(p, first);
 538                 first = 0;
 539         }
 540         if (wasdollar) {        /* oops, that was a trailing anchor */
 541                 DROP(1);
 542                 EMIT(OEOL, 0);
 543                 p->g->iflags |= USEEOL;
 544                 p->g->neol++;
 545         }
 546 #if __DARWIN_UNIX03
 547         if (!p->zerorepeats) REQUIRE(HERE() != start, REG_EMPTY);       /* require nonempty */
 548         else p->zerorepeats--;
 549 #else  /* !__DARWIN_UNIX03 */
 550         (void)REQUIRE(HERE() != start, REG_EMPTY);      /* require nonempty */
 551 #endif /* __DARWIN_UNIX03 */
 552 }
 553
 554 /*
 555  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
 556  == static int p_simp_re(struct parse *p, int starordinary);
 557  */
 558 static int                      /* was the simple RE an unbackslashed $? */
 559 p_simp_re(struct parse *p,
 560         int starordinary)       /* is a leading * an ordinary character? */
 561 {
 562         int c;
 563         int count;
 564         int count2;
 565         sopno pos;
 566         int i;
 567         wint_t wc;
 568         sopno subno;
 569 #       define  BACKSL  (1<<CHAR_BIT)
 570
 571         pos = HERE();           /* repetion op, if any, covers from here */
 572
 573         assert(MORE());         /* caller should have ensured this */
 574         c = GETNEXT();
 575         if (c == '\\') {
 576                 (void)REQUIRE(MORE(), REG_EESCAPE);
 577                 c = BACKSL | GETNEXT();
 578         }
 579         switch (c) {
 580         case '.':
 581                 if (p->g->cflags&REG_NEWLINE)
 582                         nonnewline(p);
 583                 else
 584                         EMIT(OANY, 0);
 585                 break;
 586         case '[':
 587                 p_bracket(p);
 588                 break;
 589         case BACKSL|'{':
 590                 SETERROR(REG_BADRPT);
 591                 break;
 592         case BACKSL|'(':
 593                 p->g->nsub++;
 594                 subno = p->g->nsub;
 595                 if (subno < NPAREN)
 596                         p->pbegin[subno] = HERE();
 597                 EMIT(OLPAREN, subno);
 598                 /* the MORE here is an error heuristic */
 599                 if (MORE() && !SEETWO('\\', ')'))
 600                         p_bre(p, '\\', ')');
 601                 if (subno < NPAREN) {
 602                         p->pend[subno] = HERE();
 603                         assert(p->pend[subno] != 0);
 604                 }
 605                 EMIT(ORPAREN, subno);
 606                 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
 607                 break;
 608         case BACKSL|')':        /* should not get here -- must be user */
 609         case BACKSL|'}':
 610                 SETERROR(REG_EPAREN);
 611                 break;
 612         case BACKSL|'1':
 613         case BACKSL|'2':
 614         case BACKSL|'3':
 615         case BACKSL|'4':
 616         case BACKSL|'5':
 617         case BACKSL|'6':
 618         case BACKSL|'7':
 619         case BACKSL|'8':
 620         case BACKSL|'9':
 621                 i = (c&~BACKSL) - '0';
 622                 assert(i < NPAREN);
 623                 if (p->pend[i] != 0) {
 624 #if __DARWIN_UNIX03
 625                         int skip = 1;
 626 #endif /* __DARWIN_UNIX03 */
 627                         assert(i <= p->g->nsub);
 628                         EMIT(OBACK_, i);
 629                         assert(p->pbegin[i] != 0);
 630                         assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
 631                         assert(OP(p->strip[p->pend[i]]) == ORPAREN);
 632 #if __DARWIN_UNIX03
 633                         if (OP(p->strip[p->pbegin[i]+skip]) == OBOL) {
 634                                 skip++;         /* don't dup anchor in subexp */
 635                         }
 636                         (void) dupl(p, p->pbegin[i]+skip, p->pend[i]);
 637 #else  /* !__DARWIN_UNIX03 */
 638                         (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
 639 #endif /* __DARWIN_UNIX03 */
 640                         EMIT(O_BACK, i);
 641                 } else
 642                         SETERROR(REG_ESUBREG);
 643                 p->g->backrefs = 1;
 644                 break;
 645         case '*':
 646                 (void)REQUIRE(starordinary, REG_BADRPT);
 647                 /* FALLTHROUGH */
 648         default:
 649                 p->next--;
 650                 wc = WGETNEXT();
 651                 ordinary(p, wc);
 652                 break;
 653         }
 654
 655         if (EAT('*')) {         /* implemented as +? */
 656                 /* this case does not require the (y|) trick, noKLUDGE */
 657                 INSERT(OPLUS_, pos);
 658                 ASTERN(O_PLUS, pos);
 659                 INSERT(OQUEST_, pos);
 660                 ASTERN(O_QUEST, pos);
 661         } else if (EATTWO('\\', '{')) {
 662                 (void)REQUIRE(MORE(), REG_EBRACE);
 663                 count = p_count(p);
 664                 if (EAT(',')) {
 665                         if (MORE() && isdigit_l((uch)PEEK(), p->g->loc)) {
 666                                 count2 = p_count(p);
 667                                 (void)REQUIRE(count <= count2, REG_BADBR);
 668                         } else          /* single number with comma */
 669                                 count2 = INFINITY;
 670                 } else          /* just a single number */
 671                         count2 = count;
 672                 repeat(p, pos, count, count2);
 673                 if (!EATTWO('\\', '}')) {       /* error heuristics */
 674                         while (MORE() && !SEETWO('\\', '}'))
 675                                 NEXT();
 676                         (void)REQUIRE(MORE(), REG_EBRACE);
 677                         SETERROR(REG_BADBR);
 678                 }
 679         } else if (c == '$')     /* $ (but not \$) ends it */
 680                 return(1);
 681
 682         return(0);
 683 }
 684
 685 /*
 686  - p_count - parse a repetition count
 687  == static int p_count(struct parse *p);
 688  */
 689 static int                      /* the value */
 690 p_count(struct parse *p)
 691 {
 692         int count = 0;
 693         int ndigits = 0;
 694
 695         while (MORE() && isdigit_l((uch)PEEK(), p->g->loc) && count <= DUPMAX) {
 696                 count = count*10 + (GETNEXT() - '0');
 697                 ndigits++;
 698         }
 699
 700         (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
 701         return(count);
 702 }
 703
 704 /*
 705  - p_bracket - parse a bracketed character list
 706  == static void p_bracket(struct parse *p);
 707  */
 708 static void
 709 p_bracket(struct parse *p)
 710 {
 711         cset *cs;
 712         wint_t ch;
 713
 714         /* Dept of Truly Sickening Special-Case Kludges */
 715         if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
 716                 EMIT(OBOW, 0);
 717                 NEXTn(6);
 718                 return;
 719         }
 720         if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
 721                 EMIT(OEOW, 0);
 722                 NEXTn(6);
 723                 return;
 724         }
 725
 726         if ((cs = allocset(p)) == NULL)
 727                 return;
 728
 729         if (p->g->cflags&REG_ICASE)
 730                 cs->icase = 1;
 731         if (EAT('^'))
 732                 cs->invert = 1;
 733 #if __DARWIN_UNIX03
 734         if (PEEK2() != '-' && PEEK2() != ']') { /* Don't eat '-' or ']' if they're part of ranges
 735                                                  * but do process [^-] */
 736         if (EAT(']'))
 737                 CHadd(p, cs, ']');
 738         else if (EAT('-'))
 739                 CHadd(p, cs, '-');
 740         }
 741         if (MORE() && !SEETWO('-',']')) /* Parse RE []-'] */
 742                 p_b_term(p, cs);
 743 #else /* !__DARWIN_UNIX03 */
 744         if (EAT(']'))
 745                 CHadd(p, cs, ']');
 746         else if (EAT('-'))
 747                 CHadd(p, cs, '-');
 748 #endif /* __DARWIN_UNIX03 */
 749         while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
 750                 p_b_term(p, cs);
 751         if (EAT('-'))
 752                 CHadd(p, cs, '-');
 753         (void)MUSTEAT(']', REG_EBRACK);
 754
 755         if (p->error != 0)      /* don't mess things up further */
 756                 return;
 757
 758         if (cs->invert && p->g->cflags&REG_NEWLINE)
 759                 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
 760
 761         if ((ch = singleton(cs, p->g->loc)) != OUT) {   /* optimize singleton sets */
 762                 ordinary(p, ch);
 763                 freeset(p, cs);
 764         } else
 765                 EMIT(OANYOF, (int)(cs - p->g->sets));
 766 }
 767
 768 /*
 769  - p_b_term - parse one term of a bracketed character list
 770  == static void p_b_term(struct parse *p, cset *cs);
 771  */
 772 static void
 773 p_b_term(struct parse *p, cset *cs)
 774 {
 775         char c;
 776         wint_t start, finish;
 777         wint_t i;
 778
 779         /* classify what we've got */
 780         switch ((MORE()) ? PEEK() : '\0') {
 781         case '[':
 782                 c = (MORE2()) ? PEEK2() : '\0';
 783                 break;
 784         case '-':
 785 #if __DARWIN_UNIX03
 786                 if (PEEK2() != '-') { /* Allow [---] */
 787                 SETERROR(REG_ERANGE);
 788                 return;                 /* NOTE RETURN */
 789                 } else
 790                         c = '-';
 791 #else  /* !__DARWIN_UNIX03 */
 792                 SETERROR(REG_ERANGE);
 793                 return;                 /* NOTE RETURN */
 794 #endif /* __DARWIN_UNIX03 */
 795                 break;
 796         default:
 797                 c = '\0';
 798                 break;
 799         }
 800
 801         switch (c) {
 802         case ':':               /* character class */
 803                 NEXT2();
 804                 (void)REQUIRE(MORE(), REG_EBRACK);
 805                 c = PEEK();
 806                 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
 807                 p_b_cclass(p, cs);
 808                 (void)REQUIRE(MORE(), REG_EBRACK);
 809                 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
 810                 break;
 811         case '=':               /* equivalence class */
 812                 NEXT2();
 813                 (void)REQUIRE(MORE(), REG_EBRACK);
 814                 c = PEEK();
 815 #if __DARWIN_UNIX03
 816                 REQUIRE(c != '-', REG_ECOLLATE); /* allow [=]=] */
 817 #else  /* !__DARWIN_UNIX03 */
 818                 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
 819 #endif /* __DARWIN_UNIX03 */
 820                 p_b_eclass(p, cs);
 821                 (void)REQUIRE(MORE(), REG_EBRACK);
 822                 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
 823                 break;
 824         default:                /* symbol, ordinary character, or range */
 825                 start = p_b_symbol(p);
 826                 if (SEE('-') && MORE2() && PEEK2() != ']') {
 827                         /* range */
 828                         NEXT();
 829                         if (EAT('-'))
 830                                 finish = '-';
 831                         else
 832                                 finish = p_b_symbol(p);
 833                 } else
 834                         finish = start;
 835                 if (start == finish)
 836                         CHadd(p, cs, start);
 837                 else {
 838                         if (p->g->loc->__collate_load_error) {
 839                                 (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
 840                                 CHaddrange(p, cs, start, finish);
 841                         } else {
 842                                 (void)REQUIRE(__collate_range_cmp(start, finish, p->g->loc) <= 0, REG_ERANGE);
 843                                 for (i = 0; i <= UCHAR_MAX; i++) {
 844                                         if (   __collate_range_cmp(start, i, p->g->loc) <= 0
 845                                             && __collate_range_cmp(i, finish, p->g->loc) <= 0
 846                                            )
 847                                                 CHadd(p, cs, i);
 848                                 }
 849                         }
 850                 }
 851                 break;
 852         }
 853 }
 854
 855 /*
 856  - p_b_cclass - parse a character-class name and deal with it
 857  == static void p_b_cclass(struct parse *p, cset *cs);
 858  */
 859 static void
 860 p_b_cclass(struct parse *p, cset *cs)
 861 {
 862         char *sp = p->next;
 863         size_t len;
 864         wctype_t wct;
 865         char clname[16];
 866
 867         while (MORE() && isalpha_l((uch)PEEK(), p->g->loc))
 868                 NEXT();
 869         len = p->next - sp;
 870         if (len >= sizeof(clname) - 1) {
 871                 SETERROR(REG_ECTYPE);
 872                 return;
 873         }
 874         memcpy(clname, sp, len);
 875         clname[len] = '\0';
 876         if ((wct = wctype_l(clname, p->g->loc)) == 0) {
 877                 SETERROR(REG_ECTYPE);
 878                 return;
 879         }
 880         CHaddtype(p, cs, wct);
 881 }
 882
 883 /*
 884  - p_b_eclass - parse an equivalence-class name and deal with it
 885  == static void p_b_eclass(struct parse *p, cset *cs);
 886  */
 887 static void
 888 p_b_eclass(struct parse *p, cset *cs)
 889 {
 890         char *sp = p->next;
 891         int len, ec;
 892         mbstate_t mbs;
 893         int *newequiv_classes;
 894         wint_t c;
 895
 896         while (MORE() && !SEETWO('=', ']'))
 897                 NEXT();
 898         if (!MORE()) {
 899                 SETERROR(REG_EBRACK);
 900                 return;
 901         }
 902         len = p->next - sp;
 903         memset(&mbs, 0, sizeof(mbs));
 904         ec = __collate_equiv_class(sp, len, &mbs, p->g->loc);
 905         if (ec > 0) {
 906                 newequiv_classes = realloc(cs->equiv_classes,
 907                     (cs->nequiv_classes + 1) * sizeof(*cs->equiv_classes));
 908                 if (newequiv_classes == NULL) {
 909                         SETERROR(REG_ESPACE);
 910                         return;
 911                 }
 912                 cs->equiv_classes = newequiv_classes;
 913                 cs->equiv_classes[cs->nequiv_classes++] = ec;
 914                 return;
 915         }
 916         /* not an equivalence class, so fallback to a collating element */
 917         p->next = sp;
 918         c = p_b_coll_elem(p, '=');
 919         CHadd(p, cs, c);
 920 }
 921
 922 /*
 923  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
 924  == static char p_b_symbol(struct parse *p);
 925  */
 926 static wint_t                   /* value of symbol */
 927 p_b_symbol(struct parse *p)
 928 {
 929         wint_t value;
 930
 931         (void)REQUIRE(MORE(), REG_EBRACK);
 932         if (!EATTWO('[', '.'))
 933                 return(WGETNEXT());
 934
 935         /* collating symbol */
 936         value = p_b_coll_elem(p, '.');
 937         (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
 938         return(value);
 939 }
 940
 941 /*
 942  - p_b_coll_elem - parse a collating-element name and look it up
 943  == static char p_b_coll_elem(struct parse *p, int endc);
 944  */
 945 static wint_t                   /* value of collating element */
 946 p_b_coll_elem(struct parse *p,
 947         wint_t endc)            /* name ended by endc,']' */
 948 {
 949         char *sp = p->next;
 950         const struct cname *cp;
 951         int len;
 952         mbstate_t mbs;
 953         wchar_t wbuf[16];
 954         size_t clen;
 955
 956         while (MORE() && !SEETWO(endc, ']'))
 957                 NEXT();
 958         if (!MORE()) {
 959                 SETERROR(REG_EBRACK);
 960                 return(0);
 961         }
 962         len = p->next - sp;
 963         for (cp = cnames; cp->name != NULL; cp++)
 964                 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
 965                         return(cp->code);       /* known name */
 966         memset(&mbs, 0, sizeof(mbs));
 967         clen = __collate_collating_symbol(wbuf, 16, sp, len, &mbs, p->g->loc);
 968         if (clen == 1)
 969                 return (*wbuf);                 /* single character */
 970         else if (clen == (size_t)-1)
 971                 SETERROR(REG_ILLSEQ);
 972         else
 973                 SETERROR(REG_ECOLLATE);         /* neither */
 974         return(0);
 975 }
 976
 977 /*
 978  - othercase - return the case counterpart of an alphabetic
 979  == static char othercase(wint_t ch, locale_t loc);
 980  */
 981 static wint_t                   /* if no counterpart, return ch */
 982 othercase(wint_t ch, locale_t loc)
 983 {
 984         assert(iswalpha_l(ch, loc));
 985         if (iswupper_l(ch, loc))
 986                 return(towlower_l(ch, loc));
 987         else if (iswlower_l(ch, loc))
 988                 return(towupper_l(ch, loc));
 989         else                    /* peculiar, but could happen */
 990                 return(ch);
 991 }
 992
 993 /*
 994  - bothcases - emit a dualcase version of a two-case character
 995  == static void bothcases(struct parse *p, int ch);
 996  *
 997  * Boy, is this implementation ever a kludge...
 998  */
 999 static void
1000 bothcases(struct parse *p, wint_t ch)
1001 {
1002         char *oldnext = p->next;
1003         char *oldend = p->end;
1004         char bracket[3 + MB_LEN_MAX];
1005         size_t n;
1006         mbstate_t mbs;
1007
1008         assert(othercase(ch, p->g->loc) != ch); /* p_bracket() would recurse */
1009         p->next = bracket;
1010         memset(&mbs, 0, sizeof(mbs));
1011         n = wcrtomb_l(bracket, ch, &mbs, p->g->loc);
1012         assert(n != (size_t)-1);
1013         bracket[n] = ']';
1014         bracket[n + 1] = '\0';
1015         p->end = bracket+n+1;
1016         p_bracket(p);
1017         assert(p->next == p->end);
1018         p->next = oldnext;
1019         p->end = oldend;
1020 }
1021
1022 /*
1023  - ordinary - emit an ordinary character
1024  == static void ordinary(struct parse *p, int ch);
1025  */
1026 static void
1027 ordinary(struct parse *p, wint_t ch)
1028 {
1029         cset *cs;
1030
1031         if ((p->g->cflags&REG_ICASE) && iswalpha_l(ch, p->g->loc) && othercase(ch, p->g->loc) != ch)
1032                 bothcases(p, ch);
1033         else if ((ch & OPDMASK) == ch)
1034                 EMIT(OCHAR, ch);
1035         else {
1036                 /*
1037                  * Kludge: character is too big to fit into an OCHAR operand.
1038                  * Emit a singleton set.
1039                  */
1040                 if ((cs = allocset(p)) == NULL)
1041                         return;
1042                 CHadd(p, cs, ch);
1043                 EMIT(OANYOF, (int)(cs - p->g->sets));
1044         }
1045 }
1046
1047 /*
1048  - nonnewline - emit REG_NEWLINE version of OANY
1049  == static void nonnewline(struct parse *p);
1050  *
1051  * Boy, is this implementation ever a kludge...
1052  */
1053 static void
1054 nonnewline(struct parse *p)
1055 {
1056         char *oldnext = p->next;
1057         char *oldend = p->end;
1058         char bracket[4];
1059
1060         p->next = bracket;
1061         p->end = bracket+3;
1062         bracket[0] = '^';
1063         bracket[1] = '\n';
1064         bracket[2] = ']';
1065         bracket[3] = '\0';
1066         p_bracket(p);
1067         assert(p->next == bracket+3);
1068         p->next = oldnext;
1069         p->end = oldend;
1070 }
1071
1072 /*
1073  - repeat - generate code for a bounded repetition, recursively if needed
1074  == static void repeat(struct parse *p, sopno start, int from, int to);
1075  */
1076 static void
1077 repeat(struct parse *p,
1078         sopno start,            /* operand from here to end of strip */
1079         int from,               /* repeated from this number */
1080         int to)                 /* to this number of times (maybe INFINITY) */
1081 {
1082         sopno finish = HERE();
1083 #       define  N       2
1084 #       define  INF     3
1085 #       define  REP(f, t)       ((f)*8 + (t))
1086 #       define  MAP(n)  (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1087         sopno copy;
1088
1089         if (p->error != 0)      /* head off possible runaway recursion */
1090                 return;
1091
1092         assert(from <= to);
1093
1094         switch (REP(MAP(from), MAP(to))) {
1095         case REP(0, 0):                 /* must be user doing this */
1096                 DROP(finish-start);     /* drop the operand */
1097 #if __DARWIN_UNIX03
1098                 p->zerorepeats++;
1099 #endif /* __DARWIN_UNIX03 */
1100                 break;
1101         case REP(0, INF):               /* as x{1,}? */
1102 #if __DARWIN_UNIX03
1103                 /* this case does not require the (y|) trick, noKLUDGE */
1104                 /* Just like * =+?  */
1105                 INSERT(OPLUS_, start);
1106                 ASTERN(O_PLUS, start);
1107                 INSERT(OQUEST_, start);
1108                 ASTERN(O_QUEST, start);
1109                 break;
1110 #endif /* __DARWIN_UNIX03 */
1111         case REP(0, 1):                 /* as x{1,1}? */
1112         case REP(0, N):                 /* as x{1,n}? */
1113                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1114                 INSERT(OCH_, start);            /* offset is wrong... */
1115                 repeat(p, start+1, 1, to);
1116                 ASTERN(OOR1, start);
1117                 AHEAD(start);                   /* ... fix it */
1118                 EMIT(OOR2, 0);
1119                 AHEAD(THERE());
1120                 ASTERN(O_CH, THERETHERE());
1121                 break;
1122         case REP(1, 1):                 /* trivial case */
1123                 /* done */
1124                 break;
1125         case REP(1, N):                 /* as x?x{1,n-1} */
1126 #if __DARWIN_UNIX03
1127                 INSERT(OQUEST_, start);
1128                 ASTERN(O_QUEST, start);
1129 #else /* !__DARWIN_UNIX03 */
1130                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1131                 INSERT(OCH_, start);
1132                 ASTERN(OOR1, start);
1133                 AHEAD(start);
1134                 EMIT(OOR2, 0);                  /* offset very wrong... */
1135                 AHEAD(THERE());                 /* ...so fix it */
1136                 ASTERN(O_CH, THERETHERE());
1137 #endif /* __DARWIN_UNIX03 */
1138                 copy = dupl(p, start+1, finish+1);
1139                 assert(copy == finish+4);
1140                 repeat(p, copy, 1, to-1);
1141                 break;
1142         case REP(1, INF):               /* as x+ */
1143                 INSERT(OPLUS_, start);
1144                 ASTERN(O_PLUS, start);
1145                 break;
1146         case REP(N, N):                 /* as xx{m-1,n-1} */
1147                 copy = dupl(p, start, finish);
1148                 repeat(p, copy, from-1, to-1);
1149                 break;
1150         case REP(N, INF):               /* as xx{n-1,INF} */
1151                 copy = dupl(p, start, finish);
1152                 repeat(p, copy, from-1, to);
1153                 break;
1154         default:                        /* "can't happen" */
1155                 SETERROR(REG_ASSERT);   /* just in case */
1156                 break;
1157         }
1158 }
1159
1160 /*
1161  - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1162  - character from the parse struct, signals a REG_ILLSEQ error if the
1163  - character can't be converted. Returns the number of bytes consumed.
1164  */
1165 static wint_t
1166 wgetnext(struct parse *p)
1167 {
1168         mbstate_t mbs;
1169         wchar_t wc;
1170         size_t n;
1171
1172         memset(&mbs, 0, sizeof(mbs));
1173         n = mbrtowc_l(&wc, p->next, p->end - p->next, &mbs, p->g->loc);
1174         if (n == (size_t)-1 || n == (size_t)-2) {
1175                 SETERROR(REG_ILLSEQ);
1176                 return (0);
1177         }
1178         if (n == 0)
1179                 n = 1;
1180         p->next += n;
1181         return (wc);
1182 }
1183
1184 /*
1185  - seterr - set an error condition
1186  == static int seterr(struct parse *p, int e);
1187  */
1188 static int                      /* useless but makes type checking happy */
1189 seterr(struct parse *p, int e)
1190 {
1191         if (p->error == 0)      /* keep earliest error condition */
1192                 p->error = e;
1193         p->next = nuls;         /* try to bring things to a halt */
1194         p->end = nuls;
1195         return(0);              /* make the return value well-defined */
1196 }
1197
1198 /*
1199  - allocset - allocate a set of characters for []
1200  == static cset *allocset(struct parse *p);
1201  */
1202 static cset *
1203 allocset(struct parse *p)
1204 {
1205         cset *cs, *ncs;
1206
1207         ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs));
1208         if (ncs == NULL) {
1209                 SETERROR(REG_ESPACE);
1210                 return (NULL);
1211         }
1212         p->g->sets = ncs;
1213         cs = &p->g->sets[p->g->ncsets++];
1214         memset(cs, 0, sizeof(*cs));
1215
1216         return(cs);
1217 }
1218
1219 /*
1220  - freeset - free a now-unused set
1221  == static void freeset(struct parse *p, cset *cs);
1222  */
1223 static void
1224 freeset(struct parse *p, cset *cs)
1225 {
1226         cset *top = &p->g->sets[p->g->ncsets];
1227
1228         free(cs->wides);
1229         free(cs->ranges);
1230         free(cs->types);
1231         memset(cs, 0, sizeof(*cs));
1232         if (cs == top-1)        /* recover only the easy case */
1233                 p->g->ncsets--;
1234 }
1235
1236 /*
1237  - singleton - Determine whether a set contains only one character,
1238  - returning it if so, otherwise returning OUT.
1239  */
1240 static wint_t
1241 singleton(cset *cs, locale_t loc)
1242 {
1243         wint_t i, s, n;
1244
1245         for (i = n = 0; i < NC; i++)
1246                 if (CHIN(cs, i, loc)) {
1247                         n++;
1248                         s = i;
1249                 }
1250         if (n == 1)
1251                 return (s);
1252         if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1253             cs->icase == 0)
1254                 return (cs->wides[0]);
1255         /* Don't bother handling the other cases. */
1256         return (OUT);
1257 }
1258
1259 /*
1260  - CHadd - add character to character set.
1261  */
1262 static void
1263 CHadd(struct parse *p, cset *cs, wint_t ch)
1264 {
1265         wint_t nch, *newwides;
1266         assert(ch >= 0);
1267         if (ch < NC)
1268                 cs->bmp[ch >> 3] |= 1 << (ch & 7);
1269         else {
1270                 newwides = realloc(cs->wides, (cs->nwides + 1) *
1271                     sizeof(*cs->wides));
1272                 if (newwides == NULL) {
1273                         SETERROR(REG_ESPACE);
1274                         return;
1275                 }
1276                 cs->wides = newwides;
1277                 cs->wides[cs->nwides++] = ch;
1278         }
1279         if (cs->icase) {
1280                 if ((nch = towlower_l(ch, p->g->loc)) < NC)
1281                         cs->bmp[nch >> 3] |= 1 << (nch & 7);
1282                 if ((nch = towupper_l(ch, p->g->loc)) < NC)
1283                         cs->bmp[nch >> 3] |= 1 << (nch & 7);
1284         }
1285 }
1286
1287 /*
1288  - CHaddrange - add all characters in the range [min,max] to a character set.
1289  */
1290 static void
1291 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1292 {
1293         crange *newranges;
1294
1295         for (; min < NC && min <= max; min++)
1296                 CHadd(p, cs, min);
1297         if (min >= max)
1298                 return;
1299         newranges = realloc(cs->ranges, (cs->nranges + 1) *
1300             sizeof(*cs->ranges));
1301         if (newranges == NULL) {
1302                 SETERROR(REG_ESPACE);
1303                 return;
1304         }
1305         cs->ranges = newranges;
1306         cs->ranges[cs->nranges].min = min;
1307         cs->ranges[cs->nranges].min = max;
1308         cs->nranges++;
1309 }
1310
1311 /*
1312  - CHaddtype - add all characters of a certain type to a character set.
1313  */
1314 static void
1315 CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1316 {
1317         wint_t i;
1318         wctype_t *newtypes;
1319
1320         for (i = 0; i < NC; i++)
1321                 if (iswctype_l(i, wct, p->g->loc))
1322                         CHadd(p, cs, i);
1323         newtypes = realloc(cs->types, (cs->ntypes + 1) *
1324             sizeof(*cs->types));
1325         if (newtypes == NULL) {
1326                 SETERROR(REG_ESPACE);
1327                 return;
1328         }
1329         cs->types = newtypes;
1330         cs->types[cs->ntypes++] = wct;
1331 }
1332
1333 /*
1334  - dupl - emit a duplicate of a bunch of sops
1335  == static sopno dupl(struct parse *p, sopno start, sopno finish);
1336  */
1337 static sopno                    /* start of duplicate */
1338 dupl(struct parse *p,
1339         sopno start,            /* from here */
1340         sopno finish)           /* to this less one */
1341 {
1342         sopno ret = HERE();
1343         sopno len = finish - start;
1344
1345         assert(finish >= start);
1346         if (len == 0)
1347                 return(ret);
1348         enlarge(p, p->ssize + len);     /* this many unexpected additions */
1349         assert(p->ssize >= p->slen + len);
1350         (void) memcpy((char *)(p->strip + p->slen),
1351                 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1352         p->slen += len;
1353         return(ret);
1354 }
1355
1356 /*
1357  - doemit - emit a strip operator
1358  == static void doemit(struct parse *p, sop op, size_t opnd);
1359  *
1360  * It might seem better to implement this as a macro with a function as
1361  * hard-case backup, but it's just too big and messy unless there are
1362  * some changes to the data structures.  Maybe later.
1363  */
1364 static void
1365 doemit(struct parse *p, sop op, size_t opnd)
1366 {
1367         /* avoid making error situations worse */
1368         if (p->error != 0)
1369                 return;
1370
1371         /* deal with oversize operands ("can't happen", more or less) */
1372         assert(opnd < 1<<OPSHIFT);
1373
1374         /* deal with undersized strip */
1375         if (p->slen >= p->ssize)
1376                 enlarge(p, (p->ssize+1) / 2 * 3);       /* +50% */
1377         assert(p->slen < p->ssize);
1378
1379         /* finally, it's all reduced to the easy case */
1380         p->strip[p->slen++] = SOP(op, opnd);
1381 }
1382
1383 /*
1384  - doinsert - insert a sop into the strip
1385  == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1386  */
1387 static void
1388 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1389 {
1390         sopno sn;
1391         sop s;
1392         int i;
1393
1394         /* avoid making error situations worse */
1395         if (p->error != 0)
1396                 return;
1397
1398         sn = HERE();
1399         EMIT(op, opnd);         /* do checks, ensure space */
1400         assert(HERE() == sn+1);
1401         s = p->strip[sn];
1402
1403         /* adjust paren pointers */
1404         assert(pos > 0);
1405         for (i = 1; i < NPAREN; i++) {
1406                 if (p->pbegin[i] >= pos) {
1407                         p->pbegin[i]++;
1408                 }
1409                 if (p->pend[i] >= pos) {
1410                         p->pend[i]++;
1411                 }
1412         }
1413
1414         memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1415                                                 (HERE()-pos-1)*sizeof(sop));
1416         p->strip[pos] = s;
1417 }
1418
1419 /*
1420  - dofwd - complete a forward reference
1421  == static void dofwd(struct parse *p, sopno pos, sop value);
1422  */
1423 static void
1424 dofwd(struct parse *p, sopno pos, sop value)
1425 {
1426         /* avoid making error situations worse */
1427         if (p->error != 0)
1428                 return;
1429
1430         assert(value < 1<<OPSHIFT);
1431         p->strip[pos] = OP(p->strip[pos]) | value;
1432 }
1433
1434 /*
1435  - enlarge - enlarge the strip
1436  == static void enlarge(struct parse *p, sopno size);
1437  */
1438 static void
1439 enlarge(struct parse *p, sopno size)
1440 {
1441         sop *sp;
1442
1443         if (p->ssize >= size)
1444                 return;
1445
1446         sp = (sop *)realloc(p->strip, size*sizeof(sop));
1447         if (sp == NULL) {
1448                 SETERROR(REG_ESPACE);
1449                 return;
1450         }
1451         p->strip = sp;
1452         p->ssize = size;
1453 }
1454
1455 /*
1456  - stripsnug - compact the strip
1457  == static void stripsnug(struct parse *p, struct re_guts *g);
1458  */
1459 static void
1460 stripsnug(struct parse *p, struct re_guts *g)
1461 {
1462         g->nstates = p->slen;
1463         g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1464         if (g->strip == NULL) {
1465                 SETERROR(REG_ESPACE);
1466                 g->strip = p->strip;
1467         }
1468 }
1469
/*
 - findmust - fill in must and mlen with longest mandatory literal string
 == static void findmust(struct parse *p, struct re_guts *g);
 *
 * This algorithm could do fancy things like analyzing the operands of |
 * for common subsequences.  Someday.  This code is simple and finds most
 * of the interesting cases.
 *
 * Note that must and mlen got initialized during setup.
 *
 * Besides g->must and g->mlen this also maintains g->moffset (set to -1
 * when no usable offset or no must string exists) and sets BAD in
 * g->iflags if the strip turns out to be malformed while skipping an
 * optional/alternation group.  Lengths are counted in bytes of the
 * multibyte encoding, as measured with wcrtomb_l().
 */
static void
findmust(struct parse *p, struct re_guts *g)
{
        sop *scan;
        sop *start;
        sop *newstart;
        sopno newlen;
        sop s;
        char *cp;
        int offset;
        char buf[MB_LEN_MAX];
        size_t clen;
        mbstate_t mbs;
        struct __xlocale_st_runelocale *rl = p->g->loc->__lc_ctype;

        /* avoid making error situations worse */
        if (p->error != 0)
                return;

        /*
         * It's not generally safe to do a ``char'' substring search on
         * multibyte character strings, but it's safe for at least
         * UTF-8 (see RFC 3629).
         */
        if (rl->__mb_cur_max > 1 &&
            strcmp(rl->_CurrentRuneLocale.__encoding, "UTF-8") != 0)
                return;

        /* find the longest OCHAR sequence in strip */
        newlen = 0;
        offset = 0;
        g->moffset = 0;
        scan = g->strip + 1;    /* start at the second sop of the strip */
        do {
                s = *scan++;
                switch (OP(s)) {
                case OCHAR:             /* sequence member */
                        if (newlen == 0) {              /* new sequence */
                                memset(&mbs, 0, sizeof(mbs));
                                newstart = scan - 1;
                        }
                        /* buf is scratch; only the encoded length is kept */
                        clen = wcrtomb_l(buf, OPND(s), &mbs, p->g->loc);
                        if (clen == (size_t)-1)
                                goto toohard;
                        newlen += clen;
                        break;
                case OPLUS_:            /* things that don't break one */
                case OLPAREN:
                case ORPAREN:
                        break;
                case OQUEST_:           /* things that must be skipped */
                case OCH_:
                        offset = altoffset(scan, offset);
                        scan--;
                        /*
                         * Hop from operator to operator by operand until
                         * the closing O_QUEST / O_CH is reached.
                         */
                        do {
                                scan += OPND(s);
                                s = *scan;
                                /* assert() interferes w debug printouts */
                                if (OP(s) != O_QUEST && OP(s) != O_CH &&
                                                        OP(s) != OOR2) {
                                        g->iflags |= BAD;
                                        return;
                                }
                        } while (OP(s) != O_QUEST && OP(s) != O_CH);
                        /* FALLTHROUGH */
                case OBOW:              /* things that break a sequence */
                case OEOW:
                case OBOL:
                case OEOL:
                case O_QUEST:
                case O_CH:
                case OEND:
                        if (newlen > g->mlen) {         /* ends one */
                                start = newstart;
                                g->mlen = newlen;
                                if (offset > -1) {
                                        g->moffset += offset;
                                        offset = newlen;
                                } else
                                        g->moffset = offset;
                        } else {
                                if (offset > -1)
                                        offset += newlen;
                        }
                        newlen = 0;
                        break;
                case OANY:
                        /* same bookkeeping as above, plus one for the OANY itself */
                        if (newlen > g->mlen) {         /* ends one */
                                start = newstart;
                                g->mlen = newlen;
                                if (offset > -1) {
                                        g->moffset += offset;
                                        offset = newlen;
                                } else
                                        g->moffset = offset;
                        } else {
                                if (offset > -1)
                                        offset += newlen;
                        }
                        if (offset > -1)
                                offset++;
                        newlen = 0;
                        break;
                case OANYOF:            /* may or may not invalidate offset */
                        /* First, everything as OANY */
                        if (newlen > g->mlen) {         /* ends one */
                                start = newstart;
                                g->mlen = newlen;
                                if (offset > -1) {
                                        g->moffset += offset;
                                        offset = newlen;
                                } else
                                        g->moffset = offset;
                        } else {
                                if (offset > -1)
                                        offset += newlen;
                        }
                        if (offset > -1)
                                offset++;
                        newlen = 0;
                        break;
                toohard:
                default:
                        /* Anything here makes it impossible or too hard
                         * to calculate the offset -- so we give up;
                         * save the last known good offset, in case the
                         * must sequence doesn't occur later.
                         */
                        if (newlen > g->mlen) {         /* ends one */
                                start = newstart;
                                g->mlen = newlen;
                                if (offset > -1)
                                        g->moffset += offset;
                                else
                                        g->moffset = offset;
                        }
                        offset = -1;
                        newlen = 0;
                        break;
                }
        } while (OP(s) != OEND);

        if (g->mlen == 0) {             /* there isn't one */
                g->moffset = -1;
                return;
        }

        /* turn it into a character string */
        g->must = malloc((size_t)g->mlen + 1);
        if (g->must == NULL) {          /* argh; just forget it */
                g->mlen = 0;
                g->moffset = -1;
                return;
        }
        cp = g->must;
        scan = start;
        memset(&mbs, 0, sizeof(mbs));
        /* re-encode the OCHARs of the winning run, skipping other sops */
        while (cp < g->must + g->mlen) {
                while (OP(s = *scan++) != OCHAR)
                        continue;
                clen = wcrtomb_l(cp, OPND(s), &mbs, p->g->loc);
                assert(clen != (size_t)-1);
                cp += clen;
        }
        assert(cp == g->must + g->mlen);
        *cp++ = '\0';           /* just on general principles */
}
1647
/*
 - altoffset - choose biggest offset among multiple choices
 == static int altoffset(sop *scan, int offset);
 *
 * Compute, recursively if necessary, the largest offset among multiple
 * re paths.
 *
 * scan points just past an opening OQUEST_ / OCH_; the walk ends at the
 * matching O_QUEST / O_CH.  Returns offset plus the largest count found
 * among the alternatives, or -1 when offset is already -1 or an operator
 * is met for which no count can be kept.
 */
static int
altoffset(sop *scan, int offset)
{
        int largest;
        int try;
        sop s;

        /* If we gave up already on offsets, return */
        if (offset == -1)
                return -1;

        largest = 0;
        try = 0;
        s = *scan++;
        while (OP(s) != O_QUEST && OP(s) != O_CH) {
                switch (OP(s)) {
                case OOR1:
                        /* end of one alternative: keep the maximum, start over */
                        if (try > largest)
                                largest = try;
                        try = 0;
                        break;
                case OQUEST_:
                case OCH_:
                        /* nested group: recurse, then skip over it */
                        try = altoffset(scan, try);
                        if (try == -1)
                                return -1;
                        scan--;
                        do {
                                scan += OPND(s);
                                s = *scan;
                                if (OP(s) != O_QUEST && OP(s) != O_CH &&
                                                        OP(s) != OOR2)
                                        return -1;
                        } while (OP(s) != O_QUEST && OP(s) != O_CH);
                        /* We must skip to the next position, or we'll
                         * leave altoffset() too early.
                         */
                        scan++;
                        break;
                case OANYOF:
                case OCHAR:
                case OANY:
                        /* each of these counts as one position */
                        try++;
                        /* FALLTHROUGH */
                case OBOW:
                case OEOW:
                case OLPAREN:
                case ORPAREN:
                case OOR2:
                        break;
                default:
                        try = -1;
                        break;
                }
                if (try == -1)
                        return -1;
                s = *scan++;
        }

        /* account for the last alternative */
        if (try > largest)
                largest = try;

        return largest+offset;
}
1718
1719 /*
1720  - computejumps - compute char jumps for BM scan
1721  == static void computejumps(struct parse *p, struct re_guts *g);
1722  *
1723  * This algorithm assumes g->must exists and is has size greater than
1724  * zero. It's based on the algorithm found on Computer Algorithms by
1725  * Sara Baase.
1726  *
1727  * A char jump is the number of characters one needs to jump based on
1728  * the value of the character from the text that was mismatched.
1729  */
1730 static void
1731 computejumps(struct parse *p, struct re_guts *g)
1732 {
1733         int ch;
1734         int mindex;
1735
1736         /* Avoid making errors worse */
1737         if (p->error != 0)
1738                 return;
1739
1740         g->charjump = (int*) malloc((NC + 1) * sizeof(int));
1741         if (g->charjump == NULL)        /* Not a fatal error */
1742                 return;
1743         /* Adjust for signed chars, if necessary */
1744         g->charjump = &g->charjump[-(CHAR_MIN)];
1745
1746         /* If the character does not exist in the pattern, the jump
1747          * is equal to the number of characters in the pattern.
1748          */
1749         for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1750                 g->charjump[ch] = g->mlen;
1751
1752         /* If the character does exist, compute the jump that would
1753          * take us to the last character in the pattern equal to it
1754          * (notice that we match right to left, so that last character
1755          * is the first one that would be matched).
1756          */
1757         for (mindex = 0; mindex < g->mlen; mindex++)
1758                 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1759 }
1760
1761 /*
1762  - computematchjumps - compute match jumps for BM scan
1763  == static void computematchjumps(struct parse *p, struct re_guts *g);
1764  *
1765  * This algorithm assumes g->must exists and is has size greater than
1766  * zero. It's based on the algorithm found on Computer Algorithms by
1767  * Sara Baase.
1768  *
1769  * A match jump is the number of characters one needs to advance based
1770  * on the already-matched suffix.
1771  * Notice that all values here are minus (g->mlen-1), because of the way
1772  * the search algorithm works.
1773  */
1774 static void
1775 computematchjumps(struct parse *p, struct re_guts *g)
1776 {
1777         int mindex;             /* General "must" iterator */
1778         int suffix;             /* Keeps track of matching suffix */
1779         int ssuffix;            /* Keeps track of suffixes' suffix */
1780         int* pmatches;          /* pmatches[k] points to the next i
1781                                  * such that i+1...mlen is a substring
1782                                  * of k+1...k+mlen-i-1
1783                                  */
1784
1785         /* Avoid making errors worse */
1786         if (p->error != 0)
1787                 return;
1788
1789         pmatches = (int*) malloc(g->mlen * sizeof(unsigned int));
1790         if (pmatches == NULL) {
1791                 g->matchjump = NULL;
1792                 return;
1793         }
1794
1795         g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int));
1796         if (g->matchjump == NULL)       /* Not a fatal error */
1797                 return;
1798
1799         /* Set maximum possible jump for each character in the pattern */
1800         for (mindex = 0; mindex < g->mlen; mindex++)
1801                 g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1802
1803         /* Compute pmatches[] */
1804         for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1805             mindex--, suffix--) {
1806                 pmatches[mindex] = suffix;
1807
1808                 /* If a mismatch is found, interrupting the substring,
1809                  * compute the matchjump for that position. If no
1810                  * mismatch is found, then a text substring mismatched
1811                  * against the suffix will also mismatch against the
1812                  * substring.
1813                  */
1814                 while (suffix < g->mlen
1815                     && g->must[mindex] != g->must[suffix]) {
1816                         g->matchjump[suffix] = MIN(g->matchjump[suffix],
1817                             g->mlen - mindex - 1);
1818                         suffix = pmatches[suffix];
1819                 }
1820         }
1821
1822         /* Compute the matchjump up to the last substring found to jump
1823          * to the beginning of the largest must pattern prefix matching
1824          * it's own suffix.
1825          */
1826         for (mindex = 0; mindex <= suffix; mindex++)
1827                 g->matchjump[mindex] = MIN(g->matchjump[mindex],
1828                     g->mlen + suffix - mindex);
1829
1830         ssuffix = pmatches[suffix];
1831         while (suffix < g->mlen) {
1832                 while (suffix <= ssuffix && suffix < g->mlen) {
1833                         g->matchjump[suffix] = MIN(g->matchjump[suffix],
1834                             g->mlen + ssuffix - suffix);
1835                         suffix++;
1836                 }
1837                 if (suffix < g->mlen)
1838                         ssuffix = pmatches[ssuffix];
1839         }
1840
1841         free(pmatches);
1842 }
1843
1844 /*
1845  - pluscount - count + nesting
1846  == static sopno pluscount(struct parse *p, struct re_guts *g);
1847  */
1848 static sopno                    /* nesting depth */
1849 pluscount(struct parse *p, struct re_guts *g)
1850 {
1851         sop *scan;
1852         sop s;
1853         sopno plusnest = 0;
1854         sopno maxnest = 0;
1855
1856         if (p->error != 0)
1857                 return(0);      /* there may not be an OEND */
1858
1859         scan = g->strip + 1;
1860         do {
1861                 s = *scan++;
1862                 switch (OP(s)) {
1863                 case OPLUS_:
1864                         plusnest++;
1865                         break;
1866                 case O_PLUS:
1867                         if (plusnest > maxnest)
1868                                 maxnest = plusnest;
1869                         plusnest--;
1870                         break;
1871                 }
1872         } while (OP(s) != OEND);
1873         if (plusnest != 0)
1874                 g->iflags |= BAD;
1875         return(maxnest);
1876 }