regex/TRE/lib/tre-parse.c

   1 /*
   2   tre-parse.c - Regexp parser
   3
   4   This software is released under a BSD-style license.
   5   See the file LICENSE for details and copyright.
   6
   7 */
   8
   9 /*
  10   This parser is just a simple recursive descent parser for POSIX.2
  11   regexps.  The parser supports both the obsolete default syntax and
  12   the "extended" syntax, and some nonstandard extensions.
  13 */
  14
  15
  16 #ifdef HAVE_CONFIG_H
  17 #include <config.h>
  18 #endif /* HAVE_CONFIG_H */
  19 #include <string.h>
  20 #include <assert.h>
  21 #include <limits.h>
  22 #include <stddef.h>
  23
  24 #include "xmalloc.h"
  25 #include "tre-mem.h"
  26 #include "tre-ast.h"
  27 #include "tre-stack.h"
  28 #include "tre-parse.h"
  29
  30 /* BSD compatibility:
  31      Before looking up a collating symbol, check if the name matches in
  32      the character names (cnames) array; if so, use the corresponding
  33      character.
  34
  35      Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve
  36      the implementation choice that for ERE, a non-numeric character following
  37      a left brace that would normally be a bound, causes the left brace to be
  38      literal. */
  39 #define BSD_COMPATIBILITY
  40 #ifdef BSD_COMPATIBILITY
  41 #include "cname.h"
  42 #define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
  43 #endif /* BSD_COMPATIBILITY */
  44
  45 /* Characters with special meanings in regexp syntax. */
  46 #define CHAR_PIPE          L'|'
  47 #define CHAR_LPAREN        L'('
  48 #define CHAR_RPAREN        L')'
  49 #define CHAR_LBRACE        L'{'
  50 #define CHAR_RBRACE        L'}'
  51 #define CHAR_LBRACKET      L'['
  52 #define CHAR_RBRACKET      L']'
  53 #define CHAR_MINUS         L'-'
  54 #define CHAR_STAR          L'*'
  55 #define CHAR_QUESTIONMARK  L'?'
  56 #define CHAR_PLUS          L'+'
  57 #define CHAR_PERIOD        L'.'
  58 #define CHAR_COLON         L':'
  59 #define CHAR_EQUAL         L'='
  60 #define CHAR_COMMA         L','
  61 #define CHAR_CARET         L'^'
  62 #define CHAR_DOLLAR        L'$'
  63 #define CHAR_BACKSLASH     L'\\'
  64 #define CHAR_HASH          L'#'
  65 #define CHAR_TILDE         L'~'
  66
  67
  68 /* Some macros for expanding \w, \s, etc. */
  69 static const struct tre_macro_struct {
  70   const char c;
  71   const char *expansion;
  72 } tre_macros[] =
  73   { {'t', "\t"},           {'n', "\n"},            {'r', "\r"},
  74     {'f', "\f"},           {'a', "\a"},            {'e', "\033"},
  75     {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
  76     {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"},   {'D', "[^[:digit:]]"},
  77     { 0, NULL }
  78   };
  79
  80
  81 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
  82    must have at least `len' items.  Sets buf[0] to zero if the there
  83    is no match in `tre_macros'. */
  84 static void
  85 tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
  86                  tre_char_t *buf, size_t buf_len)
  87 {
  88   int i;
  89
  90   buf[0] = 0;
  91   if (regex >= regex_end)
  92     return;
  93
  94   for (i = 0; tre_macros[i].expansion; i++)
  95     {
  96       if (tre_macros[i].c == *regex)
  97         {
  98           unsigned int j;
  99           DPRINT(("Expanding macro '%c' => '%s'\n",
 100                   tre_macros[i].c, tre_macros[i].expansion));
 101           for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++)
 102             buf[j] = tre_macros[i].expansion[j];
 103           buf[j] = 0;
 104           break;
 105         }
 106     }
 107 }
 108
 109 static reg_errcode_t
 110 tre_new_item(tre_mem_t mem, int type, int val, int *max_i,
 111          tre_bracket_match_list_t **items)
 112 {
 113   reg_errcode_t status = REG_OK;
 114   tre_bracket_match_list_t *array = *items;
 115   int i = array->num_bracket_matches;
 116   /* Allocate more space if necessary. */
 117   if (i >= *max_i)
 118     {
 119       tre_bracket_match_list_t *new_items;
 120       DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i));
 121       /* If the array is already 1024 items large, give up -- there's
 122          probably an error in the regexp (e.g. not a '\0' terminated
 123          string and missing ']') */
 124       if (*max_i >= 1024)
 125         return REG_ESPACE;
 126       *max_i *= 2;
 127       new_items = xrealloc(array, SIZEOF_BRACKET_MATCH_LIST_N(*max_i));
 128       if (new_items == NULL)
 129         return REG_ESPACE;
 130       *items = array = new_items;
 131     }
 132   array->bracket_matches[i].type = type;
 133   array->bracket_matches[i].value = val;
 134   array->num_bracket_matches++;
 135   return status;
 136 }
 137
 138 #ifndef TRE_USE_SYSTEM_WCTYPE
 139
 140 /* isalnum() and the rest may be macros, so wrap them to functions. */
 141 int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); }
 142 int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); }
 143
 144 #ifdef tre_isascii
 145 int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); }
 146 #else /* !tre_isascii */
 147 int tre_isascii_func(tre_cint_t c) { return !(c >> 7); }
 148 #endif /* !tre_isascii */
 149
 150 #ifdef tre_isblank
 151 int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); }
 152 #else /* !tre_isblank */
 153 int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); }
 154 #endif /* !tre_isblank */
 155
 156 int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); }
 157 int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); }
 158 int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); }
 159 int tre_islower_func(tre_cint_t c) { return tre_islower(c); }
 160 int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); }
 161 int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); }
 162 int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); }
 163 int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); }
 164 int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); }
 165
 166 struct {
 167   char *name;
 168   int (*func)(tre_cint_t);
 169 } tre_ctype_map[] = {
 170   { "alnum", &tre_isalnum_func },
 171   { "alpha", &tre_isalpha_func },
 172 #ifdef tre_isascii
 173   { "ascii", &tre_isascii_func },
 174 #endif /* tre_isascii */
 175 #ifdef tre_isblank
 176   { "blank", &tre_isblank_func },
 177 #endif /* tre_isblank */
 178   { "cntrl", &tre_iscntrl_func },
 179   { "digit", &tre_isdigit_func },
 180   { "graph", &tre_isgraph_func },
 181   { "lower", &tre_islower_func },
 182   { "print", &tre_isprint_func },
 183   { "punct", &tre_ispunct_func },
 184   { "space", &tre_isspace_func },
 185   { "upper", &tre_isupper_func },
 186   { "xdigit", &tre_isxdigit_func },
 187   { NULL, NULL}
 188 };
 189
 190 tre_ctype_t tre_ctype(const char *name)
 191 {
 192   int i;
 193   for (i = 0; tre_ctype_map[i].name != NULL; i++)
 194     {
 195       if (strcmp(name, tre_ctype_map[i].name) == 0)
 196         return tre_ctype_map[i].func;
 197     }
 198   return (tre_ctype_t)0;
 199 }
 200 #endif /* !TRE_USE_SYSTEM_WCTYPE */
 201
 202 #define REST(re) (int)(ctx->re_end - (re)), (re)
 203
 204 #define START_COLLATING_SYMBOLS         16
 205 #define MAX_COLLATING_SYMBOL_LEN        4
 206
 207 typedef struct {
 208   const tre_char_t *start;
 209   int len;
 210 } tre_collating_symbol;
 211
 212 #include <xlocale.h>
 213
 214 int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len);
 215
 216 #ifdef BSD_COMPATIBILITY
 217 static wchar_t
 218 tre_search_cnames(const wchar_t *name, size_t len)
 219 {
 220   size_t low = 0;
 221   size_t high = NCNAMES - 1;
 222   size_t cur;
 223   int cmp;
 224
 225   while(low <= high)
 226     {
 227       cur = (low + high) / 2;
 228       cmp = wcsncmp(name, cnames[cur].name, len);
 229       if (cmp == 0 && cnames[cur].name[len] == 0) return cnames[cur].code;
 230       if (cmp > 0) low = cur + 1;
 231       else high = cur - 1;
 232     }
 233   return (wchar_t)-1;
 234 }
 235 #endif /* BSD_COMPATIBILITY */
 236
 237 /* Scan the contents of a bracket expression, and create a
 238  * tre_bracket_match_list_t encoding the bracket expression.  If during
 239  * the scan, multi-character collating symbols are detected, switch
 240  * into a mode to collect those MCCSs into a tre_collating_symbol
 241  * list and pass them back.  tre_parse_bracket will use that to
 242  * create a new string composed of a union of the bracket expression
 243  * without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]|ch), and
 244  * call tre_parse (recursive) to parse that new string (which will
 245  * call tre_parse_bracket and tre_parse_bracket_items again. */
 246 static reg_errcode_t
 247 tre_parse_bracket_items(tre_parse_ctx_t *ctx, tre_bracket_match_list_t **items,
 248                         int *items_size, tre_collating_symbol **result)
 249 {
 250   const tre_char_t *re = ctx->re;
 251   const tre_char_t *re_end = ctx->re_end;
 252   tre_collating_symbol *col_syms = NULL;
 253   tre_collating_symbol *cp = NULL;
 254   int n_col_syms = 0;
 255   reg_errcode_t status;
 256   int max_i = *items_size;
 257   int other = 0;  /* contains content other than multi-character collating
 258                    * symbols */
 259   int range = -1; /* -1 unset, 0 begin range set, +1 end range expected */
 260   tre_cint_t min, c;
 261   int invert = ((*items)->flags & TRE_BRACKET_MATCH_FLAG_NEGATE);
 262   int collect_MCCS = 0;
 263   const tre_char_t *start;
 264
 265   for ( ;re < re_end; re++)
 266     {
 267       switch (*re)
 268         {
 269         case CHAR_MINUS:
 270           /* A first hyphen */
 271           if (re == ctx->re)
 272             {
 273               DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
 274               min = CHAR_MINUS;
 275               other++;
 276               range = 0;
 277               break;
 278             }
 279           /* The hyphen is the end range */
 280           if (range > 0)
 281             {
 282               DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
 283               c = CHAR_MINUS;
 284               goto process_end_range;
 285             }
 286           if (re + 1 >= re_end)
 287             {
 288               status = REG_EBRACK;
 289               goto error;
 290             }
 291           /* The hyphen is at the end */
 292           if (re[1] == CHAR_RBRACKET)
 293             {
 294               DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
 295               c = CHAR_MINUS;
 296               goto process_begin_range;
 297             }
 298           /* Two ranges are not allowed to share an endpoint, or begin
 299            * range is illegal. */
 300           if (range < 0)
 301             {
 302               status = REG_ERANGE;
 303               goto error;
 304             }
 305           range = 1; /* Expect end range */
 306           DPRINT(("tre_parse_bracket:   range: '%.*" STRF "'\n", REST(re)));
 307           break;
 308
 309         case CHAR_LBRACKET:
 310           if (re + 1 >= re_end)
 311             {
 312               status = REG_EBRACK;
 313               goto error;
 314             }
 315           switch (re[1])
 316             {
 317             case CHAR_PERIOD:
 318               {
 319                 re += 2;
 320                 start = re;
 321                 for (;; re++)
 322                   {
 323                     if (re >= re_end)
 324                       {
 325                         status = REG_ECOLLATE;
 326                         goto error;
 327                       }
 328                     if (*re == CHAR_PERIOD)
 329                       {
 330                         if (re + 1 >= re_end)
 331                           {
 332                             status = REG_ECOLLATE;
 333                             goto error;
 334                           }
 335                         /* Found end */
 336                         if (re[1] == CHAR_RBRACKET)
 337                           {
 338                             DPRINT(("tre_parse_bracket:   collating "
 339                                     "symbol: '%.*" STRF "'\n",
 340                                     REST(start - 2)));
 341                             /* Empty name */
 342                             if (re == start)
 343                               {
 344                                 status = REG_ECOLLATE;
 345                                 goto error;
 346                               }
 347 #ifdef BSD_COMPATIBILITY
 348                             /* Check if the name is in cnames; if so, use
 349                                the corresponding code */
 350                             c = tre_search_cnames(start, re - start);
 351                             if (c != (wchar_t)-1)
 352                               {
 353                                 re++;
 354                                 goto process_single_character;
 355                               }
 356 #endif /* BSD_COMPATIBILITY */
 357                             /* Verify this is a known sequence */
 358                             if (__collate_equiv_value(ctx->loc, start,
 359                                                           re - start) <= 0)
 360                               {
 361                                 status = REG_ECOLLATE;
 362                                 goto error;
 363                               }
 364                             /* Process single character collating symbols */
 365                             if (re - start == 1)
 366                               {
 367                                 c = *start;
 368                                 re++;
 369                                 goto process_single_character;
 370                               }
 371                             /* Inverted MCCSs are undefined */
 372                             if (invert)
 373                               {
 374                                 status = REG_ECOLLATE;
 375                                 goto error;
 376                               }
 377                             /* Can't have MCCSs as an endpoint to a range */
 378                             if (range > 0)
 379                               {
 380                                 status = REG_ERANGE;
 381                                 goto error;
 382                               }
 383                             range = -1;
 384                             /* Switch into MCCS collection mode (if not
 385                              * already there */
 386 #if TRE_DEBUG
 387                             if (!collect_MCCS)
 388                               {
 389                                 collect_MCCS = 1;
 390                                 DPRINT(("tre_parse_bracket: Detected MCCS\n"));
 391                               }
 392 #else /* !TRE_DEBUG */
 393                             collect_MCCS = 1;
 394 #endif /* !TRE_DEBUG */
 395                             /* Allocate a memory block the first time */
 396                             if (!cp)
 397                               {
 398                                 if ((col_syms = xmalloc(sizeof(*col_syms) *
 399                                             (START_COLLATING_SYMBOLS + 2)))
 400                                             == NULL)
 401                                   return REG_ESPACE;
 402                                 cp = col_syms + 1;
 403                                 n_col_syms = START_COLLATING_SYMBOLS;
 404                               }
 405                             /* Enlarge the memory block is more is needed */
 406                             if ((cp - col_syms) - 1 >= n_col_syms)
 407                               {
 408                                 int i = n_col_syms;
 409                                 tre_collating_symbol *tmp =
 410                                     xrealloc(col_syms, sizeof(*col_syms) *
 411                                              ((n_col_syms *= 2) + 2));
 412                                 if (tmp == NULL)
 413                                   {
 414                                     xfree(col_syms);
 415                                     return REG_ESPACE;
 416                                   }
 417                                 DPRINT(("tre_list_collating_symbols: "
 418                                         "Enlarging col_syms to %d\n",
 419                                         n_col_syms));
 420                                 col_syms = tmp;
 421                                 cp = col_syms + i + 1;
 422                               }
 423                             cp->start = start;
 424                             cp->len = re - start;
 425                             cp++;
 426                             re++;
 427                             break;
 428                           }
 429                       }
 430                   }
 431                 break;
 432               }
 433
 434             case CHAR_EQUAL:
 435             case CHAR_COLON:
 436               {
 437                 /* Process equivalence and character classes */
 438                 tre_char_t kind = re[1];
 439
 440                 /* Can't have a class as an endpoint to a range */
 441                 if (range > 0)
 442                   {
 443                     status = REG_ERANGE;
 444                     goto error;
 445                   }
 446                 if (!collect_MCCS && range == 0)
 447                   {
 448                     status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
 449                                           min, &max_i, items);
 450                     if (status != REG_OK)
 451                       goto error;
 452                   }
 453                 range = -1;
 454                 re += 2;
 455                 start = re;
 456                 for (;; re++)
 457                   {
 458                     if (re >= re_end)
 459                       {
 460                         status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE;
 461                         goto error;
 462                       }
 463                     if (*re == kind)
 464                       {
 465                         if (re + 1 >= re_end)
 466                           {
 467                             status = kind == CHAR_EQUAL ? REG_ECOLLATE :
 468                                                           REG_ECTYPE;
 469                             goto error;
 470                           }
 471                         /* Found end */
 472                         if (re[1] == CHAR_RBRACKET)
 473                           {
 474                             if (re == start)
 475                               {
 476                                 /* Empty class name */
 477                                 status = kind == CHAR_EQUAL ? REG_ECOLLATE :
 478                                                               REG_ECTYPE;
 479                                 goto error;
 480                               }
 481                             /* Process equivalence class */
 482                             if (kind == CHAR_EQUAL)
 483                               {
 484                                 int equiv;
 485
 486                                 DPRINT(("tre_parse_bracket:   equivalence: '%.*"
 487                                         STRF "'\n", REST(start - 2)));
 488
 489                                 /* While we find the collation value even for
 490                                    multi-character collating elements , we
 491                                    don't (yet) match any collation values
 492                                    against multi-character sequences.  We'd have
 493                                    to enumerate those multi-character sequences
 494                                    and like multi-character collating symbols,
 495                                    create a union of those sequences with the
 496                                    rest of the bracket expression.  While
 497                                    doable, a bracket expression matching
 498                                    multiple characters, that doesn't explicitly
 499                                    contain multi-character sequences, might
 500                                    be unexpected, so we punt for now. */
 501                                 if ((equiv = __collate_equiv_value(ctx->loc,
 502                                              start, re - start)) <= 0)
 503                                   {
 504                                     /* The standard says that if no collating
 505                                        element if found, we use the collating
 506                                        symbol itself.  But __collate_equiv_value
 507                                        doesn't make a distinction between
 508                                        an element that is in a equvalence
 509                                        class with others, or is the only member,
 510                                        so we already know there is no collating
 511                                        symbol.  (Note that in the case of a
 512                                        collating element whose collation value
 513                                        is unique, matching against the
 514                                        collating element itself, or against
 515                                        its collation value, is equivalent.) */
 516 #ifdef BSD_COMPATIBILITY
 517                                     /* Check if the name is in cnames; if so,
 518                                        use the corresponding code */
 519                                     c = tre_search_cnames(start, re - start);
 520                                     if (c != (wchar_t)-1)
 521                                       {
 522                                         re++;
 523                                         goto process_single_character;
 524                                       }
 525 #endif /* BSD_COMPATIBILITY */
 526                                     status = REG_ECOLLATE;
 527                                     goto error;
 528                                   }
 529                                 if (!collect_MCCS)
 530                                   {
 531                                     status = tre_new_item(ctx->mem,
 532                                              TRE_BRACKET_MATCH_TYPE_EQUIVALENCE,
 533                                              equiv, &max_i, items);
 534                                     if (status != REG_OK)
 535                                       goto error;
 536                                   }
 537                               }
 538                             else
 539                               {
 540                                 /* Process character class */
 541                                 DPRINT(("tre_parse_bracket:  class: '%.*" STRF
 542                                         "'\n", REST(start - 2)));
 543                                 if (!collect_MCCS)
 544                                   {
 545                                     char tmp_str[64];
 546                                     tre_ctype_t class;
 547                                     int len = MIN(re - start, 63);
 548 #ifdef TRE_WCHAR
 549                                     {
 550                                       tre_char_t tmp_wcs[64];
 551                                       wcsncpy(tmp_wcs, start, (size_t)len);
 552                                       tmp_wcs[len] = L'\0';
 553 #if defined HAVE_WCSRTOMBS
 554                                       {
 555                                         mbstate_t state;
 556                                         const tre_char_t *src = tmp_wcs;
 557                                         memset(&state, '\0', sizeof(state));
 558                                         len = wcsrtombs_l(tmp_str, &src,
 559                                                       sizeof(tmp_str), &state,
 560                                                       ctx->loc);
 561                                       }
 562 #elif defined HAVE_WCSTOMBS
 563                                       len = wcstombs(tmp_str, tmp_wcs, 63);
 564 #endif /* defined HAVE_WCSTOMBS */
 565                                     }
 566 #else /* !TRE_WCHAR */
 567                                     strncpy(tmp_str, (const char*)start, len);
 568 #endif /* !TRE_WCHAR */
 569                                     tmp_str[len] = '\0';
 570                                     DPRINT(("  class name: %s\n", tmp_str));
 571                                     class = tre_ctype_l(tmp_str, ctx->loc);
 572                                     if (!class)
 573                                       {
 574                                         status = REG_ECTYPE;
 575                                         goto error;
 576                                       }
 577                                     status = tre_new_item(ctx->mem,
 578                                              TRE_BRACKET_MATCH_TYPE_CLASS,
 579                                              class, &max_i, items);
 580                                     if (status != REG_OK)
 581                                       goto error;
 582                                   }
 583                               }
 584                             re++;
 585                             break;
 586                           }
 587                       }
 588                   }
 589                 other++;
 590                 break;
 591               }
 592
 593             default:
 594               DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
 595               c = CHAR_LBRACKET;
 596               goto process_single_character;
 597               break;
 598             }
 599           break;
 600
 601         case CHAR_RBRACKET:
 602           /* A first right bracket */
 603           if (re == ctx->re)
 604             {
 605               DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
 606               min = CHAR_RBRACKET;
 607               range = 0;
 608               other++;
 609               break;
 610             }
 611           /* Done */
 612           if (collect_MCCS)
 613             {
 614               DPRINT(("tre_parse_bracket:       done: '%.*" STRF "'\n",
 615                       REST(re)));
 616               if (col_syms)
 617                 {
 618                   /* Mark the character following the right bracket.  Set len
 619                    * to whether there are other things besides the
 620                    * multi-character collating symbols */
 621                   col_syms->start = re + 1;
 622                   col_syms->len = other;
 623                   /* Mark the end of the list */
 624                   cp->start = NULL;
 625                 }
 626               *result = col_syms;
 627               return REG_OK;
 628             }
 629           /* range > 0 is not possible, since we did a lookahead after the
 630            * hyphen */
 631           if (range == 0)
 632             {
 633               status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
 634                                     min, &max_i, items);
 635               if (status != REG_OK)
 636                 goto error;
 637             }
 638           DPRINT(("tre_parse_bracket:   done: '%.*" STRF "'\n", REST(re)));
 639           *items_size = max_i;
 640           ctx->re = re + 1;
 641           return REG_OK;
 642
 643         default:
 644           DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
 645           c = *re;
 646 process_single_character:
 647           /* Process single character */
 648           if (range > 0)
 649             {
 650               int mine, maxe;
 651
 652 process_end_range:
 653               /* Get collation equivalence values */
 654               mine = __collate_equiv_value(ctx->loc, &min, 1);
 655               maxe = __collate_equiv_value(ctx->loc, &c, 1);
 656               if (maxe < mine)
 657                 {
 658                   status = REG_ERANGE;
 659                   goto error;
 660                 }
 661               if (!collect_MCCS)
 662                 {
 663                   status = tre_new_item(ctx->mem,
 664                                         TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN,
 665                                         mine, &max_i, items);
 666                   if (status != REG_OK)
 667                     goto error;
 668                   status = tre_new_item(ctx->mem,
 669                                         TRE_BRACKET_MATCH_TYPE_RANGE_END,
 670                                         maxe, &max_i, items);
 671                   if (status != REG_OK)
 672                     goto error;
 673                 }
 674               range = -1;
 675             }
 676           else
 677             {
 678 process_begin_range:
 679               if (!collect_MCCS)
 680                 {
 681                   if (range == 0)
 682                     {
 683                       status = tre_new_item(ctx->mem,
 684                                             TRE_BRACKET_MATCH_TYPE_CHAR,
 685                                             min, &max_i, items);
 686                       if (status != REG_OK)
 687                         goto error;
 688                     }
 689                   min = c;
 690                 }
 691               range = 0;
 692             }
 693           other++;
 694           break;
 695         }
 696     }
 697   status = REG_EBRACK;
 698 error:
 699   DPRINT(("tre_parse_bracket:   error: '%.*" STRF "', status=%d\n",
 700           REST(re), status));
 701   if (col_syms)
 702     xfree(col_syms);
 703   return status;
 704 }
 705
 706 #ifdef TRE_DEBUG
 707 static const char *bracket_match_type_str[] = {
 708   "unused",
 709   "char",
 710   "range begin",
 711   "range end",
 712   "class",
 713   "equivalence value",
 714 };
 715 #endif /* TRE_DEBUG */
 716
 717 static reg_errcode_t
 718 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
 719 {
 720   tre_ast_node_t *node;
 721   reg_errcode_t status = REG_OK;
 722   tre_bracket_match_list_t *items;
 723   int max_i = 32;
 724   tre_collating_symbol *col_syms = NULL;
 725
 726   /* Handle special cases [[:<:]] and [[:>:]] */
 727   if (ctx->re_end - ctx->re >= 6 && ctx->re[0] == CHAR_LBRACKET
 728       && ctx->re[1] == CHAR_COLON && (ctx->re[2] == L'<' || ctx->re[2] == L'>')
 729       && ctx->re[3] == CHAR_COLON && ctx->re[4] == CHAR_RBRACKET
 730       && ctx->re[5] == CHAR_RBRACKET)
 731     {
 732       *result = tre_ast_new_literal(ctx->mem, ASSERTION,
 733                       (ctx->re[2] == L'<') ? ASSERT_AT_BOW : ASSERT_AT_EOW,
 734                       -1);
 735       DPRINT(("tre_parse_bracket: special case %s\n", (ctx->re[2] == L'<') ?
 736               "[[:<:]]" : "[[:>:]]"));
 737       ctx->re += 6;
 738       return *result ? REG_OK : REG_ESPACE;
 739     }
 740
 741   /* Start off with an array of `max_i' elements. */
 742   items = xcalloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i));
 743   if (items == NULL)
 744     return REG_ESPACE;
 745
 746   if (*ctx->re == CHAR_CARET)
 747     {
 748       DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
 749       items->flags |= TRE_BRACKET_MATCH_FLAG_NEGATE;
 750       ctx->re++;
 751     }
 752
 753   status = tre_parse_bracket_items(ctx, &items, &max_i, &col_syms);
 754
 755   if (status != REG_OK)
 756     goto parse_bracket_done;
 757
 758   /* If there are collating symbols, split off the multi-character ones
 759    * into a union of the bracket expression (without the collating symbols)
 760    * and the multiple-character sequences.  We create an equivalent input
 761    * string and run tre_parse() recursively */
 762   if (col_syms)
 763     {
 764       tre_char_t *str, *sp;
 765       tre_collating_symbol *cp;
 766       tre_parse_ctx_t subctx;
 767
 768       /* Allocate a new string.  We start with the size of the original
 769        * bracket expression (minus 1) and add 2 (for a leading "[" and
 770        * a trailing nil; don't need a "^", since it is illegal to have
 771        * inverted MCCSs).  Since a multi-character collating symbols
 772        * will be converted from "[.xx.]" to "|xx" (n+4 to n+1), we don't
 773        * need to worry about the new string getting too long. */
 774       xfree(items);
 775       str = xmalloc(sizeof(*str) * ((col_syms->start - ctx->re) + 2));
 776       if (str == NULL)
 777         {
 778           xfree(col_syms);
 779           return REG_ESPACE;
 780         }
 781       sp = str;
 782       if (col_syms->len > 0)
 783         {
 784           /* There are other items in the bracket expression besides the
 785            * multi-character collating symbols, so create a new bracket
 786            * expression with only those other itmes. */
 787           const tre_char_t *re;
 788           ptrdiff_t i;
 789
 790           *sp++ = '[';
 791           re = ctx->re;
 792           for (cp = col_syms + 1; cp->start; cp++)
 793             {
 794               /* The "- 2" is to account for the "[." */
 795               if ((i = ((cp->start - re) - 2)) > 0)
 796                 {
 797                   memcpy(sp, re, sizeof(*sp) * i);
 798                   sp += i;
 799                 }
 800               /* The "+ 2" is to account for the ".]" */
 801               re = cp->start + cp->len + 2;
 802             }
 803             i = col_syms->start - re; /* Includes the trailing right bracket */
 804             memcpy(sp, re, sizeof(*sp) * i);
 805             sp += i;
 806             *sp++ = '|';
 807         }
 808       for (cp = col_syms + 1; cp->start; cp++)
 809         {
 810           memcpy(sp, cp->start, sizeof(*sp) * cp->len);
 811           sp += cp->len;
 812           if (cp[1].start)
 813             *sp++ = '|';
 814         }
 815       *sp = 0;
 816       DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n",
 817               str));
 818
 819       memcpy(&subctx, ctx, sizeof(subctx));
 820       subctx.re = str;
 821       subctx.len = sp - str;
 822       subctx.nofirstsub = 1;
 823       subctx.cflags |= REG_EXTENDED; /* Force extended mode for parsing */
 824       status = tre_parse(&subctx);
 825       xfree(str);
 826       if (status != REG_OK)
 827         {
 828           xfree(col_syms);
 829           return status;
 830         }
 831       ctx->re = col_syms->start;
 832       ctx->position = subctx.position;
 833       xfree(col_syms);
 834       *result = subctx.result;
 835       DPRINT(("tre_parse_bracket: Returning to original string\n"));
 836       return REG_OK;
 837     }
 838
 839   DPRINT(("tre_parse_bracket: creating bracket expression literal\n"));
 840   node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position);
 841   if (node == NULL)
 842     {
 843       status = REG_ESPACE;
 844       goto parse_bracket_done;
 845     }
 846   else
 847     {
 848       tre_literal_t *l = node->obj;
 849       l->u.bracket_match_list = tre_mem_alloc(ctx->mem,
 850                                          SIZEOF_BRACKET_MATCH_LIST(items));
 851       if (l->u.bracket_match_list == NULL)
 852         {
 853           status = REG_ESPACE;
 854           goto parse_bracket_done;
 855         }
 856       memcpy(l->u.bracket_match_list, items, SIZEOF_BRACKET_MATCH_LIST(items));
 857     }
 858
 859 #ifdef TRE_DEBUG
 860   {
 861     int i;
 862     tre_bracket_match_t *b;
 863     DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n",
 864             items->num_bracket_matches, items->flags));
 865     for (i = 0, b = items->bracket_matches;
 866          i < items->num_bracket_matches; i++, b++)
 867       {
 868         DPRINT(("   %d: %s %d\n", i, bracket_match_type_str[b->type],
 869                 b->value));
 870       }
 871   }
 872 #endif /* TRE_DEBUG */
 873
 874  parse_bracket_done:
 875   xfree(items);
 876   ctx->position++;
 877   *result = node;
 878   return status;
 879 }
 880
 881
 882 /* Parses a positive decimal integer.  Returns -1 if the string does not
 883    contain a valid number. */
 884 static int
 885 tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
 886 {
 887   int num = -1;
 888   const tre_char_t *r = *regex;
 889   while (r < regex_end && *r >= L'0' && *r <= L'9')
 890     {
 891       if (num < 0)
 892         num = 0;
 893       num = num * 10 + *r - L'0';
 894       r++;
 895     }
 896   *regex = r;
 897   return num;
 898 }
 899
 900
 901 static reg_errcode_t
 902 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
 903 {
 904   int min, max;
 905 #ifdef TRE_APPROX
 906   int i;
 907   int cost_ins, cost_del, cost_subst, cost_max;
 908   int limit_ins, limit_del, limit_subst, limit_err;
 909   const tre_char_t *start;
 910 #endif /* TRE_APPROX */
 911   const tre_char_t *r = ctx->re;
 912   int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
 913 #ifdef TRE_APPROX
 914   int approx = 0;
 915   int costs_set = 0;
 916   int counts_set = 0;
 917
 918   cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
 919   limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
 920 #endif /* TRE_APPROX */
 921
 922   /* Parse number (minimum repetition count). */
 923   min = -1;
 924   if (r >= ctx->re_end)
 925 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
 926     return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_EBRACE;
 927 #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
 928     return REG_EBRACE;
 929 #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
 930   if (*r >= L'0' && *r <= L'9') {
 931     DPRINT(("tre_parse:   min count: '%.*" STRF "'\n", REST(r)));
 932     min = tre_parse_int(&r, ctx->re_end);
 933   }
 934 #ifndef TRE_APPROX
 935   else
 936 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
 937       /* For ERE, return REG_NOMATCH to signal that the lbrace should
 938          be treated as a literal */
 939       return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_BADBR;
 940 #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
 941       return REG_BADBR;
 942 #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
 943 #endif /* !TRE_APPROX */
 944
 945   /* Parse comma and second number (maximum repetition count). */
 946   max = min;
 947   if (r < ctx->re_end && *r == CHAR_COMMA)
 948     {
 949       r++;
 950       DPRINT(("tre_parse:   max count: '%.*" STRF "'\n", REST(r)));
 951       max = tre_parse_int(&r, ctx->re_end);
 952     }
 953
 954   /* Check that the repeat counts are sane. */
 955   if ((max >= 0 && min > max) || min > RE_DUP_MAX || max > RE_DUP_MAX)
 956     return REG_BADBR;
 957
 958
 959 #ifdef TRE_APPROX
 960   /*
 961    '{'
 962      optionally followed immediately by a number == minimum repcount
 963      optionally followed by , then a number == maximum repcount
 964       + then a number == maximum insertion count
 965       - then a number == maximum deletion count
 966       # then a number == maximum substitution count
 967       ~ then a number == maximum number of errors
 968       Any of +, -, # or ~ without followed by a number means that
 969       the maximum count/number of errors is infinite.
 970
 971       An equation of the form
 972         Xi + Yd + Zs < C
 973       can be specified to set costs and the cost limit to a value
 974       different from the default value:
 975         - X is the cost of an insertion
 976         - Y is the cost of a deletion
 977         - Z is the cost of a substitution
 978         - C is the maximum cost
 979
 980       If no count limit or cost is set for an operation, the operation
 981       is not allowed at all.
 982   */
 983
 984
 985   do {
 986     int done;
 987     start = r;
 988
 989     /* Parse count limit settings */
 990     done = 0;
 991     if (!counts_set)
 992       while (r + 1 < ctx->re_end && !done)
 993         {
 994           switch (*r)
 995             {
 996             case CHAR_PLUS:  /* Insert limit */
 997               DPRINT(("tre_parse:   ins limit: '%.*" STRF "'\n", REST(r)));
 998               r++;
 999               limit_ins = tre_parse_int(&r, ctx->re_end);
1000               if (limit_ins < 0)
1001                 limit_ins = INT_MAX;
1002               counts_set = 1;
1003               break;
1004             case CHAR_MINUS: /* Delete limit */
1005               DPRINT(("tre_parse:   del limit: '%.*" STRF "'\n", REST(r)));
1006               r++;
1007               limit_del = tre_parse_int(&r, ctx->re_end);
1008               if (limit_del < 0)
1009                 limit_del = INT_MAX;
1010               counts_set = 1;
1011               break;
1012             case CHAR_HASH:  /* Substitute limit */
1013               DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r)));
1014               r++;
1015               limit_subst = tre_parse_int(&r, ctx->re_end);
1016               if (limit_subst < 0)
1017                 limit_subst = INT_MAX;
1018               counts_set = 1;
1019               break;
1020             case CHAR_TILDE: /* Maximum number of changes */
1021               DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r)));
1022               r++;
1023               limit_err = tre_parse_int(&r, ctx->re_end);
1024               if (limit_err < 0)
1025                 limit_err = INT_MAX;
1026               approx = 1;
1027               break;
1028             case CHAR_COMMA:
1029               r++;
1030               break;
1031             case L' ':
1032               r++;
1033               break;
1034             case L'}':
1035               done = 1;
1036               break;
1037             default:
1038               done = 1;
1039               break;
1040             }
1041         }
1042
1043     /* Parse cost restriction equation. */
1044     done = 0;
1045     if (!costs_set)
1046       while (r + 1 < ctx->re_end && !done)
1047         {
1048           switch (*r)
1049             {
1050             case CHAR_PLUS:
1051             case L' ':
1052               r++;
1053               break;
1054             case L'<':
1055               DPRINT(("tre_parse:    max cost: '%.*" STRF "'\n", REST(r)));
1056               r++;
1057               while (*r == L' ')
1058                 r++;
1059               cost_max = tre_parse_int(&r, ctx->re_end);
1060               if (cost_max < 0)
1061                 cost_max = INT_MAX;
1062               else
1063                 cost_max--;
1064               approx = 1;
1065               break;
1066             case CHAR_COMMA:
1067               r++;
1068               done = 1;
1069               break;
1070             default:
1071               if (*r >= L'0' && *r <= L'9')
1072                 {
1073 #ifdef TRE_DEBUG
1074                   const tre_char_t *sr = r;
1075 #endif /* TRE_DEBUG */
1076                   int cost = tre_parse_int(&r, ctx->re_end);
1077                   /* XXX - make sure r is not past end. */
1078                   switch (*r)
1079                     {
1080                     case L'i':  /* Insert cost */
1081                       DPRINT(("tre_parse:    ins cost: '%.*" STRF "'\n",
1082                               REST(sr)));
1083                       r++;
1084                       cost_ins = cost;
1085                       costs_set = 1;
1086                       break;
1087                     case L'd':  /* Delete cost */
1088                       DPRINT(("tre_parse:    del cost: '%.*" STRF "'\n",
1089                               REST(sr)));
1090                       r++;
1091                       cost_del = cost;
1092                       costs_set = 1;
1093                       break;
1094                     case L's':  /* Substitute cost */
1095                       DPRINT(("tre_parse:  subst cost: '%.*" STRF "'\n",
1096                               REST(sr)));
1097                       r++;
1098                       cost_subst = cost;
1099                       costs_set = 1;
1100                       break;
1101                     default:
1102                       return REG_BADBR;
1103                     }
1104                 }
1105               else
1106                 {
1107                   done = 1;
1108                   break;
1109                 }
1110             }
1111         }
1112   } while (start != r);
1113 #endif /* TRE_APPROX */
1114
1115   /*{*//* Missing }. */
1116   if (r >= ctx->re_end)
1117     return REG_EBRACE;
1118
1119   /* Empty contents of {}. */
1120   if (r == ctx->re)
1121     return REG_BADBR;
1122
1123   /* Parse the ending '}' or '\}'.*/
1124   if (ctx->cflags & REG_EXTENDED)
1125     {
1126       if (r >= ctx->re_end || *r != CHAR_RBRACE)
1127         return REG_BADBR;
1128       r++;
1129       /* Parse trailing '?' marking minimal repetition. */
1130       if (r < ctx->re_end)
1131         {
1132           if (*r == CHAR_QUESTIONMARK)
1133             {
1134               /* Process the question mark only in enhanced mode.
1135                  Otherwise, the question mark is an error in ERE
1136                  or a literal in BRE */
1137               if (ctx->cflags & REG_ENHANCED)
1138                 {
1139                   minimal = !(ctx->cflags & REG_UNGREEDY);
1140                   r++;
1141                 }
1142               else return REG_BADRPT;
1143             }
1144           else if (*r == CHAR_STAR || *r == CHAR_PLUS)
1145             {
1146               /* These are reserved for future extensions. */
1147               return REG_BADRPT;
1148             }
1149         }
1150     }
1151   else
1152     {
1153       if (r + 1 >= ctx->re_end
1154           || *r != CHAR_BACKSLASH
1155           || *(r + 1) != CHAR_RBRACE)
1156         return REG_BADBR;
1157       r += 2;
1158       if (r < ctx->re_end && *r == CHAR_STAR)
1159         {
1160           /* This is reserved for future extensions. */
1161           return REG_BADRPT;
1162         }
1163     }
1164
1165   if (minimal)
1166     ctx->num_reorder_tags++;
1167
1168   if (!result) goto parse_bound_exit;
1169   /* Create the AST node(s). */
1170   /* Originally, if min == 0 && max == 0, we immediately replace the whole
1171      iteration with EMPTY.  This unfortunately drops any submatches, and
1172      messes up setting the pmatch values (we can get tags of -1, and
1173      tag values in the billions).  So we leave it and process this case as
1174      usual, and wait until tre_expand_ast() to replace with EMPTY */
1175 #ifdef TRE_APPROX
1176   if (min < 0 && max < 0)
1177     /* Only approximate parameters set, no repetitions. */
1178     min = max = 1;
1179 #endif /* TRE_APPROX */
1180
1181   *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
1182   if (!*result)
1183     return REG_ESPACE;
1184
1185 #ifdef TRE_APPROX
1186   /* If approximate matching parameters are set, add them to the
1187      iteration node. */
1188   if (approx || costs_set || counts_set)
1189     {
1190       int *params;
1191       tre_iteration_t *iter = (*result)->obj;
1192
1193       if (costs_set || counts_set)
1194         {
1195           if (limit_ins == TRE_PARAM_UNSET)
1196             {
1197               if (cost_ins == TRE_PARAM_UNSET)
1198                 limit_ins = 0;
1199               else
1200                 limit_ins = INT_MAX;
1201             }
1202
1203           if (limit_del == TRE_PARAM_UNSET)
1204             {
1205               if (cost_del == TRE_PARAM_UNSET)
1206                 limit_del = 0;
1207               else
1208                 limit_del = INT_MAX;
1209             }
1210
1211           if (limit_subst == TRE_PARAM_UNSET)
1212             {
1213               if (cost_subst == TRE_PARAM_UNSET)
1214                 limit_subst = 0;
1215               else
1216                 limit_subst = INT_MAX;
1217             }
1218         }
1219
1220       if (cost_max == TRE_PARAM_UNSET)
1221         cost_max = INT_MAX;
1222       if (limit_err == TRE_PARAM_UNSET)
1223         limit_err = INT_MAX;
1224
1225       ctx->have_approx = 1;
1226       params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
1227       if (!params)
1228         return REG_ESPACE;
1229       for (i = 0; i < TRE_PARAM_LAST; i++)
1230         params[i] = TRE_PARAM_UNSET;
1231       params[TRE_PARAM_COST_INS] = cost_ins;
1232       params[TRE_PARAM_COST_DEL] = cost_del;
1233       params[TRE_PARAM_COST_SUBST] = cost_subst;
1234       params[TRE_PARAM_COST_MAX] = cost_max;
1235       params[TRE_PARAM_MAX_INS] = limit_ins;
1236       params[TRE_PARAM_MAX_DEL] = limit_del;
1237       params[TRE_PARAM_MAX_SUBST] = limit_subst;
1238       params[TRE_PARAM_MAX_ERR] = limit_err;
1239       iter->params = params;
1240     }
1241 #endif /* TRE_APPROX */
1242
1243 parse_bound_exit:
1244 #ifdef TRE_APPROX
1245   DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
1246           "limits [%d,%d,%d, total %d]\n",
1247           min, max, cost_ins, cost_del, cost_subst, cost_max,
1248           limit_ins, limit_del, limit_subst, limit_err));
1249 #else /* !TRE_APPROX */
1250   DPRINT(("tre_parse_bound: min %d, max %d\n", min, max));
1251 #endif /* !TRE_APPROX */
1252
1253
1254   ctx->re = r;
1255   return REG_OK;
1256 }
1257
1258 /* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for
1259    non-self-contained options, like (?i), this causes ((?i)fu)bar to be
1260    treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect.
1261    Because we now set up tags for even non-capturing parenthesized
1262    subexpressions, we always call PARSE_MARK_FOR_SUBMATCH.  So if we
1263    pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and
1264    have it restore cflags after the subexpression, we don't need to have
1265    a separate PARSE_RESTORE_CFLAGS, and then after processing the
1266    non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE.
1267    This has the side-benefit of now matching the perl behavior: the RE
1268    foo(?i)bar|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior
1269    of foo AND (?i) (bar OR zap). */
1270 typedef enum {
1271   PARSE_RE = 0,
1272   PARSE_ATOM,
1273   PARSE_MARK_FOR_SUBMATCH,
1274   PARSE_BRANCH,
1275   PARSE_PIECE,
1276   PARSE_CATENATION,
1277   PARSE_POST_CATENATION,
1278   PARSE_UNION,
1279   PARSE_POST_UNION,
1280   PARSE_POSTFIX,
1281 } tre_parse_re_stack_symbol_t;
1282
1283
1284 reg_errcode_t
1285 tre_parse(tre_parse_ctx_t *ctx)
1286 {
1287   tre_ast_node_t *result = NULL;
1288   tre_parse_re_stack_symbol_t symbol;
1289   reg_errcode_t status = REG_OK;
1290   tre_stack_t *stack = ctx->stack;
1291   int bottom = tre_stack_num_objects(stack);
1292   int depth = 0;
1293   int temporary_cflags = 0;
1294   int bre_branch_begin;
1295 #ifdef TRE_DEBUG
1296   const tre_char_t *tmp_re;
1297 #endif
1298
1299   DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d cflags = 0%o\n",
1300           ctx->len, ctx->re, ctx->len, ctx->cflags));
1301
1302   if (ctx->len <= 0) return REG_EMPTY;
1303   if (!ctx->nofirstsub)
1304     {
1305       STACK_PUSH(stack, int, ctx->cflags);
1306       STACK_PUSH(stack, int, ctx->submatch_id);
1307       STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
1308       ctx->submatch_id++;
1309     }
1310   STACK_PUSH(stack, int, 0); // bre_branch_begin
1311   STACK_PUSH(stack, int, PARSE_RE);
1312   ctx->re_start = ctx->re;
1313   ctx->re_end = ctx->re + ctx->len;
1314
1315
1316   /* The following is basically just a recursive descent parser.  I use
1317      an explicit stack instead of recursive functions mostly because of
1318      two reasons: compatibility with systems which have an overflowable
1319      call stack, and efficiency (both in lines of code and speed).  */
1320   while (tre_stack_num_objects(stack) > bottom)
1321     {
1322       symbol = tre_stack_pop_int(stack);
1323       switch (symbol)
1324         {
1325         case PARSE_RE:
1326           /* Parse a full regexp.  A regexp is one or more branches,
1327              separated by the union operator `|'. */
1328           bre_branch_begin = tre_stack_pop_int(stack);
1329           if (
1330 #ifdef REG_LITERAL
1331               !(ctx->cflags & REG_LITERAL) &&
1332 #endif /* REG_LITERAL */
1333               ctx->cflags & (REG_EXTENDED | REG_ENHANCED))
1334             STACK_PUSHX(stack, int, PARSE_UNION);
1335           STACK_PUSHX(stack, int, bre_branch_begin);
1336           STACK_PUSHX(stack, int, PARSE_BRANCH);
1337           break;
1338
1339         case PARSE_BRANCH:
1340           /* Parse a branch.  A branch is one or more pieces, concatenated.
1341              A piece is an atom possibly followed by a postfix operator. */
1342           bre_branch_begin = tre_stack_pop_int(stack);
1343           STACK_PUSHX(stack, int, PARSE_CATENATION);
1344           STACK_PUSHX(stack, int, bre_branch_begin);
1345           STACK_PUSHX(stack, int, PARSE_PIECE);
1346           break;
1347
1348         case PARSE_PIECE:
1349           /* Parse a piece.  A piece is an atom possibly followed by one
1350              or more postfix operators. */
1351           bre_branch_begin = tre_stack_pop_int(stack);
1352           STACK_PUSHX(stack, int, PARSE_POSTFIX);
1353           STACK_PUSHX(stack, int, bre_branch_begin);
1354           STACK_PUSHX(stack, int, PARSE_ATOM);
1355           break;
1356
1357         case PARSE_CATENATION:
1358           /* If the expression has not ended, parse another piece. */
1359           {
1360             tre_char_t c;
1361             if (ctx->re >= ctx->re_end)
1362               break;
1363             c = *ctx->re;
1364 #ifdef REG_LITERAL
1365             if (!(ctx->cflags & REG_LITERAL))
1366               {
1367 #endif /* REG_LITERAL */
1368                 if ((ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) ||
1369                     ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED
1370                     && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH &&
1371                     *(ctx->re + 1) == CHAR_PIPE))
1372                   break;
1373                 if ((ctx->cflags & REG_EXTENDED
1374                      && c == CHAR_RPAREN && depth > 0)
1375                     || (!(ctx->cflags & REG_EXTENDED)
1376                         && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH
1377                             && *(ctx->re + 1) == CHAR_RPAREN))
1378                   {
1379                     if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1380                       return REG_EPAREN;
1381                     DPRINT(("tre_parse:   group end: '%.*" STRF "'\n",
1382                             REST(ctx->re)));
1383                     depth--;
1384                     if (!(ctx->cflags & (REG_EXTENDED | REG_ENHANCED)))
1385                       ctx->re += 2;
1386                     break;
1387                   }
1388 #ifdef REG_LITERAL
1389               }
1390 #endif /* REG_LITERAL */
1391
1392 #ifdef REG_LEFT_ASSOC
1393             if (ctx->cflags & REG_LEFT_ASSOC)
1394               {
1395                 /* Left associative concatenation. */
1396                 STACK_PUSHX(stack, int, PARSE_CATENATION);
1397                 STACK_PUSHX(stack, voidptr, result);
1398                 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1399                 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1400                 STACK_PUSHX(stack, int, PARSE_PIECE);
1401               }
1402             else
1403 #endif /* REG_LEFT_ASSOC */
1404               {
1405                 /* Default case, right associative concatenation. */
1406                 STACK_PUSHX(stack, voidptr, result);
1407                 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1408                 STACK_PUSHX(stack, int, PARSE_CATENATION);
1409                 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1410                 STACK_PUSHX(stack, int, PARSE_PIECE);
1411               }
1412             break;
1413           }
1414
1415         case PARSE_POST_CATENATION:
1416           {
1417             tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1418             tre_ast_node_t *tmp_node;
1419             tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1420             if (!tmp_node)
1421               return REG_ESPACE;
1422             result = tmp_node;
1423             break;
1424           }
1425
1426         case PARSE_UNION:
1427           if (ctx->re >= ctx->re_end)
1428             break;
1429 #ifdef REG_LITERAL
1430           if (ctx->cflags & REG_LITERAL)
1431             break;
1432 #endif /* REG_LITERAL */
1433           if (!(ctx->cflags & REG_EXTENDED))
1434             {
1435               if (*ctx->re != CHAR_BACKSLASH || ctx->re + 1 >= ctx->re_end)
1436                 break;
1437               ctx->re++;
1438             }
1439           switch (*ctx->re)
1440             {
1441             case CHAR_PIPE:
1442               DPRINT(("tre_parse:       union: '%.*" STRF "'\n",
1443                       REST(ctx->re)));
1444               STACK_PUSHX(stack, int, PARSE_UNION);
1445               STACK_PUSHX(stack, voidptr, (void *)ctx->re);
1446               STACK_PUSHX(stack, voidptr, result);
1447               STACK_PUSHX(stack, int, PARSE_POST_UNION);
1448               /* We need to pass a boolean (eventually) to PARSE_ATOM to
1449                  indicate if this is the beginning of a BRE extended branch. */
1450               STACK_PUSHX(stack, int, (ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED); // bre_branch_begin
1451               STACK_PUSHX(stack, int, PARSE_BRANCH);
1452               ctx->re++;
1453               break;
1454
1455             case CHAR_RPAREN:
1456               ctx->re++;
1457               break;
1458
1459             default:
1460               if (!(ctx->cflags & REG_EXTENDED))
1461                 ctx->re--;
1462               break;
1463             }
1464           break;
1465
1466         case PARSE_POST_UNION:
1467           {
1468             tre_ast_node_t *tmp_node;
1469             tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1470             const tre_char_t *pipechar = tre_stack_pop_voidptr(stack);
1471             /* error on empty expression at end of union */
1472             if (pipechar == ctx->re - 1)
1473               {
1474                 return REG_EMPTY;
1475               }
1476             tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1477             if (!tmp_node)
1478               return REG_ESPACE;
1479             result = tmp_node;
1480             break;
1481           }
1482
1483         case PARSE_POSTFIX:
1484           /* Parse postfix operators. */
1485           if (ctx->re >= ctx->re_end)
1486             break;
1487 #ifdef REG_LITERAL
1488           if (ctx->cflags & REG_LITERAL)
1489             break;
1490 #endif /* REG_LITERAL */
1491           int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
1492           int rep_min = 0;
1493           int rep_max = -1;
1494 #ifdef TRE_DEBUG
1495           int lbrace_off;
1496 #endif
1497           switch (*ctx->re)
1498             {
1499             case CHAR_PLUS:
1500             case CHAR_QUESTIONMARK:
1501               if (!(ctx->cflags & REG_EXTENDED))
1502                 break;
1503                 /*FALLTHROUGH*/
1504             case CHAR_STAR:
1505               {
1506                 tre_ast_node_t *tmp_node;
1507 #ifdef TRE_DEBUG
1508                 const char *tstr = "star";
1509                 tmp_re = ctx->re;
1510 #endif
1511
1512         handle_plus_or_question:
1513                 /* error on iteration of raw assertion (not in subexpression) */
1514                 if (result->type == LITERAL && result->submatch_id < 0 &&
1515                     IS_ASSERTION((tre_literal_t *)result->obj))
1516                   {
1517                     if (!(ctx->cflags & REG_EXTENDED)) break;
1518                     return REG_BADRPT;
1519                   }
1520                 if (*ctx->re == CHAR_PLUS)
1521                   {
1522                     rep_min = 1;
1523 #ifdef TRE_DEBUG
1524                     tstr = "plus";
1525 #endif
1526                   }
1527                 if (*ctx->re == CHAR_QUESTIONMARK)
1528                   {
1529                     rep_max = 1;
1530 #ifdef TRE_DEBUG
1531                     tstr = "questionmark";
1532 #endif
1533                   }
1534
1535                 if (ctx->cflags & REG_EXTENDED)
1536                   {
1537                     if (ctx->re + 1 < ctx->re_end)
1538                       {
1539                         if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
1540                           {
1541                             /* Process the question mark only in enhanced mode.
1542                                Otherwise, the question mark is an error in ERE */
1543                             if (ctx->cflags & REG_ENHANCED)
1544                               {
1545                                 minimal = !(ctx->cflags & REG_UNGREEDY);
1546                                 ctx->re++;
1547                               }
1548                             else return REG_BADRPT;
1549                           }
1550                         else if (*(ctx->re + 1) == CHAR_STAR
1551                                  || *(ctx->re + 1) == CHAR_PLUS)
1552                           {
1553                             /* These are reserved for future extensions. */
1554                             return REG_BADRPT;
1555                           }
1556                       }
1557                   }
1558                 else
1559                   {
1560                     if (ctx->re + 1 < ctx->re_end && *(ctx->re + 1) == CHAR_STAR)
1561                       {
1562                         /* This is reserved for future extensions. */
1563                         return REG_BADRPT;
1564                       }
1565                     if (ctx->re + 2 < ctx->re_end)
1566                       {
1567                         if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1568                           {
1569                             /* Process the question mark only in enhanced mode.
1570                                Otherwise, the question mark is a literal in BRE */
1571                             if (ctx->cflags & REG_ENHANCED)
1572                               {
1573                                 minimal = !(ctx->cflags & REG_UNGREEDY);
1574                                 ctx->re += 2;
1575                               }
1576                           }
1577                         else if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_PLUS)
1578                           {
1579                             /* This is reserved for future extensions. */
1580                             return REG_BADRPT;
1581                           }
1582                       }
1583                   }
1584
1585                 if (minimal)
1586                   ctx->num_reorder_tags++;
1587
1588                 DPRINT(("tre_parse: %s %s: '%.*" STRF "'\n",
1589                         minimal ? "  minimal" : "greedy", tstr, REST(tmp_re)));
1590                 if (result == NULL)
1591                   {
1592                     if (ctx->cflags & REG_EXTENDED) return REG_BADRPT;
1593                     else goto parse_literal;
1594                   }
1595                 ctx->re++;
1596                 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1597                                             minimal);
1598                 if (tmp_node == NULL)
1599                   return REG_ESPACE;
1600                 result = tmp_node;
1601
1602                 /* Set the iterator with a submatch id in the invisible range
1603                  * (which will be overridden if a real submatch is needed) */
1604                 result->submatch_id = ctx->submatch_id_invisible++;
1605
1606 #if 0
1607                 /* We don't allow multiple postfixes, but this might be needed
1608                    to support approximate matching */
1609                 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1610 #endif
1611               }
1612               break;
1613
1614             case CHAR_BACKSLASH:
1615               /* "\{" is special without REG_EXTENDED */
1616               /* "\+" and "\?" are special with REG_ENHANCED for BRE */
1617               if (!(ctx->cflags & REG_EXTENDED)
1618                   && ctx->re + 1 < ctx->re_end)
1619                 {
1620                   switch (*(ctx->re + 1))
1621                     {
1622                     case CHAR_LBRACE:
1623                       ctx->re++;
1624 #ifdef TRE_DEBUG
1625                       lbrace_off = 2;
1626 #endif
1627                       goto parse_brace;
1628                     case CHAR_PLUS:
1629                     case CHAR_QUESTIONMARK:
1630                       if (ctx->cflags & REG_ENHANCED)
1631                         {
1632 #ifdef TRE_DEBUG
1633                           tmp_re = ctx->re;
1634 #endif
1635                           ctx->re++;
1636                           goto handle_plus_or_question;
1637                         }
1638                       break;
1639                     }
1640                   break;
1641                 }
1642               else
1643                 break;
1644
1645             case CHAR_LBRACE:
1646               {
1647                 int raw_assertion;
1648
1649                 /* "{" is literal without REG_EXTENDED */
1650                 if (!(ctx->cflags & REG_EXTENDED))
1651                   break;
1652 #ifdef TRE_DEBUG
1653                 lbrace_off = 1;
1654 #endif
1655
1656             parse_brace:
1657                 /* error on iteration of raw assertion (not in subexpression),
1658                    but wait until after parsing bounds */
1659                 raw_assertion = (result->type == LITERAL
1660                                  && result->submatch_id < 0
1661                                  && IS_ASSERTION((tre_literal_t *)result->obj));
1662                 ctx->re++;
1663
1664                 status = tre_parse_bound(ctx, &result);
1665 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
1666                 /* For ERE, if status is REG_NOMATCH, this mean the lbrace
1667                    is to be treated as a literal. */
1668                 if (status == REG_NOMATCH)
1669                   {
1670                     ctx->re--;
1671                     break;
1672                   }
1673 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
1674                 DPRINT(("tre_parse:     bound: '%.*" STRF "'\n",
1675                         REST(ctx->re - lbrace_off)));
1676                 if (status != REG_OK)
1677                   return status;
1678                 if (raw_assertion) return REG_BADRPT;
1679
1680                 /* Set the iterator with a submatch id in the invisible range
1681                  * (which will be overridden if a real submatch is needed) */
1682                 if (result->type == ITERATION)
1683                   result->submatch_id = ctx->submatch_id_invisible++;
1684
1685 #if 0
1686                 /* We don't allow multiple postfixes, but this might be needed
1687                    to support approximate matching */
1688                 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1689 #endif
1690                 break;
1691               }
1692             }
1693           break;
1694
1695         case PARSE_ATOM:
1696           {
1697             /* Parse an atom.  An atom is a regular expression enclosed in `()',
1698                an empty set of `()', a bracket expression, `.', `^', `$',
1699                a `\' followed by a character, or a single character. */
1700
1701             /* The stack contains a boolean value, whether PARSE_ATOM is
1702                being called just after the start of a group (left paren)
1703                in a BRE */
1704             bre_branch_begin = tre_stack_pop_int(stack);
1705
1706             /* End of regexp? (empty string). */
1707             if (ctx->re >= ctx->re_end)
1708               goto parse_literal;
1709
1710 #ifdef REG_LITERAL
1711             if (ctx->cflags & REG_LITERAL)
1712               goto parse_literal;
1713 #endif /* REG_LITERAL */
1714
1715             switch (*ctx->re)
1716               {
1717               case CHAR_LPAREN:  /* parenthesized subexpression */
1718
1719                 /* Handle "(?...)" extensions.  They work in a way similar
1720                    to Perls corresponding extensions. */
1721                 if ((ctx->cflags & (REG_EXTENDED|REG_ENHANCED)) ==
1722                     (REG_EXTENDED|REG_ENHANCED)
1723                     && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1724                   {
1725                     int new_cflags = ctx->cflags;
1726                     int bit = 1;
1727                     int invisible_submatch = 0;
1728                     DPRINT(("tre_parse: extension: '%.*" STRF "'\n",
1729                             REST(ctx->re)));
1730                     ctx->re += 2;
1731                     while (/*CONSTCOND*/1)
1732                       {
1733                         if (*ctx->re == L'i')
1734                           {
1735                             DPRINT(("tre_parse:     icase: '%.*" STRF "'\n",
1736                                     REST(ctx->re)));
1737                             if (bit)
1738                               new_cflags |= REG_ICASE;
1739                             else
1740                               new_cflags &= ~REG_ICASE;
1741                             ctx->re++;
1742                           }
1743                         else if (*ctx->re == L'n')
1744                           {
1745                             DPRINT(("tre_parse:   newline: '%.*" STRF "'\n",
1746                                     REST(ctx->re)));
1747                             if (bit)
1748                               new_cflags |= REG_NEWLINE;
1749                             else
1750                               new_cflags &= ~REG_NEWLINE;
1751                             ctx->re++;
1752                           }
1753 #ifdef REG_LEFT_ASSOC
1754                         else if (*ctx->re == L'l')
1755                           {
1756                             DPRINT(("tre_parse: left assoc: '%.*" STRF "'\n",
1757                                     REST(ctx->re)));
1758                             if (bit)
1759                               new_cflags |= REG_LEFT_ASSOC;
1760                             else
1761                               new_cflags &= ~REG_LEFT_ASSOC;
1762                             ctx->re++;
1763                           }
1764 #endif /* REG_LEFT_ASSOC */
1765 #ifdef REG_UNGREEDY
1766                         else if (*ctx->re == L'U')
1767                           {
1768                             DPRINT(("tre_parse:    ungreedy: '%.*" STRF "'\n",
1769                                     REST(ctx->re)));
1770                             if (bit)
1771                               new_cflags |= REG_UNGREEDY;
1772                             else
1773                               new_cflags &= ~REG_UNGREEDY;
1774                             ctx->re++;
1775                           }
1776 #endif /* REG_UNGREEDY */
1777                         else if (*ctx->re == CHAR_MINUS)
1778                           {
1779                             DPRINT(("tre_parse:  turn off: '%.*" STRF "'\n",
1780                                     REST(ctx->re)));
1781                             ctx->re++;
1782                             bit = 0;
1783                           }
1784                         else if (*ctx->re == CHAR_COLON)
1785                           {
1786                             DPRINT(("tre_parse:  no group: '%.*" STRF
1787                                     "', (invisible submatch %d)\n",
1788                                     REST(ctx->re), ctx->submatch_id_invisible));
1789                             ctx->re++;
1790                             depth++;
1791                             invisible_submatch = 1;
1792                             break;
1793                           }
1794                         else if (*ctx->re == CHAR_HASH)
1795                           {
1796                             DPRINT(("tre_parse:    comment: '%.*" STRF "'\n",
1797                                     REST(ctx->re)));
1798                             /* A comment can contain any character except a
1799                                right parenthesis */
1800                             while (*ctx->re != CHAR_RPAREN
1801                                    && ctx->re < ctx->re_end)
1802                               ctx->re++;
1803                             if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
1804                               {
1805                                 ctx->re++;
1806                                 break;
1807                               }
1808                             else
1809                               return REG_BADPAT;
1810                           }
1811                         else if (*ctx->re == CHAR_RPAREN)
1812                           {
1813                             ctx->re++;
1814                             break;
1815                           }
1816                         else
1817                           return REG_BADRPT;
1818                       }
1819
1820                     /* Turn on the cflags changes for the rest of the
1821                        enclosing group. */
1822                     if (invisible_submatch)
1823                       {
1824                         STACK_PUSHX(stack, int, ctx->cflags);
1825                         STACK_PUSHX(stack, int, ctx->submatch_id_invisible);
1826                         STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1827                         ctx->submatch_id_invisible++;
1828                         STACK_PUSHX(stack, int, 0); // bre_branch_begin
1829                         STACK_PUSHX(stack, int, PARSE_RE);
1830                       }
1831                     else {
1832                         STACK_PUSHX(stack, int, 0); // bre_branch_begin
1833                         STACK_PUSHX(stack, int, PARSE_ATOM);
1834                     }
1835                     ctx->cflags = new_cflags;
1836                     break;
1837                   }
1838
1839                 if (ctx->cflags & REG_EXTENDED)
1840                   {
1841                 parse_bre_lparen:
1842                     DPRINT(("tre_parse: group begin: '%.*" STRF
1843                             "', submatch %d\n", REST(ctx->re),
1844                             ctx->submatch_id));
1845                     ctx->re++;
1846                     /* First parse a whole RE, then mark the resulting tree
1847                        for submatching. */
1848                     STACK_PUSHX(stack, int, ctx->cflags);
1849                     STACK_PUSHX(stack, int, ctx->submatch_id);
1850                     STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1851                     /* We need to pass a boolean (eventually) to PARSE_ATOM to
1852                        indicate if this is the beginning of a BRE group. */
1853                     STACK_PUSHX(stack, int, !(ctx->cflags & REG_EXTENDED));
1854                     STACK_PUSHX(stack, int, PARSE_RE);
1855                     ctx->submatch_id++;
1856                     depth++;
1857                   }
1858                 else
1859                   goto parse_literal;
1860                 break;
1861
1862               case CHAR_RPAREN:  /* end of current subexpression */
1863                 if (ctx->cflags & REG_EXTENDED && depth > 0)
1864                   {
1865               parse_bre_rparen_empty:
1866                     if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1867                       return REG_EPAREN;
1868                     DPRINT(("tre_parse:     empty: '%.*" STRF "'\n",
1869                             REST(ctx->re)));
1870                     /* We were expecting an atom, but instead the current
1871                        subexpression was closed.  POSIX leaves the meaning of
1872                        this to be implementation-defined.  We interpret this as
1873                        an empty expression (which matches an empty string).  */
1874                     result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1875                     if (result == NULL)
1876                       return REG_ESPACE;
1877                     if (!(ctx->cflags & REG_EXTENDED))
1878                       ctx->re--;
1879                   }
1880                 else
1881                   goto parse_literal;
1882                 break;
1883
1884               case CHAR_LBRACKET: /* bracket expression */
1885                 DPRINT(("tre_parse:     bracket: '%.*" STRF "'\n",
1886                         REST(ctx->re)));
1887                 ctx->re++;
1888                 status = tre_parse_bracket(ctx, &result);
1889                 if (status != REG_OK)
1890                   return status;
1891                 break;
1892
1893               case CHAR_BACKSLASH:
1894                 /* Deal with "\(", "\)" or "\{" for BREs */
1895                 if (!(ctx->cflags & REG_EXTENDED)
1896                     && ctx->re + 1 < ctx->re_end)
1897                   {
1898                     if (*(ctx->re + 1) == CHAR_LPAREN)
1899                       {
1900                         ctx->re++;
1901                         goto parse_bre_lparen;
1902                       }
1903                     else if (*(ctx->re + 1) == CHAR_RPAREN)
1904                       {
1905                         ctx->re++;
1906                         goto parse_bre_rparen_empty;
1907                       }
1908                     if (*(ctx->re + 1) == CHAR_LBRACE) goto parse_literal;
1909                   }
1910
1911                 if (ctx->re + 1 >= ctx->re_end)
1912                   /* Trailing backslash. */
1913                   return REG_EESCAPE;
1914
1915                 if (!(ctx->cflags & REG_ENHANCED))
1916                   {
1917                     DPRINT(("tre_parse:  unenhanced bleep: '%.*" STRF "'\n", REST(ctx->re)));
1918                     ctx->re++;
1919                     goto unenhanced_backslash;
1920                   }
1921
1922                 /* If a macro is used, parse the expanded macro recursively. */
1923                 {
1924                   tre_char_t buf[64];
1925                   tre_expand_macro(ctx->re + 1, ctx->re_end,
1926                                    buf, elementsof(buf));
1927                   if (buf[0] != 0)
1928                     {
1929                       tre_parse_ctx_t subctx;
1930                       memcpy(&subctx, ctx, sizeof(subctx));
1931                       subctx.re = buf;
1932                       subctx.len = tre_strlen(buf);
1933                       subctx.nofirstsub = 1;
1934                       status = tre_parse(&subctx);
1935                       if (status != REG_OK)
1936                         return status;
1937                       ctx->re += 2;
1938                       ctx->position = subctx.position;
1939                       result = subctx.result;
1940                       break;
1941                     }
1942                 }
1943
1944 #ifdef REG_LITERAL
1945                 if (*(ctx->re + 1) == L'Q')
1946                   {
1947                     DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
1948                             REST(ctx->re)));
1949                     ctx->cflags |= REG_LITERAL;
1950                     temporary_cflags |= REG_LITERAL;
1951                     ctx->re += 2;
1952                     STACK_PUSHX(stack, int, 0);
1953                     STACK_PUSHX(stack, int, PARSE_ATOM);
1954                     break;
1955                   }
1956 #endif /* REG_LITERAL */
1957
1958                 DPRINT(("tre_parse:  bleep: '%.*" STRF "'\n", REST(ctx->re)));
1959                 ctx->re++;
1960                 switch (*ctx->re)
1961                   {
1962                   case L'b':
1963                     result = tre_ast_new_literal(ctx->mem, ASSERTION,
1964                                                  ASSERT_AT_WB, -1);
1965                     ctx->re++;
1966                     break;
1967                   case L'B':
1968                     result = tre_ast_new_literal(ctx->mem, ASSERTION,
1969                                                  ASSERT_AT_WB_NEG, -1);
1970                     ctx->re++;
1971                     break;
1972                   case L'<':
1973                     result = tre_ast_new_literal(ctx->mem, ASSERTION,
1974                                                  ASSERT_AT_BOW, -1);
1975                     ctx->re++;
1976                     break;
1977                   case L'>':
1978                     result = tre_ast_new_literal(ctx->mem, ASSERTION,
1979                                                  ASSERT_AT_EOW, -1);
1980                     ctx->re++;
1981                     break;
1982                   case L'x':
1983                     ctx->re++;
1984                     if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
1985                       {
1986                         /* 8 bit hex char. */
1987                         char tmp[3] = {0, 0, 0};
1988                         long val;
1989                         DPRINT(("tre_parse:  8 bit hex: '%.*" STRF "'\n",
1990                                 REST(ctx->re - 2)));
1991
1992                         if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1993                             ctx->re < ctx->re_end)
1994                           {
1995                             tmp[0] = (char)ctx->re[0];
1996                             ctx->re++;
1997                           }
1998                         if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1999                             ctx->re < ctx->re_end)
2000                           {
2001                             tmp[1] = (char)ctx->re[0];
2002                             ctx->re++;
2003                           }
2004                         val = strtol(tmp, NULL, 16);
2005                         result = tre_ast_new_literal(ctx->mem, (int)val,
2006                                                      (int)val, ctx->position);
2007                         ctx->position++;
2008                         break;
2009                       }
2010                     else if (ctx->re < ctx->re_end)
2011                       {
2012                         /* Wide char. */
2013                         char tmp[32];
2014                         long val;
2015                         int i = 0;
2016                         ctx->re++;
2017                         while (ctx->re_end - ctx->re >= 0)
2018                           {
2019                             if (ctx->re[0] == CHAR_RBRACE)
2020                               break;
2021                             if (tre_isxdigit_l(ctx->re[0], ctx->loc))
2022                               {
2023                                 tmp[i] = (char)ctx->re[0];
2024                                 i++;
2025                                 ctx->re++;
2026                                 continue;
2027                               }
2028                             return REG_EBRACE;
2029                           }
2030                         ctx->re++;
2031                         tmp[i] = 0;
2032                         val = strtol(tmp, NULL, 16);
2033                         result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
2034                                                      ctx->position);
2035                         ctx->position++;
2036                         break;
2037                       }
2038                     /*FALLTHROUGH*/
2039
2040                   default:
2041                   unenhanced_backslash:
2042                     if ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) !=
2043                         REG_EXTENDED &&
2044                         tre_isdigit_l(*ctx->re, ctx->loc) && *ctx->re != L'0')
2045                       {
2046                         /* Back reference (only in BRE or enhanced). */
2047                         int val = *ctx->re - L'0';
2048                         DPRINT(("tre_parse:     backref: '%.*" STRF "'\n",
2049                                 REST(ctx->re - 1)));
2050                         result = tre_ast_new_literal(ctx->mem, BACKREF, val,
2051                                                      ctx->position);
2052                         if (result == NULL)
2053                           return REG_ESPACE;
2054
2055                         /* Set the backref with a submatch id in the invisible
2056                          * range (which will be overridden if a real submatch
2057                          * is needed) */
2058                         result->submatch_id = ctx->submatch_id_invisible++;
2059
2060                         ctx->position++;
2061                         ctx->num_reorder_tags++;
2062                         ctx->max_backref = MAX(val, ctx->max_backref);
2063                         ctx->re++;
2064                       }
2065                     else
2066                       {
2067                         /* Escaped character. */
2068                         DPRINT(("tre_parse:     escaped: '%.*" STRF "'\n",
2069                                 REST(ctx->re - 1)));
2070                         result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
2071                                                      ctx->position);
2072                         ctx->position++;
2073                         ctx->re++;
2074                       }
2075                     break;
2076                   }
2077                 if (result == NULL)
2078                   return REG_ESPACE;
2079                 break;
2080
2081               case CHAR_PERIOD:  /* the any-symbol */
2082                 DPRINT(("tre_parse:       any: '%.*" STRF "'\n",
2083                         REST(ctx->re)));
2084                 if (ctx->cflags & REG_NEWLINE)
2085                   {
2086                     tre_ast_node_t *tmp1;
2087                     tre_ast_node_t *tmp2;
2088                     tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
2089                                                ctx->position);
2090                     if (!tmp1)
2091                       return REG_ESPACE;
2092                     tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
2093                                                ctx->position + 1);
2094                     if (!tmp2)
2095                       return REG_ESPACE;
2096                     result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
2097                     if (!result)
2098                       return REG_ESPACE;
2099                     ctx->position += 2;
2100                   }
2101                 else
2102                   {
2103                     result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
2104                                                  ctx->position);
2105                     if (!result)
2106                       return REG_ESPACE;
2107                     ctx->position++;
2108                   }
2109                 ctx->re++;
2110                 break;
2111
2112               case CHAR_CARET:   /* beginning of line assertion */
2113                 /* '^' has a special meaning everywhere in EREs, at the
2114                    beginning of the RE and after \( is BREs.  It is also
2115                    special in enhanced BREs at the beginning of each branches
2116                    of a union */
2117                 if (ctx->cflags & REG_EXTENDED
2118                     || bre_branch_begin
2119                     || ctx->re == ctx->re_start)
2120                   {
2121                     DPRINT(("tre_parse:       BOL: '%.*" STRF "'\n",
2122                             REST(ctx->re)));
2123                     result = tre_ast_new_literal(ctx->mem, ASSERTION,
2124                                                  ASSERT_AT_BOL, -1);
2125                     if (result == NULL)
2126                       return REG_ESPACE;
2127                     ctx->re++;
2128                   }
2129                 else
2130                   goto parse_literal;
2131                 break;
2132
2133               case CHAR_DOLLAR:  /* end of line assertion. */
2134                 /* '$' is special everywhere in EREs, and in the end of the
2135                    string and before \) is BREs. */
2136                 if (ctx->cflags & REG_EXTENDED
2137                     || (ctx->re + 2 < ctx->re_end
2138                         && *(ctx->re + 1) == CHAR_BACKSLASH
2139                         && *(ctx->re + 2) == CHAR_RPAREN)
2140                     || ctx->re + 1 == ctx->re_end)
2141                   {
2142                     DPRINT(("tre_parse:       EOL: '%.*" STRF "'\n",
2143                             REST(ctx->re)));
2144                     result = tre_ast_new_literal(ctx->mem, ASSERTION,
2145                                                  ASSERT_AT_EOL, -1);
2146                     if (result == NULL)
2147                       return REG_ESPACE;
2148                     ctx->re++;
2149                   }
2150                 else
2151                   goto parse_literal;
2152                 break;
2153
2154               default:
2155               parse_literal:
2156
2157                 if (temporary_cflags && ctx->re + 1 < ctx->re_end
2158                     && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
2159                   {
2160                     DPRINT(("tre_parse:  end tmps: '%.*" STRF "'\n",
2161                             REST(ctx->re)));
2162                     ctx->cflags &= ~temporary_cflags;
2163                     temporary_cflags = 0;
2164                     ctx->re += 2;
2165                     if (ctx->re < ctx->re_end)
2166                       {
2167                         STACK_PUSHX(stack, int, 0);
2168                         STACK_PUSHX(stack, int, PARSE_ATOM);
2169                       }
2170                     else
2171                       {
2172                         result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
2173                         if (!result) return REG_ESPACE;
2174                       }
2175                     break;
2176                   }
2177
2178
2179                 /* We are expecting an atom.  If the subexpression (or the whole
2180                    regexp ends here, we interpret it as an empty expression
2181                    (which matches an empty string), which is an error.
2182                    Iterations of an empty expression is also an error. */
2183 #ifdef REG_LITERAL
2184                 if (!(ctx->cflags & REG_LITERAL))
2185                   {
2186 #endif /* REG_LITERAL */
2187                     /* error on end of string */
2188                     if (ctx->re >= ctx->re_end) return depth > 0 ? REG_EPAREN
2189                                                        : REG_EMPTY;
2190                     /* error on unions and iterations of empty expressions */
2191                     if (ctx->cflags & REG_EXTENDED)
2192                       {
2193                         if (ctx->re < ctx->re_end)
2194                           {
2195                             if (*ctx->re == CHAR_PIPE) return REG_EMPTY;
2196                             if (*ctx->re == CHAR_LBRACE)
2197                               {
2198                                 ctx->re++;
2199                   empty_parse_bound:
2200                                 /* We need to parse the bound first and return
2201                                    any error, before returning REG_BADRPT */
2202                                 status = tre_parse_bound(ctx, NULL);
2203 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2204                                 /* For ERE, if REG_NOMATCH is returned, we
2205                                    treat the lbrace as a literal. */
2206                                 if (status == REG_NOMATCH)
2207                                   {
2208                                     ctx->re--;
2209                                     /* Drop down to literal-handling code */
2210                                   }
2211                                 else
2212                                   {
2213 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2214                                     if (status != REG_OK)
2215                                       return status;
2216                                     return REG_BADRPT;
2217 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2218                                   }
2219 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2220                               }
2221 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2222                             else
2223 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2224                             if (*ctx->re == CHAR_STAR
2225                                 || *ctx->re == CHAR_PLUS
2226                                 || *ctx->re == CHAR_QUESTIONMARK)
2227                               {
2228                                 return REG_BADRPT;
2229                               }
2230                           }
2231                       }
2232                     else if (ctx->re + 1 < ctx->re_end
2233                              && *ctx->re == CHAR_BACKSLASH
2234                              && *(ctx->re + 1) == CHAR_LBRACE)
2235                       {
2236                         ctx->re += 2;
2237                         goto empty_parse_bound;
2238                       }
2239 #ifdef REG_LITERAL
2240                   }
2241 #endif /* REG_LITERAL */
2242
2243                 DPRINT(("tre_parse:     literal: '%.*" STRF "'\n",
2244                         REST(ctx->re)));
2245                 /* Note that we can't use an tre_isalpha() test here, since there
2246                    may be characters which are alphabetic but neither upper or
2247                    lower case. */
2248                 if (ctx->cflags & REG_ICASE
2249                     && (tre_isupper_l(*ctx->re, ctx->loc) ||
2250                     tre_islower_l(*ctx->re, ctx->loc)))
2251                   {
2252                     tre_ast_node_t *tmp1;
2253                     tre_ast_node_t *tmp2;
2254
2255                     /* XXX - Can there be more than one opposite-case
2256                        counterpoints for some character in some locale?  Or
2257                        more than two characters which all should be regarded
2258                        the same character if case is ignored?  If yes, there
2259                        does not seem to be a portable way to detect it.  I guess
2260                        that at least for multi-character collating elements there
2261                        could be several opposite-case counterpoints, but they
2262                        cannot be supported portably anyway. */
2263                     tmp1 = tre_ast_new_literal(ctx->mem,
2264                                                tre_toupper_l(*ctx->re, ctx->loc),
2265                                                tre_toupper_l(*ctx->re, ctx->loc),
2266                                                ctx->position);
2267                     if (!tmp1)
2268                       return REG_ESPACE;
2269                     tmp2 = tre_ast_new_literal(ctx->mem,
2270                                                tre_tolower_l(*ctx->re, ctx->loc),
2271                                                tre_tolower_l(*ctx->re, ctx->loc),
2272                                                ctx->position);
2273                     if (!tmp2)
2274                       return REG_ESPACE;
2275                     result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
2276                     if (!result)
2277                       return REG_ESPACE;
2278                   }
2279                 else
2280                   {
2281                     result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
2282                                                  ctx->position);
2283                     if (!result)
2284                       return REG_ESPACE;
2285                   }
2286                 ctx->position++;
2287                 ctx->re++;
2288                 break;
2289               }
2290             break;
2291           }
2292
2293         case PARSE_MARK_FOR_SUBMATCH:
2294           {
2295             int submatch_id = tre_stack_pop_int(stack);
2296
2297             ctx->cflags = tre_stack_pop_int(stack); /* restore cflags */
2298             if (result->submatch_id >= 0 &&
2299                 result->submatch_id < SUBMATCH_ID_INVISIBLE_START)
2300               {
2301                 tre_ast_node_t *n, *tmp_node;
2302                 if (submatch_id >= SUBMATCH_ID_INVISIBLE_START)
2303                   break;
2304                 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
2305                 if (n == NULL)
2306                   return REG_ESPACE;
2307                 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
2308                 if (tmp_node == NULL)
2309                   return REG_ESPACE;
2310                 tmp_node->num_submatches = result->num_submatches;
2311                 result = tmp_node;
2312               }
2313             result->submatch_id = submatch_id;
2314             if (submatch_id < SUBMATCH_ID_INVISIBLE_START)
2315               result->num_submatches++;
2316             break;
2317           }
2318
2319         default:
2320           assert(0);
2321           break;
2322         }
2323     }
2324
2325   /* Check for missing closing parentheses. */
2326   if (depth > 0)
2327     return REG_EPAREN;
2328
2329   ctx->result = result;
2330
2331   return REG_OK;
2332 }
2333
2334 /* EOF */