src/regex/regc_locale.c

   1 /*
   2  * regc_locale.c --
   3  *
   4  *      This file contains locale-specific regexp routines.
   5  *      This file is #included by regcomp.c.
   6  *
   7  * Copyright (c) 1998 by Scriptics Corporation.
   8  *
   9  * This software is copyrighted by the Regents of the University of
  10  * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
  11  * Corporation and other parties.  The following terms apply to all files
  12  * associated with the software unless explicitly disclaimed in
  13  * individual files.
  14  *
  15  * The authors hereby grant permission to use, copy, modify, distribute,
  16  * and license this software and its documentation for any purpose, provided
  17  * that existing copyright notices are retained in all copies and that this
  18  * notice is included verbatim in any distributions. No written agreement,
  19  * license, or royalty fee is required for any of the authorized uses.
  20  * Modifications to this software may be copyrighted by their authors
  21  * and need not follow the licensing terms described here, provided that
  22  * the new terms are clearly indicated on the first page of each file where
  23  * they apply.
  24  *
  25  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
  26  * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
  27  * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
  28  * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  *
  31  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
  32  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
  33  * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.      THIS SOFTWARE
  34  * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
  35  * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
  36  * MODIFICATIONS.
  37  *
  38  * GOVERNMENT USE: If you are acquiring this software on behalf of the
  39  * U.S. government, the Government shall have only "Restricted Rights"
  40  * in the software and related documentation as defined in the Federal
  41  * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
  42  * are acquiring the software on behalf of the Department of Defense, the
  43  * software shall be classified as "Commercial Computer Software" and the
  44  * Government shall have only "Restricted Rights" as defined in Clause
  45  * 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
  46  * authors grant the U.S. Government and others acting in its behalf
  47  * permission to use and distribute the software in accordance with the
  48  * terms specified in this license.
  49  *
  50  * $Header$
  51  */
  52
  53 int char_and_wchar_strncmp (const char* cp, const wx_wchar* wp, size_t nNum)
  54 {
  55         while(*cp++ == (const char)*wp++ && --nNum){}
  56
  57         return nNum;
  58 }
  59
  60 /* ASCII character-name table */
  61
  62 static struct cname
  63 {
  64         char       *name;
  65         char            code;
  66 }       cnames[] =
  67
  68 {
  69         {
  70                 "NUL", '\0'
  71         },
  72         {
  73                 "SOH", '\001'
  74         },
  75         {
  76                 "STX", '\002'
  77         },
  78         {
  79                 "ETX", '\003'
  80         },
  81         {
  82                 "EOT", '\004'
  83         },
  84         {
  85                 "ENQ", '\005'
  86         },
  87         {
  88                 "ACK", '\006'
  89         },
  90         {
  91                 "BEL", '\007'
  92         },
  93         {
  94                 "alert", '\007'
  95         },
  96         {
  97                 "BS", '\010'
  98         },
  99         {
 100                 "backspace", '\b'
 101         },
 102         {
 103                 "HT", '\011'
 104         },
 105         {
 106                 "tab", '\t'
 107         },
 108         {
 109                 "LF", '\012'
 110         },
 111         {
 112                 "newline", '\n'
 113         },
 114         {
 115                 "VT", '\013'
 116         },
 117         {
 118                 "vertical-tab", '\v'
 119         },
 120         {
 121                 "FF", '\014'
 122         },
 123         {
 124                 "form-feed", '\f'
 125         },
 126         {
 127                 "CR", '\015'
 128         },
 129         {
 130                 "carriage-return", '\r'
 131         },
 132         {
 133                 "SO", '\016'
 134         },
 135         {
 136                 "SI", '\017'
 137         },
 138         {
 139                 "DLE", '\020'
 140         },
 141         {
 142                 "DC1", '\021'
 143         },
 144         {
 145                 "DC2", '\022'
 146         },
 147         {
 148                 "DC3", '\023'
 149         },
 150         {
 151                 "DC4", '\024'
 152         },
 153         {
 154                 "NAK", '\025'
 155         },
 156         {
 157                 "SYN", '\026'
 158         },
 159         {
 160                 "ETB", '\027'
 161         },
 162         {
 163                 "CAN", '\030'
 164         },
 165         {
 166                 "EM", '\031'
 167         },
 168         {
 169                 "SUB", '\032'
 170         },
 171         {
 172                 "ESC", '\033'
 173         },
 174         {
 175                 "IS4", '\034'
 176         },
 177         {
 178                 "FS", '\034'
 179         },
 180         {
 181                 "IS3", '\035'
 182         },
 183         {
 184                 "GS", '\035'
 185         },
 186         {
 187                 "IS2", '\036'
 188         },
 189         {
 190                 "RS", '\036'
 191         },
 192         {
 193                 "IS1", '\037'
 194         },
 195         {
 196                 "US", '\037'
 197         },
 198         {
 199                 "space", ' '
 200         },
 201         {
 202                 "exclamation-mark", '!'
 203         },
 204         {
 205                 "quotation-mark", '"'
 206         },
 207         {
 208                 "number-sign", '#'
 209         },
 210         {
 211                 "dollar-sign", '$'
 212         },
 213         {
 214                 "percent-sign", '%'
 215         },
 216         {
 217                 "ampersand", '&'
 218         },
 219         {
 220                 "apostrophe", '\''
 221         },
 222         {
 223                 "left-parenthesis", '('
 224         },
 225         {
 226                 "right-parenthesis", ')'
 227         },
 228         {
 229                 "asterisk", '*'
 230         },
 231         {
 232                 "plus-sign", '+'
 233         },
 234         {
 235                 "comma", ','
 236         },
 237         {
 238                 "hyphen", '-'
 239         },
 240         {
 241                 "hyphen-minus", '-'
 242         },
 243         {
 244                 "period", '.'
 245         },
 246         {
 247                 "full-stop", '.'
 248         },
 249         {
 250                 "slash", '/'
 251         },
 252         {
 253                 "solidus", '/'
 254         },
 255         {
 256                 "zero", '0'
 257         },
 258         {
 259                 "one", '1'
 260         },
 261         {
 262                 "two", '2'
 263         },
 264         {
 265                 "three", '3'
 266         },
 267         {
 268                 "four", '4'
 269         },
 270         {
 271                 "five", '5'
 272         },
 273         {
 274                 "six", '6'
 275         },
 276         {
 277                 "seven", '7'
 278         },
 279         {
 280                 "eight", '8'
 281         },
 282         {
 283                 "nine", '9'
 284         },
 285         {
 286                 "colon", ':'
 287         },
 288         {
 289                 "semicolon", ';'
 290         },
 291         {
 292                 "less-than-sign", '<'
 293         },
 294         {
 295                 "equals-sign", '='
 296         },
 297         {
 298                 "greater-than-sign", '>'
 299         },
 300         {
 301                 "question-mark", '?'
 302         },
 303         {
 304                 "commercial-at", '@'
 305         },
 306         {
 307                 "left-square-bracket", '['
 308         },
 309         {
 310                 "backslash", '\\'
 311         },
 312         {
 313                 "reverse-solidus", '\\'
 314         },
 315         {
 316                 "right-square-bracket", ']'
 317         },
 318         {
 319                 "circumflex", '^'
 320         },
 321         {
 322                 "circumflex-accent", '^'
 323         },
 324         {
 325                 "underscore", '_'
 326         },
 327         {
 328                 "low-line", '_'
 329         },
 330         {
 331                 "grave-accent", '`'
 332         },
 333         {
 334                 "left-brace", '{'
 335         },
 336         {
 337                 "left-curly-bracket", '{'
 338         },
 339         {
 340                 "vertical-line", '|'
 341         },
 342         {
 343                 "right-brace", '}'
 344         },
 345         {
 346                 "right-curly-bracket", '}'
 347         },
 348         {
 349                 "tilde", '~'
 350         },
 351         {
 352                 "DEL", '\177'
 353         },
 354         {
 355                 NULL, 0
 356         }
 357 };
 358
 359
 360 /*
 361  * nmcces - how many distinct MCCEs are there?
 362  */
 363 static int
 364 nmcces(struct vars * v)
 365 {
 366         /*
 367          * No multi-character collating elements defined at the moment.
 368          */
 369         return 0;
 370 }
 371
 372 /*
 373  * nleaders - how many chrs can be first chrs of MCCEs?
 374  */
 375 static int
 376 nleaders(struct vars * v)
 377 {
 378         return 0;
 379 }
 380
 381 /*
 382  * allmcces - return a cvec with all the MCCEs of the locale
 383  */
 384 static struct cvec *
 385 allmcces(struct vars * v,               /* context */
 386                  struct cvec * cv)              /* this is supposed to have enough room */
 387 {
 388         return clearcvec(cv);
 389 }
 390
 391 /*
 392  * element - map collating-element name to celt
 393  */
 394 static celt
 395 element(struct vars * v,                /* context */
 396                 chr *startp,                    /* points to start of name */
 397                 chr *endp)                              /* points just past end of name */
 398 {
 399         struct cname *cn;
 400         size_t          len;
 401
 402         /* generic:  one-chr names stand for themselves */
 403         assert(startp < endp);
 404         len = endp - startp;
 405         if (len == 1)
 406                 return *startp;
 407
 408         NOTE(REG_ULOCALE);
 409
 410         /* search table */
 411         for (cn = cnames; cn->name != NULL; cn++)
 412         {
 413                 if (strlen(cn->name) == len &&
 414                         char_and_wchar_strncmp(cn->name, startp, len) == 0)
 415                 {
 416                         break;                          /* NOTE BREAK OUT */
 417                 }
 418         }
 419         if (cn->name != NULL)
 420                 return CHR(cn->code);
 421
 422         /* couldn't find it */
 423         ERR(REG_ECOLLATE);
 424         return 0;
 425 }
 426
 427 /*
 428  * range - supply cvec for a range, including legality check
 429  */
 430 static struct cvec *
 431 range(struct vars * v,                  /* context */
 432           celt a,                                       /* range start */
 433           celt b,                                       /* range end, might equal a */
 434           int cases)                            /* case-independent? */
 435 {
 436         int                     nchrs;
 437         struct cvec *cv;
 438         celt            c,
 439                                 lc,
 440                                 uc;
 441
 442         if (a != b && !before(a, b))
 443         {
 444                 ERR(REG_ERANGE);
 445                 return NULL;
 446         }
 447
 448         if (!cases)
 449         {                                                       /* easy version */
 450                 cv = getcvec(v, 0, 1, 0);
 451                 NOERRN();
 452                 addrange(cv, a, b);
 453                 return cv;
 454         }
 455
 456         /*
 457          * When case-independent, it's hard to decide when cvec ranges are
 458          * usable, so for now at least, we won't try.  We allocate enough
 459          * space for two case variants plus a little extra for the two title
 460          * case variants.
 461          */
 462
 463         nchrs = (b - a + 1) * 2 + 4;
 464
 465         cv = getcvec(v, nchrs, 0, 0);
 466         NOERRN();
 467
 468         for (c = a; c <= b; c++)
 469         {
 470                 addchr(cv, c);
 471                 lc = wx_tolower((chr) c);
 472                 if (c != lc)
 473                         addchr(cv, lc);
 474                 uc = wx_toupper((chr) c);
 475                 if (c != uc)
 476                         addchr(cv, uc);
 477         }
 478
 479         return cv;
 480 }
 481
 482 /*
 483  * before - is celt x before celt y, for purposes of range legality?
 484  */
 485 static int                                              /* predicate */
 486 before(celt x, celt y)
 487 {
 488         /* trivial because no MCCEs */
 489         if (x < y)
 490                 return 1;
 491         return 0;
 492 }
 493
 494 /*
 495  * eclass - supply cvec for an equivalence class
 496  * Must include case counterparts on request.
 497  */
 498 static struct cvec *
 499 eclass(struct vars * v,                 /* context */
 500            celt c,                                      /* Collating element representing the
 501                                                                  * equivalence class. */
 502            int cases)                           /* all cases? */
 503 {
 504         struct cvec *cv;
 505
 506         /* crude fake equivalence class for testing */
 507         if ((v->cflags & REG_FAKE) && c == 'x')
 508         {
 509                 cv = getcvec(v, 4, 0, 0);
 510                 addchr(cv, (chr) 'x');
 511                 addchr(cv, (chr) 'y');
 512                 if (cases)
 513                 {
 514                         addchr(cv, (chr) 'X');
 515                         addchr(cv, (chr) 'Y');
 516                 }
 517                 return cv;
 518         }
 519
 520         /* otherwise, none */
 521         if (cases)
 522                 return allcases(v, c);
 523         cv = getcvec(v, 1, 0, 0);
 524         assert(cv != NULL);
 525         addchr(cv, (chr) c);
 526         return cv;
 527 }
 528
 529 /*
 530  * cclass - supply cvec for a character class
 531  *
 532  * Must include case counterparts on request.
 533  */
 534 static struct cvec *
 535 cclass(struct vars * v,                 /* context */
 536            chr *startp,                         /* where the name starts */
 537            chr *endp,                           /* just past the end of the name */
 538            int cases)                           /* case-independent? */
 539 {
 540         size_t          len;
 541         struct cvec *cv = NULL;
 542         char      **namePtr;
 543         int                     i,
 544                                 index;
 545
 546         /*
 547          * The following arrays define the valid character class names.
 548          */
 549
 550         static char *classNames[] = {
 551                 "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
 552                 "lower", "print", "punct", "space", "upper", "xdigit", NULL
 553         };
 554
 555         enum classes
 556         {
 557                 CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
 558                 CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
 559         };
 560
 561         /*
 562          * Map the name to the corresponding enumerated value.
 563          */
 564         len = endp - startp;
 565         index = -1;
 566         for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
 567         {
 568                 if (strlen(*namePtr) == len &&
 569                         char_and_wchar_strncmp(*namePtr, startp, len) == 0)
 570                 {
 571                         index = i;
 572                         break;
 573                 }
 574         }
 575         if (index == -1)
 576         {
 577                 ERR(REG_ECTYPE);
 578                 return NULL;
 579         }
 580
 581         /*
 582          * Remap lower and upper to alpha if the match is case insensitive.
 583          */
 584
 585         if (cases &&
 586                 ((enum classes) index == CC_LOWER ||
 587                  (enum classes) index == CC_UPPER))
 588                 index = (int) CC_ALPHA;
 589
 590         /*
 591          * Now compute the character class contents.
 592          *
 593          * For the moment, assume that only char codes < 256 can be in these
 594          * classes.
 595          */
 596
 597         switch ((enum classes) index)
 598         {
 599                 case CC_PRINT:
 600                 case CC_ALNUM:
 601                         cv = getcvec(v, UCHAR_MAX, 1, 0);
 602                         if (cv)
 603                         {
 604                                 for (i = 0; i <= UCHAR_MAX; i++)
 605                                 {
 606                                         if (wx_isalpha((chr) i))
 607                                                 addchr(cv, (chr) i);
 608                                 }
 609                                 addrange(cv, (chr) '0', (chr) '9');
 610                         }
 611                         break;
 612                 case CC_ALPHA:
 613                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 614                         if (cv)
 615                         {
 616                                 for (i = 0; i <= UCHAR_MAX; i++)
 617                                 {
 618                                         if (wx_isalpha((chr) i))
 619                                                 addchr(cv, (chr) i);
 620                                 }
 621                         }
 622                         break;
 623                 case CC_ASCII:
 624                         cv = getcvec(v, 0, 1, 0);
 625                         if (cv)
 626                                 addrange(cv, 0, 0x7f);
 627                         break;
 628                 case CC_BLANK:
 629                         cv = getcvec(v, 2, 0, 0);
 630                         addchr(cv, '\t');
 631                         addchr(cv, ' ');
 632                         break;
 633                 case CC_CNTRL:
 634                         cv = getcvec(v, 0, 2, 0);
 635                         addrange(cv, 0x0, 0x1f);
 636                         addrange(cv, 0x7f, 0x9f);
 637                         break;
 638                 case CC_DIGIT:
 639                         cv = getcvec(v, 0, 1, 0);
 640                         if (cv)
 641                                 addrange(cv, (chr) '0', (chr) '9');
 642                         break;
 643                 case CC_PUNCT:
 644                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 645                         if (cv)
 646                         {
 647                                 for (i = 0; i <= UCHAR_MAX; i++)
 648                                 {
 649                                         if (wx_ispunct((chr) i))
 650                                                 addchr(cv, (chr) i);
 651                                 }
 652                         }
 653                         break;
 654                 case CC_XDIGIT:
 655                         cv = getcvec(v, 0, 3, 0);
 656                         if (cv)
 657                         {
 658                                 addrange(cv, '0', '9');
 659                                 addrange(cv, 'a', 'f');
 660                                 addrange(cv, 'A', 'F');
 661                         }
 662                         break;
 663                 case CC_SPACE:
 664                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 665                         if (cv)
 666                         {
 667                                 for (i = 0; i <= UCHAR_MAX; i++)
 668                                 {
 669                                         if (wx_isspace((chr) i))
 670                                                 addchr(cv, (chr) i);
 671                                 }
 672                         }
 673                         break;
 674                 case CC_LOWER:
 675                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 676                         if (cv)
 677                         {
 678                                 for (i = 0; i <= UCHAR_MAX; i++)
 679                                 {
 680                                         if (wx_islower((chr) i))
 681                                                 addchr(cv, (chr) i);
 682                                 }
 683                         }
 684                         break;
 685                 case CC_UPPER:
 686                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 687                         if (cv)
 688                         {
 689                                 for (i = 0; i <= UCHAR_MAX; i++)
 690                                 {
 691                                         if (wx_isupper((chr) i))
 692                                                 addchr(cv, (chr) i);
 693                                 }
 694                         }
 695                         break;
 696                 case CC_GRAPH:
 697                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 698                         if (cv)
 699                         {
 700                                 for (i = 0; i <= UCHAR_MAX; i++)
 701                                 {
 702                                         if (wx_isgraph((chr) i))
 703                                                 addchr(cv, (chr) i);
 704                                 }
 705                         }
 706                         break;
 707         }
 708         if (cv == NULL)
 709                 ERR(REG_ESPACE);
 710         return cv;
 711 }
 712
 713 /*
 714  * allcases - supply cvec for all case counterparts of a chr (including itself)
 715  *
 716  * This is a shortcut, preferably an efficient one, for simple characters;
 717  * messy cases are done via range().
 718  */
 719 static struct cvec *
 720 allcases(struct vars * v,               /* context */
 721                  chr pc)                                /* character to get case equivs of */
 722 {
 723         struct cvec *cv;
 724         chr                     c = (chr) pc;
 725         chr                     lc,
 726                                 uc;
 727
 728         lc = wx_tolower((chr) c);
 729         uc = wx_toupper((chr) c);
 730
 731         cv = getcvec(v, 2, 0, 0);
 732         addchr(cv, lc);
 733         if (lc != uc)
 734                 addchr(cv, uc);
 735         return cv;
 736 }
 737
 738 /*
 739  * cmp - chr-substring compare
 740  *
 741  * Backrefs need this.  It should preferably be efficient.
 742  * Note that it does not need to report anything except equal/unequal.
 743  * Note also that the length is exact, and the comparison should not
 744  * stop at embedded NULs!
 745  */
 746 static int                                              /* 0 for equal, nonzero for unequal */
 747 cmp(const chr *x, const chr *y, /* strings to compare */
 748         size_t len)                                     /* exact length of comparison */
 749 {
 750         return memcmp(VS(x), VS(y), len * sizeof(chr));
 751 }
 752
 753 /*
 754  * casecmp - case-independent chr-substring compare
 755  *
 756  * REG_ICASE backrefs need this.  It should preferably be efficient.
 757  * Note that it does not need to report anything except equal/unequal.
 758  * Note also that the length is exact, and the comparison should not
 759  * stop at embedded NULs!
 760  */
 761 static int                                              /* 0 for equal, nonzero for unequal */
 762 casecmp(const chr *x, const chr *y,             /* strings to compare */
 763                 size_t len)                             /* exact length of comparison */
 764 {
 765         for (; len > 0; len--, x++, y++)
 766         {
 767                 if ((*x != *y) && (wx_tolower(*x) != wx_tolower(*y)))
 768                         return 1;
 769         }
 770         return 0;
 771 }