src/regex/regc_locale.c

   1 /*
   2  * regc_locale.c --
   3  *
   4  *      This file contains locale-specific regexp routines.
   5  *      This file is #included by regcomp.c.
   6  *
   7  * Copyright (c) 1998 by Scriptics Corporation.
   8  *
   9  * This software is copyrighted by the Regents of the University of
  10  * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
  11  * Corporation and other parties.  The following terms apply to all files
  12  * associated with the software unless explicitly disclaimed in
  13  * individual files.
  14  *
  15  * The authors hereby grant permission to use, copy, modify, distribute,
  16  * and license this software and its documentation for any purpose, provided
  17  * that existing copyright notices are retained in all copies and that this
  18  * notice is included verbatim in any distributions. No written agreement,
  19  * license, or royalty fee is required for any of the authorized uses.
  20  * Modifications to this software may be copyrighted by their authors
  21  * and need not follow the licensing terms described here, provided that
  22  * the new terms are clearly indicated on the first page of each file where
  23  * they apply.
  24  *
  25  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
  26  * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
  27  * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
  28  * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  *
  31  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
  32  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
  33  * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.      THIS SOFTWARE
  34  * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
  35  * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
  36  * MODIFICATIONS.
  37  *
  38  * GOVERNMENT USE: If you are acquiring this software on behalf of the
  39  * U.S. government, the Government shall have only "Restricted Rights"
  40  * in the software and related documentation as defined in the Federal
  41  * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
  42  * are acquiring the software on behalf of the Department of Defense, the
  43  * software shall be classified as "Commercial Computer Software" and the
  44  * Government shall have only "Restricted Rights" as defined in Clause
  45  * 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
  46  * authors grant the U.S. Government and others acting in its behalf
  47  * permission to use and distribute the software in accordance with the
  48  * terms specified in this license.
  49  *
  50  * $Header$
  51  */
  52
  53 int char_and_wchar_strncmp (const char* cp, const wx_wchar* wp, size_t nNum)
  54 {
  55         while(*cp++ == (const char)*wp++ && --nNum){}
  56
  57         return nNum;
  58 }
  59
  60 /* ASCII character-name table */
  61
  62 static struct cname
  63 {
  64         char       *name;
  65         char            code;
  66 }       cnames[] =
  67
  68 {
  69         {
  70                 "NUL", '\0'
  71         },
  72         {
  73                 "SOH", '\001'
  74         },
  75         {
  76                 "STX", '\002'
  77         },
  78         {
  79                 "ETX", '\003'
  80         },
  81         {
  82                 "EOT", '\004'
  83         },
  84         {
  85                 "ENQ", '\005'
  86         },
  87         {
  88                 "ACK", '\006'
  89         },
  90         {
  91                 "BEL", '\007'
  92         },
  93         {
  94                 "alert", '\007'
  95         },
  96         {
  97                 "BS", '\010'
  98         },
  99         {
 100                 "backspace", '\b'
 101         },
 102         {
 103                 "HT", '\011'
 104         },
 105         {
 106                 "tab", '\t'
 107         },
 108         {
 109                 "LF", '\012'
 110         },
 111         {
 112                 "newline", '\n'
 113         },
 114         {
 115                 "VT", '\013'
 116         },
 117         {
 118                 "vertical-tab", '\v'
 119         },
 120         {
 121                 "FF", '\014'
 122         },
 123         {
 124                 "form-feed", '\f'
 125         },
 126         {
 127                 "CR", '\015'
 128         },
 129         {
 130                 "carriage-return", '\r'
 131         },
 132         {
 133                 "SO", '\016'
 134         },
 135         {
 136                 "SI", '\017'
 137         },
 138         {
 139                 "DLE", '\020'
 140         },
 141         {
 142                 "DC1", '\021'
 143         },
 144         {
 145                 "DC2", '\022'
 146         },
 147         {
 148                 "DC3", '\023'
 149         },
 150         {
 151                 "DC4", '\024'
 152         },
 153         {
 154                 "NAK", '\025'
 155         },
 156         {
 157                 "SYN", '\026'
 158         },
 159         {
 160                 "ETB", '\027'
 161         },
 162         {
 163                 "CAN", '\030'
 164         },
 165         {
 166                 "EM", '\031'
 167         },
 168         {
 169                 "SUB", '\032'
 170         },
 171         {
 172                 "ESC", '\033'
 173         },
 174         {
 175                 "IS4", '\034'
 176         },
 177         {
 178                 "FS", '\034'
 179         },
 180         {
 181                 "IS3", '\035'
 182         },
 183         {
 184                 "GS", '\035'
 185         },
 186         {
 187                 "IS2", '\036'
 188         },
 189         {
 190                 "RS", '\036'
 191         },
 192         {
 193                 "IS1", '\037'
 194         },
 195         {
 196                 "US", '\037'
 197         },
 198         {
 199                 "space", ' '
 200         },
 201         {
 202                 "exclamation-mark", '!'
 203         },
 204         {
 205                 "quotation-mark", '"'
 206         },
 207         {
 208                 "number-sign", '#'
 209         },
 210         {
 211                 "dollar-sign", '$'
 212         },
 213         {
 214                 "percent-sign", '%'
 215         },
 216         {
 217                 "ampersand", '&'
 218         },
 219         {
 220                 "apostrophe", '\''
 221         },
 222         {
 223                 "left-parenthesis", '('
 224         },
 225         {
 226                 "right-parenthesis", ')'
 227         },
 228         {
 229                 "asterisk", '*'
 230         },
 231         {
 232                 "plus-sign", '+'
 233         },
 234         {
 235                 "comma", ','
 236         },
 237         {
 238                 "hyphen", '-'
 239         },
 240         {
 241                 "hyphen-minus", '-'
 242         },
 243         {
 244                 "period", '.'
 245         },
 246         {
 247                 "full-stop", '.'
 248         },
 249         {
 250                 "slash", '/'
 251         },
 252         {
 253                 "solidus", '/'
 254         },
 255         {
 256                 "zero", '0'
 257         },
 258         {
 259                 "one", '1'
 260         },
 261         {
 262                 "two", '2'
 263         },
 264         {
 265                 "three", '3'
 266         },
 267         {
 268                 "four", '4'
 269         },
 270         {
 271                 "five", '5'
 272         },
 273         {
 274                 "six", '6'
 275         },
 276         {
 277                 "seven", '7'
 278         },
 279         {
 280                 "eight", '8'
 281         },
 282         {
 283                 "nine", '9'
 284         },
 285         {
 286                 "colon", ':'
 287         },
 288         {
 289                 "semicolon", ';'
 290         },
 291         {
 292                 "less-than-sign", '<'
 293         },
 294         {
 295                 "equals-sign", '='
 296         },
 297         {
 298                 "greater-than-sign", '>'
 299         },
 300         {
 301                 "question-mark", '?'
 302         },
 303         {
 304                 "commercial-at", '@'
 305         },
 306         {
 307                 "left-square-bracket", '['
 308         },
 309         {
 310                 "backslash", '\\'
 311         },
 312         {
 313                 "reverse-solidus", '\\'
 314         },
 315         {
 316                 "right-square-bracket", ']'
 317         },
 318         {
 319                 "circumflex", '^'
 320         },
 321         {
 322                 "circumflex-accent", '^'
 323         },
 324         {
 325                 "underscore", '_'
 326         },
 327         {
 328                 "low-line", '_'
 329         },
 330         {
 331                 "grave-accent", '`'
 332         },
 333         {
 334                 "left-brace", '{'
 335         },
 336         {
 337                 "left-curly-bracket", '{'
 338         },
 339         {
 340                 "vertical-line", '|'
 341         },
 342         {
 343                 "right-brace", '}'
 344         },
 345         {
 346                 "right-curly-bracket", '}'
 347         },
 348         {
 349                 "tilde", '~'
 350         },
 351         {
 352                 "DEL", '\177'
 353         },
 354         {
 355                 NULL, 0
 356         }
 357 };
 358
 359 /*
 360  * some ctype functions with non-ascii-char guard
 361  */
 362 static int
 363 wx_isdigit(wx_wchar c)
 364 {
 365         return (c >= 0 && c <= UCHAR_MAX && isdigit((unsigned char) c));
 366 }
 367
 368 static int
 369 wx_isalpha(wx_wchar c)
 370 {
 371         return (c >= 0 && c <= UCHAR_MAX && isalpha((unsigned char) c));
 372 }
 373
 374 static int
 375 wx_isalnum(wx_wchar c)
 376 {
 377         return (c >= 0 && c <= UCHAR_MAX && isalnum((unsigned char) c));
 378 }
 379
 380 static int
 381 wx_isupper(wx_wchar c)
 382 {
 383         return (c >= 0 && c <= UCHAR_MAX && isupper((unsigned char) c));
 384 }
 385
 386 static int
 387 wx_islower(wx_wchar c)
 388 {
 389         return (c >= 0 && c <= UCHAR_MAX && islower((unsigned char) c));
 390 }
 391
 392 static int
 393 wx_isgraph(wx_wchar c)
 394 {
 395         return (c >= 0 && c <= UCHAR_MAX && isgraph((unsigned char) c));
 396 }
 397
 398 static int
 399 wx_ispunct(wx_wchar c)
 400 {
 401         return (c >= 0 && c <= UCHAR_MAX && ispunct((unsigned char) c));
 402 }
 403
 404 static int
 405 wx_isspace(wx_wchar c)
 406 {
 407         return (c >= 0 && c <= UCHAR_MAX && isspace((unsigned char) c));
 408 }
 409
 410 static wx_wchar
 411 wx_toupper(wx_wchar c)
 412 {
 413         if (c >= 0 && c <= UCHAR_MAX)
 414                 return toupper((unsigned char) c);
 415         return c;
 416 }
 417
 418 static wx_wchar
 419 wx_tolower(wx_wchar c)
 420 {
 421         if (c >= 0 && c <= UCHAR_MAX)
 422                 return tolower((unsigned char) c);
 423         return c;
 424 }
 425
 426
 427 /*
 428  * nmcces - how many distinct MCCEs are there?
 429  */
 430 static int
 431 nmcces(struct vars * v)
 432 {
 433         /*
 434          * No multi-character collating elements defined at the moment.
 435          */
 436         return 0;
 437 }
 438
 439 /*
 440  * nleaders - how many chrs can be first chrs of MCCEs?
 441  */
 442 static int
 443 nleaders(struct vars * v)
 444 {
 445         return 0;
 446 }
 447
 448 /*
 449  * allmcces - return a cvec with all the MCCEs of the locale
 450  */
 451 static struct cvec *
 452 allmcces(struct vars * v,               /* context */
 453                  struct cvec * cv)              /* this is supposed to have enough room */
 454 {
 455         return clearcvec(cv);
 456 }
 457
 458 /*
 459  * element - map collating-element name to celt
 460  */
 461 static celt
 462 element(struct vars * v,                /* context */
 463                 chr *startp,                    /* points to start of name */
 464                 chr *endp)                              /* points just past end of name */
 465 {
 466         struct cname *cn;
 467         size_t          len;
 468
 469         /* generic:  one-chr names stand for themselves */
 470         assert(startp < endp);
 471         len = endp - startp;
 472         if (len == 1)
 473                 return *startp;
 474
 475         NOTE(REG_ULOCALE);
 476
 477         /* search table */
 478         for (cn = cnames; cn->name != NULL; cn++)
 479         {
 480                 if (strlen(cn->name) == len &&
 481                         char_and_wchar_strncmp(cn->name, startp, len) == 0)
 482                 {
 483                         break;                          /* NOTE BREAK OUT */
 484                 }
 485         }
 486         if (cn->name != NULL)
 487                 return CHR(cn->code);
 488
 489         /* couldn't find it */
 490         ERR(REG_ECOLLATE);
 491         return 0;
 492 }
 493
 494 /*
 495  * range - supply cvec for a range, including legality check
 496  */
 497 static struct cvec *
 498 range(struct vars * v,                  /* context */
 499           celt a,                                       /* range start */
 500           celt b,                                       /* range end, might equal a */
 501           int cases)                            /* case-independent? */
 502 {
 503         int                     nchrs;
 504         struct cvec *cv;
 505         celt            c,
 506                                 lc,
 507                                 uc;
 508
 509         if (a != b && !before(a, b))
 510         {
 511                 ERR(REG_ERANGE);
 512                 return NULL;
 513         }
 514
 515         if (!cases)
 516         {                                                       /* easy version */
 517                 cv = getcvec(v, 0, 1, 0);
 518                 NOERRN();
 519                 addrange(cv, a, b);
 520                 return cv;
 521         }
 522
 523         /*
 524          * When case-independent, it's hard to decide when cvec ranges are
 525          * usable, so for now at least, we won't try.  We allocate enough
 526          * space for two case variants plus a little extra for the two title
 527          * case variants.
 528          */
 529
 530         nchrs = (b - a + 1) * 2 + 4;
 531
 532         cv = getcvec(v, nchrs, 0, 0);
 533         NOERRN();
 534
 535         for (c = a; c <= b; c++)
 536         {
 537                 addchr(cv, c);
 538                 lc = wx_tolower((chr) c);
 539                 if (c != lc)
 540                         addchr(cv, lc);
 541                 uc = wx_toupper((chr) c);
 542                 if (c != uc)
 543                         addchr(cv, uc);
 544         }
 545
 546         return cv;
 547 }
 548
 549 /*
 550  * before - is celt x before celt y, for purposes of range legality?
 551  */
 552 static int                                              /* predicate */
 553 before(celt x, celt y)
 554 {
 555         /* trivial because no MCCEs */
 556         if (x < y)
 557                 return 1;
 558         return 0;
 559 }
 560
 561 /*
 562  * eclass - supply cvec for an equivalence class
 563  * Must include case counterparts on request.
 564  */
 565 static struct cvec *
 566 eclass(struct vars * v,                 /* context */
 567            celt c,                                      /* Collating element representing the
 568                                                                  * equivalence class. */
 569            int cases)                           /* all cases? */
 570 {
 571         struct cvec *cv;
 572
 573         /* crude fake equivalence class for testing */
 574         if ((v->cflags & REG_FAKE) && c == 'x')
 575         {
 576                 cv = getcvec(v, 4, 0, 0);
 577                 addchr(cv, (chr) 'x');
 578                 addchr(cv, (chr) 'y');
 579                 if (cases)
 580                 {
 581                         addchr(cv, (chr) 'X');
 582                         addchr(cv, (chr) 'Y');
 583                 }
 584                 return cv;
 585         }
 586
 587         /* otherwise, none */
 588         if (cases)
 589                 return allcases(v, c);
 590         cv = getcvec(v, 1, 0, 0);
 591         assert(cv != NULL);
 592         addchr(cv, (chr) c);
 593         return cv;
 594 }
 595
 596 /*
 597  * cclass - supply cvec for a character class
 598  *
 599  * Must include case counterparts on request.
 600  */
 601 static struct cvec *
 602 cclass(struct vars * v,                 /* context */
 603            chr *startp,                         /* where the name starts */
 604            chr *endp,                           /* just past the end of the name */
 605            int cases)                           /* case-independent? */
 606 {
 607         size_t          len;
 608         struct cvec *cv = NULL;
 609         char      **namePtr;
 610         int                     i,
 611                                 index;
 612
 613         /*
 614          * The following arrays define the valid character class names.
 615          */
 616
 617         static char *classNames[] = {
 618                 "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
 619                 "lower", "print", "punct", "space", "upper", "xdigit", NULL
 620         };
 621
 622         enum classes
 623         {
 624                 CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
 625                 CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
 626         };
 627
 628         /*
 629          * Map the name to the corresponding enumerated value.
 630          */
 631         len = endp - startp;
 632         index = -1;
 633         for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
 634         {
 635                 if (strlen(*namePtr) == len &&
 636                         char_and_wchar_strncmp(*namePtr, startp, len) == 0)
 637                 {
 638                         index = i;
 639                         break;
 640                 }
 641         }
 642         if (index == -1)
 643         {
 644                 ERR(REG_ECTYPE);
 645                 return NULL;
 646         }
 647
 648         /*
 649          * Remap lower and upper to alpha if the match is case insensitive.
 650          */
 651
 652         if (cases &&
 653                 ((enum classes) index == CC_LOWER ||
 654                  (enum classes) index == CC_UPPER))
 655                 index = (int) CC_ALPHA;
 656
 657         /*
 658          * Now compute the character class contents.
 659          *
 660          * For the moment, assume that only char codes < 256 can be in these
 661          * classes.
 662          */
 663
 664         switch ((enum classes) index)
 665         {
 666                 case CC_PRINT:
 667                 case CC_ALNUM:
 668                         cv = getcvec(v, UCHAR_MAX, 1, 0);
 669                         if (cv)
 670                         {
 671                                 for (i = 0; i <= UCHAR_MAX; i++)
 672                                 {
 673                                         if (wx_isalpha((chr) i))
 674                                                 addchr(cv, (chr) i);
 675                                 }
 676                                 addrange(cv, (chr) '0', (chr) '9');
 677                         }
 678                         break;
 679                 case CC_ALPHA:
 680                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 681                         if (cv)
 682                         {
 683                                 for (i = 0; i <= UCHAR_MAX; i++)
 684                                 {
 685                                         if (wx_isalpha((chr) i))
 686                                                 addchr(cv, (chr) i);
 687                                 }
 688                         }
 689                         break;
 690                 case CC_ASCII:
 691                         cv = getcvec(v, 0, 1, 0);
 692                         if (cv)
 693                                 addrange(cv, 0, 0x7f);
 694                         break;
 695                 case CC_BLANK:
 696                         cv = getcvec(v, 2, 0, 0);
 697                         addchr(cv, '\t');
 698                         addchr(cv, ' ');
 699                         break;
 700                 case CC_CNTRL:
 701                         cv = getcvec(v, 0, 2, 0);
 702                         addrange(cv, 0x0, 0x1f);
 703                         addrange(cv, 0x7f, 0x9f);
 704                         break;
 705                 case CC_DIGIT:
 706                         cv = getcvec(v, 0, 1, 0);
 707                         if (cv)
 708                                 addrange(cv, (chr) '0', (chr) '9');
 709                         break;
 710                 case CC_PUNCT:
 711                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 712                         if (cv)
 713                         {
 714                                 for (i = 0; i <= UCHAR_MAX; i++)
 715                                 {
 716                                         if (wx_ispunct((chr) i))
 717                                                 addchr(cv, (chr) i);
 718                                 }
 719                         }
 720                         break;
 721                 case CC_XDIGIT:
 722                         cv = getcvec(v, 0, 3, 0);
 723                         if (cv)
 724                         {
 725                                 addrange(cv, '0', '9');
 726                                 addrange(cv, 'a', 'f');
 727                                 addrange(cv, 'A', 'F');
 728                         }
 729                         break;
 730                 case CC_SPACE:
 731                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 732                         if (cv)
 733                         {
 734                                 for (i = 0; i <= UCHAR_MAX; i++)
 735                                 {
 736                                         if (wx_isspace((chr) i))
 737                                                 addchr(cv, (chr) i);
 738                                 }
 739                         }
 740                         break;
 741                 case CC_LOWER:
 742                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 743                         if (cv)
 744                         {
 745                                 for (i = 0; i <= UCHAR_MAX; i++)
 746                                 {
 747                                         if (wx_islower((chr) i))
 748                                                 addchr(cv, (chr) i);
 749                                 }
 750                         }
 751                         break;
 752                 case CC_UPPER:
 753                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 754                         if (cv)
 755                         {
 756                                 for (i = 0; i <= UCHAR_MAX; i++)
 757                                 {
 758                                         if (wx_isupper((chr) i))
 759                                                 addchr(cv, (chr) i);
 760                                 }
 761                         }
 762                         break;
 763                 case CC_GRAPH:
 764                         cv = getcvec(v, UCHAR_MAX, 0, 0);
 765                         if (cv)
 766                         {
 767                                 for (i = 0; i <= UCHAR_MAX; i++)
 768                                 {
 769                                         if (wx_isgraph((chr) i))
 770                                                 addchr(cv, (chr) i);
 771                                 }
 772                         }
 773                         break;
 774         }
 775         if (cv == NULL)
 776                 ERR(REG_ESPACE);
 777         return cv;
 778 }
 779
 780 /*
 781  * allcases - supply cvec for all case counterparts of a chr (including itself)
 782  *
 783  * This is a shortcut, preferably an efficient one, for simple characters;
 784  * messy cases are done via range().
 785  */
 786 static struct cvec *
 787 allcases(struct vars * v,               /* context */
 788                  chr pc)                                /* character to get case equivs of */
 789 {
 790         struct cvec *cv;
 791         chr                     c = (chr) pc;
 792         chr                     lc,
 793                                 uc;
 794
 795         lc = wx_tolower((chr) c);
 796         uc = wx_toupper((chr) c);
 797
 798         cv = getcvec(v, 2, 0, 0);
 799         addchr(cv, lc);
 800         if (lc != uc)
 801                 addchr(cv, uc);
 802         return cv;
 803 }
 804
 805 /*
 806  * cmp - chr-substring compare
 807  *
 808  * Backrefs need this.  It should preferably be efficient.
 809  * Note that it does not need to report anything except equal/unequal.
 810  * Note also that the length is exact, and the comparison should not
 811  * stop at embedded NULs!
 812  */
 813 static int                                              /* 0 for equal, nonzero for unequal */
 814 cmp(const chr *x, const chr *y, /* strings to compare */
 815         size_t len)                                     /* exact length of comparison */
 816 {
 817         return memcmp(VS(x), VS(y), len * sizeof(chr));
 818 }
 819
 820 /*
 821  * casecmp - case-independent chr-substring compare
 822  *
 823  * REG_ICASE backrefs need this.  It should preferably be efficient.
 824  * Note that it does not need to report anything except equal/unequal.
 825  * Note also that the length is exact, and the comparison should not
 826  * stop at embedded NULs!
 827  */
 828 static int                                              /* 0 for equal, nonzero for unequal */
 829 casecmp(const chr *x, const chr *y,             /* strings to compare */
 830                 size_t len)                             /* exact length of comparison */
 831 {
 832         for (; len > 0; len--, x++, y++)
 833         {
 834                 if ((*x != *y) && (wx_tolower(*x) != wx_tolower(*y)))
 835                         return 1;
 836         }
 837         return 0;
 838 }